vignette updates

mustberuss · mustberuss · commit 9f77084863b5 · 2025-02-15T17:29:02.000-06:00
diff --git a/vignettes/citation-networks.Rmd.orig b/vignettes/citation-networks.Rmd.orig
@@ -9,16 +9,15 @@ vignette: >
 
 ```{r, include = FALSE}
 knitr::opts_chunk$set(
-  collapse = TRUE,
-  comment = "#>",
+  collapse = TRUE, 
+  comment = "#>", 
   warning = FALSE,
   message = FALSE,
-  dev = "png",
   fig.cap = ""
 )
 ```
 > Important API Change
-> 
+>
 > The new version of the API requires an API key, or all of your requests will be blocked.  See the [API Changes](api-changes.html#an-api-key-is-required-1) page.
 
 
@@ -33,7 +32,6 @@ library(visNetwork)
 library(magrittr)
 library(stringr)
 library(knitr)
-library(webshot2)
 
 # Write a query to pull patents assigned to the CPC code of "Y10S707/933"
 query <- qry_funs$contains(cpc_current.cpc_group_id = "Y10S707/933")
@@ -89,17 +87,17 @@ pat_title <- function(title, number) {
 
 edges <-
   res_lst$us_patent_citations %>%
-  semi_join(x = ., y = ., by = c("citation_patent_id" = "patent_id")) %>%
-  select(-citation_sequence) %>%  # discard citation_sequence that we don't need here
-  set_colnames(c("from", "to"))
+    semi_join(x = ., y = ., by = c("citation_patent_id" = "patent_id")) %>%
+    select(-citation_sequence) %>%  # discard citation_sequence that we don't need here
+    set_colnames(c("from", "to"))
 
 nodes <-
   pat_lst$patents %>%
-  mutate(
-    id = patent_id,
-    label = patent_id,
-    title = pat_title(patent_title, patent_id)
-  )
+    mutate(
+      id = patent_id,
+      label = patent_id,
+      title = pat_title(patent_title, patent_id)
+    )
 
 visNetwork(
   nodes = nodes, edges = edges, height = "400px", width = "100%",
@@ -121,14 +119,14 @@ With only 3 patents, it will probably be possible to visualize how these patents
 ```{r}
 rel_pats <-
   res_lst$us_patent_citations %>%
-  rbind(setNames(res_lst$us_patent_citations, names(.))) %>%
-  select(-patent_id) %>%
-  rename(patent_id = citation_patent_id) %>%
-  bind_rows(data.frame(patent_id = p3)) %>%
-  distinct() %>%
-  filter(!is.na(patent_id))
-
-# Look up which patents the relevant patents cite.  We need to use the 
+    rbind(setNames(res_lst$us_patent_citations, names(.))) %>%
+    select(-patent_id) %>%
+    rename(patent_id = citation_patent_id) %>%
+    bind_rows(data.frame(patent_id = p3)) %>%
+    distinct() %>%
+    filter(!is.na(patent_id))
+
+# Look up which patents the relevant patents cite.  We need to use the
 # patent_citation endpoint now.
 
 rel_pats_res <- search_pv(
@@ -150,20 +148,20 @@ cited_pats <-
 
 full_network <-
   cited_pats %>%
-  do({
-    .$ind <-
+    do({
+      .$ind <-
+        group_by(., patent_id) %>%
+        group_indices()
       group_by(., patent_id) %>%
-      group_indices()
-    group_by(., patent_id) %>%
-      mutate(sqrt_num_cited = sqrt(n()))
-  }) %>%
-  inner_join(x = ., y = ., by = "citation_patent_id") %>%
-  filter(ind.x > ind.y) %>%
-  group_by(patent_id.x, patent_id.y) %>%
-  mutate(cosine_sim = n() / (sqrt_num_cited.x * sqrt_num_cited.y)) %>%
-  ungroup() %>%
-  select(matches("patent_id\\.|cosine_sim")) %>%
-  distinct()
+        mutate(sqrt_num_cited = sqrt(n()))
+    }) %>%
+    inner_join(x = ., y = ., by = "citation_patent_id") %>%
+    filter(ind.x > ind.y) %>%
+    group_by(patent_id.x, patent_id.y) %>%
+    mutate(cosine_sim = n() / (sqrt_num_cited.x * sqrt_num_cited.y)) %>%
+    ungroup() %>%
+    select(matches("patent_id\\.|cosine_sim")) %>%
+    distinct()
 
 kable(head(full_network))
 ```
@@ -183,21 +181,21 @@ There appears to be a smallish group of patent pairs that are very similar to on
 ```{r citationvis2}
 edges <-
   full_network %>%
-  filter(cosine_sim >= .1) %>%
-  rename(from = patent_id.x, to = patent_id.y, value = cosine_sim) %>%
-  mutate(title = paste("Cosine similarity =", as.character(round(value, 3))))
+    filter(cosine_sim >= .1) %>%
+    rename(from = patent_id.x, to = patent_id.y, value = cosine_sim) %>%
+    mutate(title = paste("Cosine similarity =", as.character(round(value, 3))))
 
 nodes <-
   rel_pats_lst$us_patent_citations %>%
-  distinct(patent_id) %>%
-  rename(id = patent_id) %>%
-  mutate(
-    # the 3 patents of interest will be represented as blue nodes, all others
-    # will be yellow
-    color = ifelse(id %in% p3, "#97C2FC", "#DDCC77"),
-    label = id,
-    title = pat_title(id, id) # we don't get patent_title now (formerly first argument)
-  )
+    distinct(patent_id) %>%
+    rename(id = patent_id) %>%
+    mutate(
+      # the 3 patents of interest will be represented as blue nodes, all others
+      # will be yellow
+      color = ifelse(id %in% p3, "#97C2FC", "#DDCC77"),
+      label = id,
+      title = pat_title(id, id) # we don't get patent_title now (formerly first argument)
+    )
 
 visNetwork(
   nodes = nodes, edges = edges, height = "700px", width = "100%",
diff --git a/vignettes/examples.Rmd.orig b/vignettes/examples.Rmd.orig
@@ -14,7 +14,7 @@ knitr::opts_chunk$set(
 )
 ```
 
-With the recent [API changes](api-changes.html), the patent endpoint is the main way to retrieve data. The other 
+With the recent [API changes](api-changes.html), the patent endpoint is the main way to retrieve data. The other
 endpoints supply additional information.  Also note that an API key is required.
 
 ## Patent endpoint
@@ -24,7 +24,9 @@ Which patents have been cited by more than 500 US patents?
 ```{r}
 library(patentsview)
 
-search_pv(query = qry_funs$gt(patent_num_times_cited_by_us_patents = 500))
+fields = c("patent_id", "patent_title", "patent_date")
+search_pv(query = qry_funs$gt(patent_num_times_cited_by_us_patents = 500),
+  fields = fields)
 ```
 
 How many distinct inventors are represented by these highly-cited patents?
@@ -33,7 +35,7 @@ How many distinct inventors are represented by these highly-cited patents?
 
 search_pv(
   query = qry_funs$gt(patent_num_times_cited_by_us_patents = 500),
-  fields = c("patent_id", "inventors.inventor_id")
+  fields = c("patent_id", "inventors")
 )
 ```
 
@@ -84,7 +86,8 @@ Which inventor's most recent patent has Chicago, IL listed as their location.
 pv_out <- search_pv(
   query = '{"_and":[{"_text_phrase": {"inventor_lastknown_city":"Chicago"}},
                     {"_text_phrase": {"inventor_lastknown_state":"IL"}}]}',
-  endpoint = "inventor"
+  endpoint = "inventor",
+  fields = c("inventor_id", "inventor_name_first", "inventor_name_last")
 )
 
 pv_out
@@ -119,7 +122,7 @@ What assignee's organizations start Microsoft?
 
 ```{r}
 query <- qry_funs$begins(assignee_organization = "Microsoft")
-fields <- c("assignee_id", "assignee_organization")
+fields <- c("assignees")
 pv_out <- search_pv(query, fields = fields, endpoint = "assignee")
 pv_out$data$assignees$assignee_organization
 ```
diff --git a/vignettes/getting-started.Rmd.orig b/vignettes/getting-started.Rmd.orig
@@ -28,7 +28,8 @@ library(patentsview)
 
 search_pv(
   query = '{"_gte":{"patent_date":"2007-01-01"}}',
-  endpoint = "patent"
+  endpoint = "patent",
+  fields = c("patent_id", "patent_title", "patent_date")
 )
 ```
 
@@ -66,10 +67,11 @@ Check out the [writing queries vignette](writing-queries.html) for more details
 
 ## Fields
 
-Each endpoint has a different set of fields. The new version of the API allows all fields to be queried. In the first example, we didn't specify which fields we wanted to retrieve so we were given the default set. You can specify which fields you want using the `fields` argument:
+Each endpoint has a different set of fields. The new version of the API allows all fields to be queried. You can specify which fields you want returned using the `fields` argument.  If you don't specify any, you will get
+only the primary key(s) for the specified endpoint.
 
 ```{r}
-# search_pv defaults the endpoint parameter to "patents" if not specified
+# search_pv defaults the endpoint parameter to "patent" if not specified
 result = search_pv(
   query = '{"_gte":{"patent_date":"2007-01-01"}}',
   fields = c("patent_id", "patent_title")
@@ -153,41 +155,6 @@ Your choice of endpoint determines two things:
 
 Most of the time you will want to use the patent endpoint. Note that you can still effectively filter on fields that are not at the patent-level when using the patent endpoint (e.g., you can filter on assignee name or CPC category). This is because patents are relatively low-level entities. For higher level entities like assignees, if you filter on a field that is not at the assignee-level (e.g., inventor name), the API will return data on any assignee that has at least one inventor whose name matches your search, which is probably not what you want.
 
-## Casting fields
-
-The API now returns most data fields as an appropriate data type (e.g., numeric). There are, however,
-a few fields that are returned as strings rather than integers and there are 33 date fields now.
-There are also two rule_47_flags, one returned by the patent endpoint, the other by the publication endpoint.
-The former is a boolean while the latter is a string that can be cast to a boolean.  Lastly, most
-of the document numbers are numeric though there are two instances where they are strings that can
-be cast to integers.  You can cast all fields to their preferred R types using `cast_pv_data()`:
-
-```{r}
-library(knitr)
-
-# These fields are received as strings and would be cast to integers
-ints <- fieldsdf[fieldsdf$data_type == "int", c("endpoint","field")]
-ints <- ints[order(ints$field),]
-print(ints,  row.names = FALSE)
-
-# These fields are received as strings and would be cast to dates
-date_fields <- fieldsdf[fieldsdf$data_type == "date", c("endpoint","field")]
-kable(date_fields, row.names = FALSE)
-
-res <- search_pv(
-  query = '{"patent_id":"5116621"}', 
-  fields = c("patent_id", "patent_date", "patent_title", "patent_year", "assignees.assignee_type")
-)
-
-# unnest_pv_data shows that the assignee_type field is received as a string and patent_year
-# is received as an integer
-unnest_pv_data(res$data)
-
-# cast_pv_data will convert the string patent_date to an R date and assignee_type to an integer
-recast <- cast_pv_data(res$data)
-unnest_pv_data(recast)
-```
-
 ## FAQs
 
 #### I'm sure my query is well formatted and correct but I keep getting an error. What's the deal?
diff --git a/vignettes/top-assignees.Rmd.orig b/vignettes/top-assignees.Rmd.orig
@@ -66,8 +66,7 @@ query
 fields <- c(
   "patent_id", "patent_date", "patent_year", "patent_earliest_application_date",
   "patent_num_us_patents_cited", "application.filing_date",
-  "assignees.assignee_organization",
-  "assignees.assignee_id"  # the assignee fields come back in a nested object
+  "assignees"  # the assignee fields come back in a nested object
 )
 
 # Send an HTTP request to the PatentsView API to get the data

Original file line number	Diff line number	Diff line change
`@@ -66,8 +66,7 @@ query`
`66`	`66`	`fields <- c(`
`67`	`67`	`"patent_id", "patent_date", "patent_year", "patent_earliest_application_date",`
`68`	`68`	`"patent_num_us_patents_cited", "application.filing_date",`
`69`		`- "assignees.assignee_organization",`
`70`		`- "assignees.assignee_id" # the assignee fields come back in a nested object`
	`69`	`+ "assignees" # the assignee fields come back in a nested object`
`71`	`70`	`)`
`72`	`71`
`73`	`72`	`# Send an HTTP request to the PatentsView API to get the data`